Repository: laminlabs/lamindb Branch: main Commit: 44563e03eeae Files: 288 Total size: 2.7 MB Directory structure: gitextract_xlz91t15/ ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.yml │ │ ├── config.yml │ │ ├── enhancement.yml │ │ └── usage_question.yml │ └── workflows/ │ ├── build.yml │ └── doc-changes.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs/ │ ├── api.md │ ├── arrays.md │ ├── bionty.md │ ├── changelog.md │ ├── curate.md │ ├── faq/ │ │ ├── acid.md │ │ ├── curate-any.md │ │ ├── idempotency.md │ │ ├── import-modules.md │ │ ├── keep-artifacts-local.md │ │ ├── pydantic-pandera.md │ │ ├── reference-field.md │ │ ├── search.md │ │ ├── symbol-mapping.md │ │ ├── test_notebooks.py │ │ ├── track-run-inputs.md │ │ ├── trash-archive.md │ │ └── validate-fields.md │ ├── faq.md │ ├── guide.md │ ├── index.md │ ├── lightning.md │ ├── manage-changes.md │ ├── manage-ontologies.md │ ├── organize.md │ ├── pertdb.md │ ├── query-search.md │ ├── registries.md │ ├── scripts/ │ │ ├── curate_anndata_flexible.py │ │ ├── curate_anndata_uns.py │ │ ├── curate_dataframe_attrs.py │ │ ├── curate_dataframe_external_features.py │ │ ├── curate_dataframe_flexible.py │ │ ├── curate_dataframe_minimal_errors.py │ │ ├── curate_dataframe_union_features.py │ │ ├── curate_mudata.py │ │ ├── curate_soma_experiment.py │ │ ├── curate_spatialdata.py │ │ ├── define_schema_anndata_uns.py │ │ ├── define_schema_df_metadata.py │ │ ├── define_schema_spatialdata.py │ │ ├── my_workflow.py │ │ ├── my_workflow_with_click.py │ │ ├── my_workflow_with_step.py │ │ ├── run_script_with_step.py │ │ ├── run_track_and_finish.py │ │ ├── run_track_with_features_and_params.py │ │ ├── run_track_with_params.py │ │ └── synced_with_git.py │ ├── storage/ │ │ ├── add-replace-cache.ipynb │ │ ├── anndata-accessor.ipynb │ │ ├── prepare-sync-local-to-cloud.ipynb │ │ ├── sync-local-to-cloud.ipynb │ │ ├── test-files/ │ │ │ └── iris.data │ │ ├── test_notebooks.py │ │ ├── upload.ipynb │ │ └── vitessce.ipynb │ ├── storage.md │ ├── sync.md │ ├── test_notebooks.py │ └── track.md ├── lamindb/ │ ├── __init__.py │ ├── _finish.py │ ├── _secret_redaction.py │ ├── _view.py │ ├── base/ │ │ ├── __init__.py │ │ ├── dtypes.py │ │ ├── fields.py │ │ ├── ids.py │ │ ├── types.py │ │ ├── uids.py │ │ ├── users.py │ │ └── utils.py │ ├── core/ │ │ ├── __init__.py │ │ ├── _compat.py │ │ ├── _context.py │ │ ├── _functions.py │ │ ├── _mapped_collection.py │ │ ├── _settings.py │ │ ├── _sync_git.py │ │ ├── _track_environment.py │ │ ├── exceptions.py │ │ ├── loaders.py │ │ ├── storage/ │ │ │ ├── __init__.py │ │ │ ├── _anndata_accessor.py │ │ │ ├── _backed_access.py │ │ │ ├── _polars_lazy_df.py │ │ │ ├── _pyarrow_dataset.py │ │ │ ├── _spatialdata_accessor.py │ │ │ ├── _tiledbsoma.py │ │ │ ├── _valid_suffixes.py │ │ │ ├── _zarr.py │ │ │ ├── objects.py │ │ │ ├── paths.py │ │ │ └── types.py │ │ └── subsettings/ │ │ ├── __init__.py │ │ ├── _annotation_settings.py │ │ └── _creation_settings.py │ ├── curators/ │ │ ├── __init__.py │ │ └── core.py │ ├── errors.py │ ├── examples/ │ │ ├── __init__.py │ │ ├── cellxgene/ │ │ │ ├── __init__.py │ │ │ └── _cellxgene.py │ │ ├── croissant/ │ │ │ ├── __init__.py │ │ │ └── mini_immuno.anndata.zarr_metadata.json │ │ ├── datasets/ │ │ │ ├── __init__.py │ │ │ ├── _core.py │ │ │ ├── _fake.py │ │ │ ├── _small.py │ │ │ ├── define_mini_immuno_features_labels.py │ │ │ ├── define_mini_immuno_schema_flexible.py │ │ │ ├── mini_immuno.py │ │ │ └── save_mini_immuno_datasets.py │ │ ├── fixtures/ │ 
│ │ ├── __init__.py │ │ │ └── sheets.py │ │ ├── mlflow/ │ │ │ └── __init__.py │ │ ├── schemas/ │ │ │ ├── __init__.py │ │ │ ├── _anndata.py │ │ │ ├── _simple.py │ │ │ ├── define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py │ │ │ └── define_valid_features.py │ │ └── wandb/ │ │ └── __init__.py │ ├── integrations/ │ │ ├── __init__.py │ │ ├── _croissant.py │ │ ├── _vitessce.py │ │ └── lightning.py │ ├── migrations/ │ │ ├── 0177_squashed.py │ │ ├── 0178_v2_2.py │ │ ├── 0179_v2_2_part_2.py │ │ ├── 0180_v2_2_part_3.py │ │ ├── 0181_v2_2_part_4.py │ │ ├── 0182_v2_2_part_5.py │ │ ├── 0183_v2_2_part_6.py │ │ ├── 0184_alter_transformrecord_feature.py │ │ ├── 0185_alter_runrecord_feature.py │ │ ├── 0186_v2_4.py │ │ ├── 0187_squashed.py │ │ ├── 0187_v2_4_part_2.py │ │ ├── README.md │ │ └── __init__.py │ ├── models/ │ │ ├── __init__.py │ │ ├── _describe.py │ │ ├── _django.py │ │ ├── _feature_manager.py │ │ ├── _from_values.py │ │ ├── _is_versioned.py │ │ ├── _label_manager.py │ │ ├── _relations.py │ │ ├── _run_cleanup.py │ │ ├── artifact.py │ │ ├── artifact_set.py │ │ ├── block.py │ │ ├── can_curate.py │ │ ├── collection.py │ │ ├── feature.py │ │ ├── has_parents.py │ │ ├── project.py │ │ ├── query_manager.py │ │ ├── query_set.py │ │ ├── record.py │ │ ├── run.py │ │ ├── save.py │ │ ├── schema.py │ │ ├── sqlrecord.py │ │ ├── storage.py │ │ ├── transform.py │ │ └── ulabel.py │ ├── py.typed │ └── setup/ │ ├── __init__.py │ ├── _merge.py │ ├── _switch.py │ ├── core/ │ │ └── __init__.py │ ├── errors/ │ │ └── __init__.py │ └── types/ │ └── __init__.py ├── lamindb_full.py ├── noxfile.py ├── pyproject.full.toml ├── pyproject.toml ├── scripts/ │ └── migrate_test_instances.py └── tests/ ├── core/ │ ├── _dataset_fixtures.py │ ├── conftest.py │ ├── notebooks/ │ │ ├── basic-r-notebook.Rmd.cleaned.html │ │ ├── basic-r-notebook.Rmd.html │ │ ├── duplicate/ │ │ │ └── with-title-initialized-consecutive-finish.ipynb │ │ ├── load_schema.ipynb │ │ ├── no-title.ipynb │ │ ├── with-title-initialized-consecutive-finish-not-last-cell.ipynb │ │ └── with-title-initialized-consecutive-finish.ipynb │ ├── scripts/ │ │ ├── duplicate1/ │ │ │ └── script-to-test-versioning.py │ │ ├── duplicate2/ │ │ │ └── script-to-test-versioning.py │ │ ├── duplicate3/ │ │ │ └── script-to-test-versioning.py │ │ ├── duplicate4/ │ │ │ └── script-to-test-versioning.py │ │ ├── duplicate5/ │ │ │ └── script-to-test-versioning.py │ │ ├── script-to-test-filename-change.py │ │ └── script-to-test-versioning.py │ ├── test_artifact_anndata_with_curation.py │ ├── test_artifact_basics.py │ ├── test_artifact_dataframe_with_curation.py │ ├── test_artifact_describe_to_dataframe.py │ ├── test_artifact_features_annotations.py │ ├── test_artifact_parquet.py │ ├── test_blocks.py │ ├── test_branches.py │ ├── test_can_curate.py │ ├── test_collection.py │ ├── test_curator_basics.py │ ├── test_data_migrations.py │ ├── test_db.py │ ├── test_delete.py │ ├── test_feature.py │ ├── test_feature_dtype.py │ ├── test_from_values.py │ ├── test_has_parents.py │ ├── test_has_type.py │ ├── test_integrity.py │ ├── test_is_versioned.py │ ├── test_label_manager.py │ ├── test_load.py │ ├── test_manager.py │ ├── test_merge.py │ ├── test_nbconvert.py │ ├── test_notebooks.py │ ├── test_querydb.py │ ├── test_queryset.py │ ├── test_record_basics.py │ ├── test_record_sheet_examples.py │ ├── test_rename_features_labels.py │ ├── test_run.py │ ├── test_save.py │ ├── test_schema.py │ ├── test_search.py │ ├── test_settings.py │ ├── test_sqlrecord.py │ ├── test_storage.py │ ├── test_switch.py 
│ ├── test_track_flow.py │ ├── test_track_script_or_notebook.py │ ├── test_track_step.py │ ├── test_transform.py │ ├── test_transform_from_git.py │ └── test_view.py ├── curators/ │ ├── conftest.py │ ├── test_cellxgene_curation.py │ ├── test_curate_from_croissant.py │ ├── test_curators_examples.py │ ├── test_curators_remote.py │ └── test_dataframe_curation.py ├── integrations/ │ ├── conftest.py │ └── test_lightning.py ├── no_instance/ │ ├── conftest.py │ ├── test_connect_dynamic_import.py │ ├── test_import_side_effects.py │ └── test_no_default_instance.py ├── permissions/ │ ├── conftest.py │ ├── jwt_utils.py │ ├── scripts/ │ │ ├── check_lamin_dev.py │ │ ├── setup_access.py │ │ └── setup_instance.py │ └── test_rls_dbwritelog.py ├── profiling/ │ ├── import_lamindb.py │ ├── import_lamindb_and_connect.py │ ├── import_lamindb_core_storage.py │ └── import_records_from_dataframe.py ├── storage/ │ ├── conftest.py │ ├── test_artifact_storage.py │ ├── test_artifact_zarr.py │ ├── test_cache.py │ ├── test_connect_reconnect.py │ ├── test_storage_lifecycle.py │ ├── test_streaming.py │ └── test_transfer.py └── tiledbsoma/ ├── conftest.py ├── test_artifact_basics.py ├── test_curators.py └── test_storage.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.yml ================================================ name: Report a bug description: Report a bug. labels: - ":bug: bug" body: - type: textarea id: report attributes: label: Add a description placeholder: | Describe and consider providing version information. Please ensure you're on the latest version of lamindb. This is a public repository! Do not reveal any internal information. validations: required: true ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: true contact_links: - name: LaminHub issues url: https://github.com/laminlabs/laminhub-public about: If you have issues with the GUI/web app at lamin.ai, please report them here. - name: Enterprise support url: https://lamin.ai/contact about: If you have other questions, contact us directly. ================================================ FILE: .github/ISSUE_TEMPLATE/enhancement.yml ================================================ name: Propose an enhancement description: Propose an enhancement. body: - type: textarea id: description attributes: label: Add a description placeholder: | This is a public repository! Do not reveal any internal information. validations: required: true ================================================ FILE: .github/ISSUE_TEMPLATE/usage_question.yml ================================================ name: Ask a usage question description: Ask a usage question. labels: - "usage question" body: - type: textarea id: description attributes: label: Add a description placeholder: | This is a public repository! Do not reveal any internal information. 
validations: required: true ================================================ FILE: .github/workflows/build.yml ================================================ name: build on: push: branches: [release] pull_request: jobs: pre-filter: runs-on: ubuntu-latest outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: - uses: actions/checkout@v6 with: fetch-depth: 0 - uses: dorny/paths-filter@v3 id: changes if: github.event_name != 'push' with: filters: | curator: - 'lamindb/curators/**' - 'lamindb/examples/cellxgene/**' - 'tests/curators/**' integrations: - 'lamindb/integrations/**' - 'tests/integrations/**' - id: set-matrix shell: bash run: | BASE_GROUPS=$(jq -n -c '["unit-core-sqlite", "unit-core-postgres", "unit-storage", "tutorial", "guide", "tiledbsoma", "biology", "faq", "storage", "cli", "permissions", "no-instance"]') ADDITIONAL_GROUPS=[] if [[ "${{ github.event_name }}" == "push" || "${{ github.event_name }}" == "repository_dispatch" ]]; then # Run everything on push and dispatch ADDITIONAL_GROUPS=$(jq -n -c '["curator", "integrations"]') else # Otherwise check which paths changed if [[ "${{ steps.changes.outputs.curator }}" == "true" ]]; then ADDITIONAL_GROUPS=$(jq -n -c --argjson groups "$ADDITIONAL_GROUPS" '$groups + ["curator"]') fi if [[ "${{ steps.changes.outputs.integrations }}" == "true" ]]; then ADDITIONAL_GROUPS=$(jq -n -c --argjson groups "$ADDITIONAL_GROUPS" '$groups + ["integrations"]') fi fi # Combine base groups with any additional groups MATRIX=$(jq -n -c --argjson base "$BASE_GROUPS" --argjson additional "$ADDITIONAL_GROUPS" '{group: ($base + $additional)}') # Output as single line for GitHub Actions echo "matrix=$(echo "$MATRIX" | jq -c .)" >> $GITHUB_OUTPUT # Pretty print for debugging echo "Generated matrix:" echo "$MATRIX" | jq . 
test: needs: pre-filter runs-on: ubuntu-latest strategy: fail-fast: false matrix: ${{fromJson(needs.pre-filter.outputs.matrix)}} timeout-minutes: 20 steps: - uses: actions/checkout@v6 with: submodules: recursive fetch-depth: 0 - uses: actions/checkout@v6 if: ${{ matrix.group == 'permissions' }} with: repository: laminlabs/laminhub token: ${{ secrets.GH_TOKEN_DEPLOY_LAMINAPP }} path: laminhub ref: main - uses: actions/setup-python@v6 with: python-version: | ${{ matrix.group == 'tiledbsoma' && '3.13' || matrix.group == 'permissions' && '3.14' || github.ref == 'refs/heads/release' && '3.11' || '3.14' }} - name: cache pre-commit uses: actions/cache@v4 with: path: ~/.cache/pre-commit key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }} - name: cache postgres if: ${{ matrix.group == 'faq' || matrix.group == 'unit-core-postgres' || matrix.group == 'unit-storage' || matrix.group == 'permissions'}} id: cache-postgres uses: actions/cache@v4 with: path: ~/postgres.tar key: cache-postgres-0 restore-keys: | cache-postgres- - name: cache postgres miss if: ${{ (matrix.group == 'faq' || matrix.group == 'unit-core-postgres' || matrix.group == 'unit-storage' || matrix.group == 'permissions') && steps.cache-postgres.outputs.cache-hit != 'true' }} run: docker pull postgres:latest && docker image save postgres:latest --output ~/postgres.tar - name: cache postgres use if: ${{ (matrix.group == 'faq' || matrix.group == 'unit-core-postgres' || matrix.group == 'unit-storage' || matrix.group == 'permissions') && steps.cache-postgres.outputs.cache-hit == 'true' }} run: docker image load --input ~/postgres.tar - run: pip install "laminci@git+https://github.com/laminlabs/laminci" - run: nox -s configure_coverage -- '${{needs.pre-filter.outputs.matrix}}' - name: install postgres if: ${{ matrix.group == 'faq' }} run: sudo apt-get install libpq-dev - name: install graphviz if: ${{ matrix.group == 'tutorial' || matrix.group == 'guide' || matrix.group == 'biology' || matrix.group == 'faq'}} run: sudo apt-get -y install graphviz # - run: nox -s lint # if: ${{ matrix.group == 'tutorial' }} - run: nox -s "install_ci(group='${{ matrix.group }}')" - uses: aws-actions/configure-aws-credentials@v4 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: us-east-1 - run: nox -s prepare if: ${{ !startsWith(matrix.group, 'unit-') && !startsWith(matrix.group, 'permissions') }} - run: nox -s "test(group='${{ matrix.group }}')" - name: upload coverage uses: actions/upload-artifact@v4 with: name: coverage--${{ matrix.group }} path: .coverage include-hidden-files: true - name: upload docs if: ${{ matrix.group == 'tutorial' || matrix.group == 'guide' || matrix.group == 'tiledbsoma' || matrix.group == 'biology' || matrix.group == 'faq' || matrix.group == 'storage' }} uses: actions/upload-artifact@v4 with: name: docs-${{ matrix.group }} path: ./docs/${{ matrix.group }} profile: runs-on: ubuntu-latest timeout-minutes: 10 env: LAMIN_API_KEY: ${{ secrets.LAMIN_API_KEY_TESTUSER1 }} steps: - uses: actions/checkout@v6 with: submodules: recursive fetch-depth: 0 - uses: actions/setup-python@v6 with: python-version: | ${{ github.ref == 'refs/heads/release' && '3.11' || '3.14' }} - run: pip install git+https://github.com/laminlabs/laminci - run: nox -s "install_ci(group='unit-core-sqlite')" - run: uv pip install --system git+https://github.com/laminlabs/laminprofiler - run: lamin login - run: laminprofiler check 
tests/profiling/import_lamindb_and_connect.py --threshold 3.5 - run: lamin connect laminlabs/lamindata - run: laminprofiler check tests/profiling/import_lamindb.py --threshold 1.5 - run: laminprofiler check tests/profiling/import_lamindb_core_storage.py --threshold 1.5 docs: needs: test runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 with: submodules: recursive fetch-depth: 0 - name: checkout lndocs uses: actions/checkout@v6 with: repository: laminlabs/lndocs ssh-key: ${{ secrets.READ_LNDOCS }} path: lndocs ref: main - uses: aws-actions/configure-aws-credentials@v4 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: us-east-1 - uses: actions/setup-python@v6 with: python-version: "3.12" - run: pip install "laminci@git+https://x-access-token:${{ secrets.LAMIN_BUILD_DOCS }}@github.com/laminlabs/laminci" - run: nox -s "install_ci(group='docs')" - uses: actions/download-artifact@v4 - run: nox -s clidocs - run: nox -s prepare - run: nox -s docs - run: rm -r ./_build/html/.doctrees # do not want to deploy with cloudflare - uses: cloudflare/wrangler-action@v3 id: cloudflare with: apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} accountId: 472bdad691b4483dea759eadb37110bd command: pages deploy "_build/html" --project-name=lamindb gitHubToken: ${{ secrets.GITHUB_TOKEN }} - uses: edumserrano/find-create-or-update-comment@v2 if: github.event_name == 'pull_request' with: issue-number: ${{ github.event.pull_request.number }} body-includes: "Deployment URL" comment-author: "github-actions[bot]" body: | Deployment URL: ${{ steps.cloudflare.outputs.deployment-url }} edit-mode: replace - uses: peter-evans/repository-dispatch@v2 if: ${{ github.event_name == 'push' }} with: token: ${{ secrets.LAMIN_BUILD_DOCS }} repository: "laminlabs/lamin-docs" event-type: build coverage: needs: test runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: actions/setup-python@v6 with: python-version: "3.14" - run: | python -m pip install -U uv uv pip install --system coverage[toml] uv pip install --system --no-deps . 
- uses: actions/download-artifact@v4 - name: run coverage run: | coverage combine coverage--*/.coverage* coverage report --fail-under=0 coverage xml - uses: codecov/codecov-action@v2 with: token: ${{ secrets.CODECOV_TOKEN }} dispatch: if: ${{ github.event_name == 'push' }} runs-on: ubuntu-latest steps: - uses: peter-evans/repository-dispatch@v2 with: token: ${{ secrets.LAMIN_BUILD_DOCS }} repository: "laminlabs/lamindb-dispatch" event-type: build ================================================ FILE: .github/workflows/doc-changes.yml ================================================ name: doc-changes on: pull_request_target: branches: - main - release types: - closed jobs: doc-changes: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: "3.11" - run: pip install "laminci[doc-changes]@git+https://x-access-token:${{ secrets.LAMIN_BUILD_DOCS }}@github.com/laminlabs/laminci" - run: laminci doc-changes env: repo_token: ${{ secrets.GITHUB_TOKEN }} docs_token: ${{ secrets.LAMIN_BUILD_DOCS }} changelog_file: lamin-docs/docs/changelog/soon/lamindb.md ================================================ FILE: .gitignore ================================================ __MACOSX/ # LaminDB README_stripped.md docs/scripts/test_artifact_parquet.py README.ipynb docs/sample.fasta docs/faq/sample.fasta docs/faq/test-acid/ docs/scripts/define_mini_immuno_features_labels.py docs/scripts/define_mini_immuno_schema_flexible.py docs/scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py docs/scripts/define_valid_features.py docs/scripts/save_mini_immuno_datasets.py profile_output* docs/cli.md .coveragerc *.db *.lndb *.jpg *.zarr/ docsbuild/ docs/lamin.md docs/guide/data-validation.ipynb docs/guide/bionty.ipynb docs/guide/lnschema-core.ipynb docs/paradisi05_laminopathic_nuclei.jpg bionty_docs/ lamindb_docs/ _build mydata/ lamin-intro/ lamin-tutorial/ mytest/ rds/ mydb/ docs/test-registries/ docs/test-annotate-flexible/ docs/lamindb.* lamin_sphinx docs/conf.py lamindb/setup/.env _secrets.py _configuration.py lamin.db docs/generated/* _docs_tmp* docs/guide/Laminopathic_nuclei.jpg docs/guide/paradisi05_laminopathic_nuclei.jpg nocodb docs/guide/SRR4238351_subsamp.fastq.gz docs/faq/paradisi05_laminopathic_nuclei.jpg docs/faq/tostore/ docs/faq/mydata_postgres/ docs/guide/myobjects/ docs/faq/test-run-inputs/ docs/intro/paradisi05_laminopathic_nuclei.jpg docs/guide/figures/ docs/test-annotate/ docs/test-track/ suo22/ docs/biology/test-flow/ docs/biology/test-scrna/ docs/biology/test-registries/ docs/biology/test-multimodal/ default_storage default_storage_unit_core default_storage_unit_storage test.ipynb test2.ipynb run-tests test-django-validation/ curate.tiledbsoma small_dataset.tiledbsoma nonregistered_storage registered_storage tests/core/notebooks/no-uid-renamed.ipynb # General .DS_Store # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # ruff .ruff_cache # Pyre type checker .pyre/ # data files data/ _build *.csv *.fcs *.zip *.feather *.h5ad *.h5mu *.parquet *.bam *.fastq.gz *.pt # Pycharm .idea # VSCode .vscode # CELLxGENE !lamindb/examples/cellxgene/cellxgene_schema_versions.csv # ml lightning_logs mlruns download_mnist checkpoints test_lightning ================================================ FILE: .gitmodules ================================================ [submodule "sub/lamindb-setup"] path = sub/lamindb-setup url = https://github.com/laminlabs/lamindb-setup [submodule "sub/lamin-cli"] path = sub/lamin-cli url = https://github.com/laminlabs/lamin-cli [submodule "sub/bionty"] path = sub/bionty url = https://github.com/laminlabs/bionty [submodule "sub/pertdb"] path = sub/pertdb url = https://github.com/laminlabs/pertdb [submodule "sub/cellxgene-lamin"] path = sub/cellxgene-lamin url = https://github.com/laminlabs/cellxgene-lamin.git ================================================ FILE: .pre-commit-config.yaml ================================================ fail_fast: false default_language_version: python: python3 default_stages: - pre-commit - pre-push minimum_pre_commit_version: 2.16.0 repos: - repo: https://github.com/rbubley/mirrors-prettier rev: v3.5.1 hooks: - id: prettier exclude: | (?x)( docs/changelog.md|.github/ISSUE_TEMPLATE/config.yml|tests/core/notebooks/basic-r-notebook.Rmd.cleaned.html|README.md ) - repo: https://github.com/kynan/nbstripout rev: 0.8.1 hooks: - id: nbstripout exclude: | (?x)( docs/examples/| docs/notes/ ) - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.10 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix, --unsafe-fixes] - id: ruff-format - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: - id: detect-private-key - id: check-ast - id: end-of-file-fixer exclude: | (?x)( .github/workflows/latest-changes.jinja2 ) - id: mixed-line-ending args: [--fix=lf] - id: trailing-whitespace exclude: | (?x)( tests/core/notebooks/basic-r-notebook.Rmd.cleaned.html ) - id: check-case-conflict - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.14.1 hooks: - id: mypy args: [ --no-strict-optional, --ignore-missing-imports, --disable-error-code=annotation-unchecked, --disable-error-code=type-arg, 
--namespace-packages, --explicit-package-bases, ] additional_dependencies: ["types-requests", "types-attrs", "types-PyYAML"] exclude: | (?x)( test_notebooks.py| script-to-test-versioning.py| tests/storage/conftest.py| tests/curators/conftest.py| tests/permissions/conftest.py| tests/writelog/conftest.py| tests/writelog_sqlite/conftest.py| tests/curators/test_curators_examples.py| tests/core/conftest.py| docs/scripts/ ) ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing Contributions are generally welcome. Please make an issue to discuss proposals. ## Installation ### PyPI For installation from PyPI, see [docs.lamin.ai/setup](https://docs.lamin.ai/setup). ### Github For installation from GitHub, call: ```bash git clone --recursive https://github.com/laminlabs/lamindb pip install laminci python -m venv .venv source .venv/bin/activate nox -s install ``` This will install a few dependencies from the git submodules linked [here](https://github.com/laminlabs/lamindb/tree/main/sub), as well as packages like `pytest` and `pre-commit` that you'll need when developing. lamindb depends on several other packages that may require modifications for pull requests to successfully pass the continuous integration build. We suggest the following workflow if commits to any of the submodules are essential for the current modifications in lamindb: 1. Change directory into the submodule that you want to modify: `cd sub/SUBMODULE`. 2. Switch to a new feature branch: `git switch -c feature/NEWFEATURE`. 3. Make a pull request with your changes to the `SUBMODULE` and ensure that the CI passes. 4. In the repository root of lamindb, create a new commit and push: ```bash cd .. git add -u git commit -m "Upgraded SUBMODULE" git push ``` Any pull request of yours should now also have the changes of the submodule included allowing you to test that changes in the submodule and lamindb are compatible. ## Running and writing tests This package uses the [pytest][] for automated testing. Please add a test for every function added to the package. Running tests requires the [Docker daemon][] up, then run at the root of the repository: ```bash pytest --ignore=tests/storage --ignore=tests/permission ``` in the root of the repository. We exclude specific directories in local `pytest` runs because they directly access external resources such as AWS, which require specific access keys. Continuous integration will automatically run **all** tests on pull requests. ## Code-style This project uses [pre-commit][] to enforce consistent code-styles. On every commit, pre-commit checks will either automatically fix issues with the code, or raise an error message. To enable pre-commit locally, simply run ```bash pre-commit install ``` in the root of the repository. Pre-commit will automatically download all dependencies when it is run for the first time. We further use [gitmoji][] to add emoticons to commits. These allow us to more easily categorize them allowing for faster visual filtering. It can be installed by running: ```bash npm i -g gitmoji-cli ``` and enabled for the repository via: ```bash gitmoji -i ``` If you don't have `sudo` in your working environment, follow [these instructions](https://github.com/sindresorhus/guides/blob/main/npm-global-without-sudo.md). ## Documentation We build our documentation with an internal tool called `lndocs`. 
We have not made it public yet and therefore external contributors need to rely on the Github Actions `docs` job to build the documentation. If the `docs` job succeeds, a preview URL will be posted automatically as a comment to your pull request. ## Releases Currently only lamin employees have release rights. Release publishing is managed via `laminci release --pypi`. For `lamindb`, the release flow now publishes two distributions in sequence: - `lamindb-core` (contains the `lamindb/` namespace package) - `lamindb` (meta-package that depends on `lamindb-core`) Before first production publish of a version, run a TestPyPI dry run by building both wheels from `pyproject.toml` and `pyproject.full.toml`, then uploading with `twine` to TestPyPI for verification. [Docker daemon]: https://docs.docker.com/engine/install/ [gitmoji]: https://gitmoji.dev/ [pre-commit]: https://pre-commit.com/ [pytest]: https://docs.pytest.org/ ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) 
The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ [![docs](https://img.shields.io/badge/docs-yellow)](https://docs.lamin.ai) [![llms.txt](https://img.shields.io/badge/llms.txt-orange)](https://docs.lamin.ai/llms.txt) [![codecov](https://codecov.io/gh/laminlabs/lamindb/branch/main/graph/badge.svg?token=VKMRJ7OWR3)](https://codecov.io/gh/laminlabs/lamindb) [![pypi](https://img.shields.io/pypi/v/lamindb?color=blue&label=PyPI)](https://pypi.org/project/lamindb) [![cran](https://www.r-pkg.org/badges/version/laminr?color=green)](https://cran.r-project.org/package=laminr) [![stars](https://img.shields.io/github/stars/laminlabs/lamindb?style=flat&logo=GitHub&label=&color=gray)](https://github.com/laminlabs/lamindb) [![downloads](https://static.pepy.tech/personalized-badge/lamindb?period=total&units=INTERNATIONAL_SYSTEM&left_color=GRAY&right_color=GRAY&left_text=%E2%AC%87%EF%B8%8F)](https://pepy.tech/project/lamindb) # LaminDB - Open-source data framework for biology LaminDB allows you to query, trace, and validate datasets and models at scale. You get context & memory through a lineage-native lakehouse that supports bio-formats, registries & ontologies while feeling as simple as a file system. Agent? [llms.txt](https://docs.lamin.ai/llms.txt)
Why? (1) Reproducing, tracing & understanding how datasets, models & results are created is critical to quality R&D. Without context, humans & agents make mistakes and cannot close feedback loops across data generation & analysis. Without memory, compute & intelligence are wasted on fragmented, non-compounding tasks — LLM context windows are small. (2) Training & fine-tuning models with thousands of datasets — across LIMS, ELNs, orthogonal assays — is now a primary path to scaling R&D. But without queryable & validated data or with data locked in organizational & infrastructure silos, it leads to garbage in, garbage out or is quite simply impossible. Imagine building software without git or pull requests: an agent's actions would be impossible to verify. While code has git and tables have dbt/warehouses, biological data has lacked a framework for managing its unique complexity. LaminDB fills the gap. It is a lineage-native lakehouse that understands bio-registries and formats (`AnnData`, `.zarr`, …) based on the established open data stack: Postgres/SQLite for metadata and cross-platform storage for datasets. By offering queries, tracing & validation in a single API, LaminDB provides the context & memory to turn messy, agentic biological R&D into a scalable process.
How?

- **lineage** → track inputs & outputs of notebooks, scripts, functions & pipelines with a single line of code
- **lakehouse** → manage, monitor & validate schemas for standard and bio formats; query across many datasets
- **FAIR datasets** → validate & annotate `DataFrame`, `AnnData`, `SpatialData`, `parquet`, `zarr`, …
- **LIMS & ELN** → programmatic experimental design with bio-registries, ontologies & markdown notes
- **unified access** → storage locations (local, S3, GCP, …), SQL databases (Postgres, SQLite) & ontologies
- **reproducible** → auto-track source code & compute environments with data & code versioning
- **change management** → branching & merging similar to git, plan management for agents
- **zero lock-in** → runs anywhere on open standards (Postgres, SQLite, `parquet`, `zarr`, etc.)
- **scalable** → you hit storage & database directly through your `pydata` or R stack, no REST API involved
- **simple** → just `pip install` from PyPI or `install.packages('laminr')` from CRAN
- **distributed** → zero-copy & lineage-aware data sharing across infrastructure (databases & storage locations)
- **integrations** → [git](https://docs.lamin.ai/track#sync-code-with-git), [nextflow](https://docs.lamin.ai/nextflow), [vitessce](https://docs.lamin.ai/vitessce), [redun](https://docs.lamin.ai/redun), and [more](https://docs.lamin.ai/integrations)
- **extensible** → create custom plug-ins based on the Django ORM, the basis for LaminDB's registries

GUI, permissions, audit logs? [LaminHub](https://lamin.ai) is a collaboration hub built on LaminDB similar to how GitHub is built on git.
Who? Scientists and engineers at leading research institutions and biotech companies, including:

- **Industry** → Pfizer, Altos Labs, Ensocell Therapeutics, ...
- **Academia & Research** → scverse, DZNE (National Research Center for Neuro-Degenerative Diseases), Helmholtz Munich (National Research Center for Environmental Health), ...
- **Research Hospitals** → Global Immunological Swarm Learning Network: Harvard, MIT, Stanford, ETH Zürich, Charité, U Bonn, Mount Sinai, ...

From personal research projects to pharma-scale deployments managing petabytes of data across:

| entities | OOMs |
| --- | --- |
| observations & datasets | 10¹² & 10⁶ |
| runs & transforms | 10⁹ & 10⁵ |
| proteins & genes | 10⁹ & 10⁶ |
| biosamples & species | 10⁵ & 10² |
| ... | ... |
## Quickstart To install the Python package with recommended dependencies, use: ```shell pip install lamindb ```
Install with minimal dependencies. The `lamindb` package adds the data-science dependencies that come with the `[full]` extra, see [here](https://github.com/laminlabs/lamindb/blob/2cc91adcf6077c5af69c1a098699085bb0844083/pyproject.toml#L30-L49). If you want a maximally lightweight install of the `lamindb` namespace, use:

```shell
pip install lamindb-core
```

This suffices for basic functionality, but you will get an `ImportError` if you, for example, try to validate a `DataFrame`, because that requires `pandera`.
### Query databases & load artifacts You can browse public databases at [lamin.ai/explore](https://lamin.ai/explore). To query [laminlabs/cellxgene](https://lamin.ai/laminlabs/cellxgene), run: ```python import lamindb as ln db = ln.DB("laminlabs/cellxgene") # a database object for queries df = db.Artifact.to_dataframe() # a dataframe listing datasets & models ``` To get a [specific dataset](https://lamin.ai/laminlabs/cellxgene/artifact/BnMwC3KZz0BuKftR), run: ```python artifact = db.Artifact.get("BnMwC3KZz0BuKftR") # a metadata object for a dataset artifact.describe() # describe the context of the dataset ```
Access the content of the dataset via: ```python local_path = artifact.cache() # return a local path from a cache adata = artifact.load() # load object into memory accessor = artifact.open() # return a streaming accessor ``` You can query by biological entities like `Disease` through plug-in `bionty`: ```python alzheimers = db.bionty.Disease.get(name="Alzheimer disease") df = db.Artifact.filter(diseases=alzheimers).to_dataframe() ``` ### Configure your database You can create a LaminDB instance at [lamin.ai](https://lamin.ai) and invite collaborators. To connect to an existing instance, run: ```shell # log into LaminHub lamin login # then either lamin connect account/name # connect globally in your environment # or lamin connect --here account/name # connect in your current development directory ``` If you prefer to init a new instance instead (no login required), run: ```shell lamin init --storage ./quickstart-data --modules bionty ``` For more configuration, read: [docs.lamin.ai/setup](https://docs.lamin.ai/setup). On the terminal and in a Python session, LaminDB will now auto-connect. ### Save files & folders as artifacts To save a file or folder via the API: ```python import lamindb as ln # → connected lamindb: account/instance open("sample.fasta", "w").write(">seq1\nACGT\n") # create dataset ln.Artifact("sample.fasta", key="sample.fasta").save() # save dataset ``` To save a file or folder via the CLI, run: ```shell lamin save sample.fasta --key sample.fasta ``` To load an artifact via the CLI into a local cache, run: ```shell lamin load --key sample.fasta ``` Read more about the CLI: [docs.lamin.ai/cli](https://docs.lamin.ai/cli). ### Lineage: scripts & notebooks To create a dataset while tracking source code, inputs, outputs, logs, and environment: ```python import lamindb as ln # → connected lamindb: account/instance ln.track() # track code execution open("sample.fasta", "w").write(">seq1\nACGT\n") # create dataset ln.Artifact("sample.fasta", key="sample.fasta").save() # save dataset ln.finish() # mark run as finished ``` Running this snippet as a script (`python create-fasta.py`) produces the following data lineage: ```python artifact = ln.Artifact.get(key="sample.fasta") # get artifact by key artifact.describe() # context of the artifact artifact.view_lineage() # fine-grained lineage ``` Watch a mini video: [youtu.be/jwnHu1PbA9Q](https://youtu.be/jwnHu1PbA9Q)
Access run & transform. ```python run = artifact.run # get the run object transform = artifact.transform # get the transform object run.describe() # context of the run ``` ```python transform.describe() # context of the transform ```
Track a project or an agent plan. Pass a project and/or a plan (a saved markdown artifact) to `ln.track()`, for example:

```python
ln.track(project="My project", plan="./plans/curate-dataset-x.md")
```

Note that you have to create the project or save the agent plan if they don't exist yet:

```shell
# create a project with the CLI
lamin create project "My project"
# save an agent plan with the CLI
lamin save /path/to/.cursor/plans/curate-dataset-x.plan.md
lamin save /path/to/.claude/plans/curate-dataset-x.md
```

Or in Python:

```python
ln.Project(name="My project").save()  # create a project in Python
```
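Runs that consume `sample.fasta` downstream get connected to it in the lineage graph as well. Below is a minimal sketch, assuming run-input tracking is enabled (see the `track-run-inputs` FAQ in this repo); the `derived.fasta` output and the sequence edit are made up for illustration:

```python
import lamindb as ln

ln.track()  # start a tracked run
input_artifact = ln.Artifact.get(key="sample.fasta")
seq = input_artifact.cache().read_text()  # caching/loading registers the artifact as a run input
open("derived.fasta", "w").write(seq.replace("ACGT", "TGCA"))  # derive a new dataset
ln.Artifact("derived.fasta", key="derived.fasta").save()  # saved as an output of the same run
ln.finish()
```

Calling `view_lineage()` on the derived artifact should then show `sample.fasta` as an input of the run that produced it.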
### Lineage: functions & workflows You can achieve the same traceability for functions & workflows: ```python import lamindb as ln @ln.flow() def create_fasta(fasta_file: str = "sample.fasta"): open(fasta_file, "w").write(">seq1\nACGT\n") # create dataset ln.Artifact(fasta_file, key=fasta_file).save() # save dataset if __name__ == "__main__": create_fasta() ``` Beyond what you get for scripts & notebooks, this automatically tracks function & CLI params and integrates well with established Python workflow managers: [docs.lamin.ai/track](https://docs.lamin.ai/track). To integrate advanced bioinformatics pipeline managers like Nextflow, see [docs.lamin.ai/pipelines](https://docs.lamin.ai/pipelines).
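For example, a flow's parameters end up recorded with its run. A minimal sketch, assuming default `ln.flow()` behavior; the `n_repeats` parameter is hypothetical and only serves to illustrate parameter tracking:

```python
import lamindb as ln

@ln.flow()
def create_fasta(fasta_file: str = "sample.fasta", n_repeats: int = 2):
    # function parameters such as `n_repeats` are captured alongside the run
    open(fasta_file, "w").write(">seq1\n" + "ACGT" * n_repeats + "\n")
    ln.Artifact(fasta_file, key=fasta_file).save()

if __name__ == "__main__":
    create_fasta(n_repeats=3)  # the passed value is recorded with this run
```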
A richer example. Here is an automatically generated reconstruction of the project of [Schmidt _et al._ (Science, 2022)](https://pubmed.ncbi.nlm.nih.gov/35113687/), in which a phenotypic CRISPRa screening result is integrated with scRNA-seq data (the lineage graph and screen-result figures are omitted here). You can explore it [here](https://lamin.ai/laminlabs/lamindata/artifact/W1AiST5wLrbNEyVq) on LaminHub or [here](https://github.com/laminlabs/schmidt22) on GitHub.
### Labeling & queries by fields You can label an artifact by running: ```python my_label = ln.ULabel(name="My label").save() # a universal label project = ln.Project(name="My project").save() # a project label artifact.ulabels.add(my_label) artifact.projects.add(project) ``` Query for it: ```python ln.Artifact.filter(ulabels=my_label, projects=project).to_dataframe() ``` You can also query by the metadata that lamindb automatically collects: ```python ln.Artifact.filter(run=run).to_dataframe() # by creating run ln.Artifact.filter(transform=transform).to_dataframe() # by creating transform ln.Artifact.filter(size__gt=1e6).to_dataframe() # size greater than 1MB ``` If you want to include more information into the resulting dataframe, pass `include`. ```python ln.Artifact.to_dataframe(include=["created_by__name", "storage__root"]) # include fields from related registries ``` Note: The query syntax for `DB` objects and for your default database is the same. ### The core data model Here is an overview that illustrates how `Artifact` links to all other registries: ### Queries by features You can annotate datasets and samples with features. Let's define some: ```python from datetime import date ln.Feature(name="gc_content", dtype=float).save() ln.Feature(name="experiment_note", dtype=str).save() ln.Feature(name="experiment_date", dtype=date, coerce=True).save() # accept date strings ``` During annotation, feature names and data types are validated against these definitions. ```python artifact.features.set_values({ "gc_content": 0.55, "experiment_note": "Looks great", "experiment_date": "2025-10-24", }) ``` Query for it: ```python ln.Artifact.filter(experiment_date="2025-10-24").to_dataframe() # query all artifacts annotated with `experiment_date` ``` If you want to include the feature values into the dataframe, pass `include`. ```python ln.Artifact.to_dataframe(include="features") # include the feature annotations ``` ### Lake ♾️ LIMS ♾️ Sheets You can create records for the entities underlying your experiments: samples, perturbations, instruments, etc., for example: ```python ln.Record(name="Sample 1", features={"gc_content": 0.5}).save() ``` You can create relationships of entities: ```python # create a flexible record type to track experiments experiment_type = ln.Record(name="Experiment", is_type=True).save() # create a record of type `Experiment` for your first experiment ln.Record(name="Experiment 1", type=experiment_type).save() # create a feature to link experiments in records, dataframes, etc. ln.Feature(name="experiment", dtype=experiment_type).save() # create a sample record that links the sample to `Experiment 1` via the `experiment` feature ln.Record(name="Sample 2", features={"gc_content": 0.5, "experiment": "Experiment 1"}).save() ``` You can convert any record type to dataframe/sheet: ```python experiment_type.to_dataframe() ```
You can edit records like Excel sheets on LaminHub.
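Because records carry feature values, they can be queried much like artifacts. A hedged sketch, assuming the feature-based filter syntax shown for `Artifact` above carries over to `Record`:

```python
# list all sample records linked to "Experiment 1" via the `experiment` feature
ln.Record.filter(experiment="Experiment 1").to_dataframe()
```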
### Data versioning If you change source code or datasets, LaminDB manages versioning for you. Assume you run a new version of our `create-fasta.py` script to create a new version of `sample.fasta`. ```python import lamindb as ln ln.track() open("sample.fasta", "w").write(">seq1\nTGCA\n") # a new sequence ln.Artifact("sample.fasta", key="sample.fasta", features={"experiment": "Experiment 1"}).save() # annotate with the new experiment ln.finish() ``` If you now query by `key`, you'll get the latest version of this artifact: ```python artifact = ln.Artifact.get(key="sample.fasta") # get artifact by key artifact.versions.to_dataframe() # see all versions of that artifact ``` ### Change management To create a contribution branch and switch to it, run: ```shell lamin switch -c my_branch ``` To merge a contribution branch into `main`, run: ```shell lamin switch main # switch to the main branch lamin merge my_branch # merge contribution branch into main ``` Read more: [docs.lamin.ai/lamindb.branch](https://docs.lamin.ai/lamindb.branch). ### Data sharing To share data in a lineage-aware way, sync objects from a source database to your default database: ```python db = ln.DB("laminlabs/lamindata") artifact = db.Artifact.get(key="example_datasets/mini_immuno/dataset1.h5ad") artifact.save() ``` This is zero-copy for the artifact's data in storage. Read more: [docs.lamin.ai/sync](https://docs.lamin.ai/sync). ### Lakehouse ♾️ feature store Here is how you ingest a `DataFrame`: ```python import pandas as pd df = pd.DataFrame({ "sequence_str": ["ACGT", "TGCA"], "gc_content": [0.55, 0.54], "experiment_note": ["Looks great", "Ok"], "experiment_date": [date(2025, 10, 24), date(2025, 10, 25)], }) ln.Artifact.from_dataframe(df, key="my_datasets/sequences.parquet").save() # no validation ``` To validate & annotate the content of the dataframe, use the built-in schema `valid_features`: ```python ln.Feature(name="sequence_str", dtype=str).save() # define a remaining feature artifact = ln.Artifact.from_dataframe( df, key="my_datasets/sequences.parquet", schema="valid_features" # validate columns against features ).save() artifact.describe() ``` Watch a mini video: [youtu.be/Ji6E7hTnReQ](https://youtu.be/Ji6E7hTnReQ) You can filter for datasets by schema and then launch distributed queries and batch loading. ### Lakehouse beyond tables To validate an `AnnData` with built-in schema `ensembl_gene_ids_and_valid_features_in_obs`, call: ```python import anndata as ad import numpy as np import pandas as pd adata = ad.AnnData( X=np.ones((21, 10)), obs=pd.DataFrame({'cell_type_by_model': ['T cell', 'B cell', 'NK cell'] * 7}), var=pd.DataFrame(index=[f'ENSG{i:011d}' for i in range(10)]) ) artifact = ln.Artifact.from_anndata( adata, key="my_datasets/scrna.h5ad", schema="ensembl_gene_ids_and_valid_features_in_obs" ).save() artifact.describe() ``` To validate a `SpatialData` or any other array-like dataset, you need to construct a `Schema`. You can do this by composing simple `pandera`-style schemas: [docs.lamin.ai/curate](https://docs.lamin.ai/curate). ### Ontologies Plugin `bionty` gives you >20 public ontologies as `SQLRecord` registries. This was used to validate the `ENSG` ids in the `adata` just before. ```python import bionty as bt bt.CellType.import_source() # import the default ontology bt.CellType.to_dataframe() # your extensible cell type ontology in a simple registry ``` You can then create objects, e.g. 
for labeling, analogous to `ULabel`, `Project`, or `Record`: ```python t_cell = bt.CellType.get(name="T cell") artifact.cell_types.add(t_cell) ``` Read more: [docs.lamin.ai/manage-ontologies](https://docs.lamin.ai/manage-ontologies). Watch a mini video: [youtu.be/3vpWjHj3Kw8](https://youtu.be/3vpWjHj3Kw8) ### Save unstructured notes When in your development directory, you can save markdown files as records: ```shell lamin save / ``` ================================================ FILE: docs/api.md ================================================ # API Reference ```{toctree} :maxdepth: 1 :caption: CLI & lamindb :hidden: cli lamindb ``` ```{toctree} :maxdepth: 1 :caption: Modules :hidden: bionty pertdb ``` ================================================ FILE: docs/arrays.md ================================================ --- execute_via: python --- # Stream datasets from storage This guide walks through streaming datasets from disk or cloud storage. ```python # replace with your username and S3 bucket !lamin login testuser1 !lamin init --storage s3://lamindb-ci/test-arrays ``` Import lamindb and track this notebook. ```python import lamindb as ln import numpy as np ln.track() db = ln.DB("laminlabs/lamindata") # we'll pull dataset from there ``` ## DataFrame ### Streaming from a single artifact A dataframe stored as sharded `parquet`. ```python artifact = db.Artifact.get(key="sharded_parquet") ``` ```python artifact.path.view_tree() ``` ```python dataset = artifact.open() ``` This returns a [pyarrow dataset](https://arrow.apache.org/docs/python/dataset.html). ```python dataset ``` ```python dataset.head(5).to_pandas() ``` ### Streaming from a set of artifacts You can open several parquet files as a single dataset by calling `.open()` on the result of a query: ```python dataset = db.Artifact.filter( key__startswith="example_datasets/small", suffix=".parquet", is_latest=True ).open() # open an ArtifactSet for streaming dataset ``` The same is possible for the artifacts in a collection: ```python collection = db.Collection.get(key="sharded_parquet_collection") dataset = collection.open() dataset ``` Once you have a storage-backed dataset, you can query it like this: ```python dataset.to_table().to_pandas() ``` By default `Artifact.open()` and `Collection.open()` use `pyarrow` to lazily open dataframes. `polars` can be also used by passing `engine="polars"`. Note also that `.open(engine="polars")` returns a context manager with [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html). 
```python with collection.open(engine="polars", use_fsspec=True) as lazy_df: display(lazy_df.collect().to_pandas()) ``` ## AnnData We'll need some test data: ```python ln.Artifact("s3://lamindb-ci/test-arrays/pbmc68k.h5ad").save() ln.Artifact("s3://lamindb-ci/test-arrays/testfile.hdf5").save() ``` An `h5ad` artifact stored on s3: ```python artifact = ln.Artifact.get(key="pbmc68k.h5ad") ``` ```python artifact.path ``` ```python adata = artifact.open() ``` This object is an `AnnDataAccessor` object, an `AnnData` object backed in the cloud: ```python adata ``` Without subsetting, the `AnnDataAccessor` object references underlying lazy `h5` or `zarr` arrays: ```python adata.X ``` You can subset it like a normal `AnnData` object: ```python obs_idx = adata.obs.cell_type.isin(["Dendritic cells", "CD14+ Monocytes"]) & ( adata.obs.percent_mito <= 0.05 ) adata_subset = adata[obs_idx] adata_subset ``` Subsets load arrays into memory upon direct access: ```python adata_subset.X ``` To load the entire subset into memory as an actual `AnnData` object, use `to_memory()`: ```python adata_subset.to_memory() ``` It is also possible to add columns to `.obs` and `.var` of cloud AnnData objects without downloading them. First, create a new `AnnData` `zarr` artifact: ```python adata_subset.to_memory().write_zarr("adata_subset.zarr") artifact = ln.Artifact( "adata_subset.zarr", description="test add column to adata" ).save() ``` This is how you add a column: ```python with artifact.open(mode="r+") as adata_accessor: adata_accessor.add_column(where="obs", col_name="ones", col=np.ones(adata_accessor.shape[0])) display(adata_accessor) ``` The version of the artifact is updated after the modification. ```python artifact ``` ```python artifact.delete(permanent=True) ``` ## SpatialData It is also possible to access `AnnData` objects inside `SpatialData` `tables`: ```python artifact = ln.Artifact.connect("laminlabs/lamindata").get( key="visium_aligned_guide_min.zarr" ) access = artifact.open() ``` ```python access ``` ```python access.tables ``` This gives you the same `AnnDataAccessor` object as for a normal `AnnData`. ```python table = access.tables["table"] table ``` You can subset it and read into memory as an actual `AnnData`: ```python table_subset = table[table.obs["clone"] == "diploid"] table_subset ``` ```python adata = table_subset.to_memory() ``` ## Generic HDF5 Let us query a generic HDF5 artifact: ```python artifact = ln.Artifact.get(key="testfile.hdf5") ``` And get a backed accessor: ```python backed = artifact.open() ``` The returned object contains the `.connection` and `h5py.File` or `zarr.Group` in `.storage` ```python backed ``` ```python backed.storage ``` ```python # clean up test instance ln.setup.delete("test-arrays", force=True) ``` ================================================ FILE: docs/bionty.md ================================================ # `bionty` ```{eval-rst} .. automodule:: bionty ``` ================================================ FILE: docs/changelog.md ================================================ # Changelog Actual content in lamin-docs. ================================================ FILE: docs/curate.md ================================================ --- execute_via: python --- # Validate & standardize datasets Data curation with LaminDB ensures your datasets are **validated** and **queryable** through **annotation**. ```{raw} html ``` Curating a dataset with LaminDB means three things: - **Validate** that the dataset matches a desired schema. 
- **Standardize** the dataset (e.g., by fixing typos, mapping synonyms) or update registries if validation fails. - **Annotate** the dataset by linking it against metadata entities so that it becomes queryable. In this guide we'll curate common data structures. Here is a [guide](/faq/curate-any) for the underlying low-level API. Note: If you know either `pydantic` or `pandera`, here is an [FAQ](/faq/pydantic-pandera) that compares LaminDB with both of these tools. ```python # pip install lamindb !lamin init --storage ./test-curate --modules bionty ``` ```python import lamindb as ln ln.track() ``` ## Schema design patterns A {class}`~lamindb.Schema` in LaminDB is a specification that defines the expected structure, data types, and validation rules for a dataset. It is similar to `pydantic.Model` for dictionaries, and `pandera.Schema`, and `pyarrow.lib.Schema` for tables, but supporting more complicated data structures. Schemas ensure data consistency by defining: - What {class}`~lamindb.Feature`s (dimensions) exist in your dataset - What data types those features should have - What values are valid for categorical features - Which {class}`~lamindb.Feature`s are required vs optional An exemplary schema: ```python schema = ln.Schema( name="experiment_schema", # human-readable name features=[ # required features ln.Feature(name="cell_type", dtype=bt.CellType), ln.Feature(name="treatment", dtype=str), ], otype="DataFrame" # object type (DataFrame, AnnData, etc.) ) ``` For composite data structures using slots: ```{dropdown} What are slots? For composite data structures, you need to specify which component contains which schema, for example, to validate both cell metadata in `.obs` and gene metadata in `.var` within the same schema. Each slot is a key like `"obs"` for AnnData observations,`"rna:var"` for MuData modalities, or `"attrs:nested:key"` for SpatialData annotations. ``` ```python # AnnData with multiple "slots" adata_schema = ln.Schema( otype="AnnData", slots={ "obs": cell_metadata_schema, # cell annotations "var.T": gene_id_schema # gene-derived features } ) ``` Before diving into curation, let's understand the different schema approaches and when to use each one. Think of schemas as rules that define what valid data should look like. ### Flexible schema Use when: You want to validate those columns whose names match feature names in your `Feature` registry. ```{eval-rst} .. literalinclude:: scripts/define_valid_features.py :language: python ``` ### Minimal required schema Use when: You need certain columns but want flexibility for additional metadata. ```{eval-rst} .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py :language: python ``` ### Strict Schema Use when: You need complete control over data structure and values. ```python # Only allows specified columns schema = ln.Schema( features=[...], minimal_set=True, # whether all passed features are required maximal_set=False # whether additional features are allowed ) ``` ## DataFrame ### Step 1: Load and examine your data We'll be working with the mini immuno dataset: ```python df = ln.examples.datasets.mini_immuno.get_dataset1( with_cell_type_synonym=True, with_cell_type_typo=True ) df ``` ### Step 2: Set up your metadata registries Before creating a schema, ensure your registries have the right features and labels: ```{eval-rst} .. 
literalinclude:: scripts/define_mini_immuno_features_labels.py :language: python ``` ### Step 3: Create your schema ```python schema = ln.examples.datasets.mini_immuno.define_mini_immuno_schema_flexible() schema.describe() ``` ### Step 4: Initialize Curator and first validation If you expect the validation to pass, you can directly register an artifact by providing the schema: ```python artifact = ln.Artifact.from_dataframe(df, key="examples/my_curated_dataset.parquet", schema=schema).save() ``` The {meth}`~lamindb.curators.core.Curator.validate` method validates that your dataset adheres to the criteria defined by the `schema`. It identifies which values are already validated (exist in the registries) and which are potentially problematic (do not yet exist in our registries). ```python try: curator = ln.curators.DataFrameCurator(df, schema) curator.validate() except ln.errors.ValidationError as error: print(error) ``` ### Step 5: Fix validation issues ```python # check the non-validated terms curator.cat.non_validated ``` For `cell_type_by_expert`, we saw 2 terms are not validated. First, let's standardize synonym "B-cell" as suggested ```python curator.cat.standardize("cell_type_by_expert") ``` ```python # now we have only one non-validated cell type left curator.cat.non_validated ``` For "CD8-pos alpha-beta T cell", let's understand which cell type in the public ontology might be the actual match. ```python # to check the correct spelling of categories, pass `public=True` to get a lookup object from public ontologies # use `lookup = curator.cat.lookup()` to get a lookup object of existing records in your instance lookup = curator.cat.lookup(public=True) lookup ``` ```python # here is an example for the "cell_type" column cell_types = lookup["cell_type_by_expert"] cell_types.cd8_positive_alpha_beta_t_cell ``` ```python # fix the cell type name df["cell_type_by_expert"] = df["cell_type_by_expert"].cat.rename_categories( {"CD8-pos alpha-beta T cell": cell_types.cd8_positive_alpha_beta_t_cell.name} ) ``` For perturbation, we want to add the new values: "DMSO", "IFNG" ```python # this adds perturbations that were _not_ validated curator.cat.add_new_from("perturbation") ``` ```python ln.Feature.get(name="perturbation") ``` ```python # validate again curator.validate() ``` ### Step 6: Save your curated dataset ```python artifact = curator.save_artifact(key="examples/my_curated_dataset.parquet") ``` ```python artifact.describe() ``` ## Common fixes This section covers the most frequent curation issues and their solutions. Use this as a reference when validation fails. ### Feature validation issues **Issue**: "Column not in dataframe" ``` "column 'treatment' not in dataframe. Columns in dataframe: ['drug', 'timepoint', ...]" ``` **Solutions**: ```python # Solution 1: Rename columns to match schema df = df.rename(columns={ 'treatment': 'drug', 'time': 'timepoint', ... }) # Solution 2: Create missing columns df['treatment'] = 'unknown' # Add with default value (or define Feature.default_value) # Solution 3: Modify schema to match your data schema = ln.Schema( features=[ ln.Feature.get(name="drug"), # Use actual column name ln.Feature.get(name="timepoint"), ], ... 
) ``` ### Value validation issues **Issue**: "Terms not validated in feature 'perturbation'" ``` 2 terms not validated in feature 'cell_type': 'B-cell', 'CD8-pos alpha-beta T cell' 1 synonym found: "B-cell" → "B cell" → curate synonyms via: .standardize("cell_type") for remaining terms: → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('cell_type') ``` **Solutions**: ```python # Solution 1: Use automatic standardization if given hint (handles synonyms)) curator.cat.standardize('cell_type') # Solution 2: Manual mapping for complex cases value_mapping = { 'T-cells': 'T cell', 'B-cells': 'B cell', } df['cell_type'] = df['cell_type'].map(value_mapping).fillna(df['cell_type']) # Solution 3: Use public ontology lookup for correct names lookup = curator.cat.lookup(public=True) cell_types = lookup["cell_type"] df['cell_type'] = df['cell_type'].cat.rename_categories({ 'CD8-pos T cell': cell_types.cd8_positive_alpha_beta_t_cell.name }) # Solution 4: Add new legitimate terms curator.cat.add_new_from("cell_type") ``` ### Data type issues **Issue**: "Expected categorical data, got object" ``` TypeError: Expected categorical data for cell_type, got object ``` **Solutions**: ```python # Solution 1: Convert to categorical df['cell_type'] = df['cell_type'].astype('category') # Solution 2: Use coercion in feature definition ln.Feature(name="cell_type", dtype=bt.CellType, coerce=True).save() ``` ### Organism-specific ontology issues **Issue**: "Terms not validated" for organism-specific ontologies like developmental stages ``` 2 terms not validated in feature 'developmental_stage_ontology_id': 'MmusDv:0000142', 'MmusDv:0000022' ``` **Solution**: Specify organism-specific source in feature definition using `cat_filters`: ```python # When defining the schema, specify the organism-specific source mouse_source = bt.Source.filter( entity="bionty.DevelopmentalStage", organism="mouse" ).one() schema = ln.Schema( features=[ ln.Feature( name="developmental_stage_ontology_id", dtype=bt.DevelopmentalStage.ontology_id, cat_filters={"source": mouse_source} # Specify organism-specific source ) ], ... ) ``` This pattern applies to any ontology where the same registry serves multiple organisms (e.g., `DevelopmentalStage`, `Phenotype`, ...). ## External data validation Since not all metadata is always stored within the dataset itself, it is also possible to validate external metadata. ```{eval-rst} .. literalinclude:: scripts/curate_dataframe_external_features.py :language: python :caption: curate_dataframe_external_features.py ``` ```python !python scripts/curate_dataframe_external_features.py ``` ## Union dtypes Some metadata columns might validate against several registries. ```{eval-rst} .. literalinclude:: scripts/curate_dataframe_union_features.py :language: python :caption: curate_dataframe_union_features.py ``` ```python !python scripts/curate_dataframe_union_features.py ``` ## AnnData `AnnData` like all other data structures that follow is a composite structure that stores different arrays in different `slots`. ### Allow a flexible schema We can also allow a flexible schema for an `AnnData` and only require that it's indexed with Ensembl gene IDs. ```{eval-rst} .. literalinclude:: scripts/curate_anndata_flexible.py :language: python :caption: curate_anndata_flexible.py ``` Let's run the script. 
```python !python scripts/curate_anndata_flexible.py ``` Under the hood, this uses the following built-in schema ({func}`~lamindb.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs`): ```{eval-rst} .. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py :language: python ``` This schema transposes the `var` DataFrame during curation, so that one validates and annotates the columns of `var.T`, i.e., `[ENSG00000153563, ENSG00000010610, ENSG00000170458]`. If one doesn't transpose, one would annotate the columns of `var`, i.e., `[gene_symbol, gene_type]`. ```{eval-rst} .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/gLyfToATM7WUzkWW0001.png :width: 800px ``` ### Fix validation issues ```python adata = ln.examples.datasets.mini_immuno.get_dataset1( with_gene_typo=True, with_cell_type_typo=True, otype="AnnData" ) adata ``` ```python schema = ln.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs() schema.describe() ``` Check the slots of a schema: ```python schema.slots ``` ```python curator = ln.curators.AnnDataCurator(adata, schema) try: curator.validate() except ln.errors.ValidationError as error: print(error) ``` As above, we leverage a lookup object with valid cell types to find the correct name. ```python valid_cell_types = curator.slots["obs"].cat.lookup()["cell_type_by_expert"] adata.obs["cell_type_by_expert"] = adata.obs[ "cell_type_by_expert" ].cat.rename_categories( {"CD8-pos alpha-beta T cell": valid_cell_types.cd8_positive_alpha_beta_t_cell.name} ) ``` The validated `AnnData` can be subsequently saved as an {class}`~lamindb.Artifact`: ```python adata.obs.columns ``` ```python curator.slots["var.T"].cat.add_new_from("columns") ``` ```python curator.validate() ``` ```python artifact = curator.save_artifact(key="examples/my_curated_anndata.h5ad") ``` Access the schema for each slot: ```python artifact.features.slots ``` The saved artifact has been annotated with validated features and labels: ```python artifact.describe() ``` ## Unstructured dictionaries Most data structures support unstructured metadata stored as dictionaries: - Pandas DataFrames: `.attrs` - AnnData: `.uns` - MuData: `.uns` and `modality:uns` - SpatialData: `.attrs` Here, as an example, we show how to curate such metadata for AnnData: ```{eval-rst} .. literalinclude:: scripts/define_schema_anndata_uns.py :language: python :caption: define_schema_anndata_uns.py ``` ```python !python scripts/define_schema_anndata_uns.py ``` ```{eval-rst} .. literalinclude:: scripts/curate_anndata_uns.py :language: python :caption: curate_anndata_uns.py ``` ```python !python scripts/curate_anndata_uns.py ``` ## MuData ```{eval-rst} .. literalinclude:: scripts/curate_mudata.py :language: python :caption: curate_mudata.py ``` ```python !python scripts/curate_mudata.py ``` ## SpatialData ```{eval-rst} .. literalinclude:: scripts/define_schema_spatialdata.py :language: python :caption: define_schema_spatialdata.py ``` ```python !python scripts/define_schema_spatialdata.py ``` ```{eval-rst} .. literalinclude:: scripts/curate_spatialdata.py :language: python :caption: curate_spatialdata.py ``` ```python !python scripts/curate_spatialdata.py ``` ## TiledbsomaExperiment ```{eval-rst} .. literalinclude:: scripts/curate_soma_experiment.py :language: python :caption: curate_soma_experiment.py ``` ```python !python scripts/curate_soma_experiment.py ``` ## Other data structures If you have other data structures, read: {doc}`/faq/curate-any`.
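The linked guide builds on the low-level {class}`~lamindb.models.CanCurate` methods that registries like {class}`bionty.CellType` provide. As a minimal sketch (assuming the `bionty` module is mounted, as in this guide, and that the values you want to check sit in a plain Python list), you can validate any iterable directly against a registry before worrying about the container it lives in:

```python
import bionty as bt

values = ["T cell", "B-cell", "dendritic cell"]
bt.CellType.validate(values)     # boolean vector: which values have an exact match in the registry
bt.CellType.inspect(values)      # logs hints, e.g. that "B-cell" can be standardized to "B cell"
bt.CellType.standardize(values)  # returns the values with known synonyms mapped to standard names
```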
```python !rm -rf ./test-curate !rm -rf ./small_dataset.tiledbsoma !lamin delete --force test-curate ``` ================================================ FILE: docs/faq/acid.md ================================================ --- execute_via: python --- # Will data & metadata stay in sync? Here, we walk through different errors that can occur while saving artifacts & metadata records, and show that the LaminDB instance does not get corrupted by dangling metadata or artifacts. Transactions within Python across data & metadata are [ACID](https://en.wikipedia.org/wiki/ACID). If an upload process is externally killed and Python cannot run clean-up operations anymore, the artifact is internally still flagged with `artifact._storage_ongoing = True`. This is visible on the UI. You can then re-run `lamin save` or `artifact.save()` to attempt uploading the artifact a second time. ```python !lamin init --storage ./test-acid ``` ```python import pytest import lamindb as ln from upath import UPath ln.settings.verbosity = "debug" ``` ```python open("sample.fasta", "w").write(">seq1\nACGT\n") ``` ## Save error due to failed upload within Python Let's try to save an artifact to a storage location without permission. ```python artifact = ln.Artifact("sample.fasta", key="sample.fasta") ``` Because the public API only allows you to set a default storage for which you have permission, we need to hack it: ```python ln.settings.storage._root = UPath("s3://nf-core-awsmegatests") ``` This raises an exception but nothing gets saved: ```python with pytest.raises(PermissionError) as error: artifact.save() print(error.exconly()) assert len(ln.Artifact.filter()) == 0 ``` ## Save error during bulk creation ```python artifacts = [artifact, "this is not a record"] ``` This raises an exception but nothing gets saved: ```python with pytest.raises(Exception) as error: ln.save(artifacts) print(error.exconly()) assert len(ln.Artifact.filter()) == 0 # nothing got saved ``` If a list of data objects is passed to `ln.save()` and the upload of one of these data objects fails, the successful uploads are maintained and a `RuntimeError` is raised, listing the successfully uploaded data objects up until that point. ## Save error due to externally aborted upload Back to a proper storage location: ```python ln.settings.storage._root = UPath("./test-acid").absolute() ``` The save operation works: ```python artifact.save() ``` Let's pretend the upload was killed. ```python artifact._storage_ongoing = True artifact.save() artifact.path.unlink() assert artifact._aux == {"so": 1} # storage/upload is ongoing ``` We can re-run it: ```python artifact = ln.Artifact("sample.fasta", key="sample.fasta").save() ``` ```python assert not artifact._storage_ongoing assert artifact._aux is None ``` ```python !rm -r ./test-acid !lamin delete --force test-acid ``` ================================================ FILE: docs/faq/curate-any.md ================================================ --- execute_via: python --- # How do I validate & annotate arbitrary data structures? This guide walks through the low-level API that lets you validate iterables. You can then use the records created during validation to annotate a dataset. :::{dropdown} How do I validate based on a public ontology? LaminDB makes it easy to validate categorical variables based on registries that inherit from {class}`~lamindb.models.CanCurate`. {class}`~lamindb.models.CanCurate` methods validate against the registries in your LaminDB instance.
In {doc}`/manage-ontologies`, you'll see how to extend standard validation to validation against _public references_ using a `PublicOntology` object, e.g., via `public_genes = bt.Gene.public()`. By default, {meth}`~lamindb.models.CanCurate.from_values` considers a match in a public reference a validated value for any {mod}`bionty` entity. ::: ```python # pip install 'lamindb[zarr]' !lamin init --storage ./test-curate-any --modules bionty ``` Define a test dataset. ```python import lamindb as ln import bionty as bt import zarr import numpy as np data = zarr.open_group(store="data.zarr", mode="a") data.create_dataset(name="temperature", shape=(3,), dtype="float32") data.create_dataset(name="knockout_gene", shape=(3,), dtype=str) data.create_dataset(name="disease", shape=(3,), dtype=str) data["knockout_gene"][:] = np.array( ["ENSG00000139618", "ENSG00000141510", "ENSG00000133703"] ) data["disease"][:] = np.random.default_rng().choice( ["MONDO:0004975", "MONDO:0004980"], 3 ) ``` ## Validate and standardize vectors Read the `disease` array from the zarr group into memory. ```python disease = data["disease"][:] ``` {meth}`~lamindb.models.CanCurate.validate` validates vector-like values against reference values in a registry. It returns a boolean vector indicating where a value has an exact match in the reference values. ```python bt.Disease.validate(disease, field=bt.Disease.ontology_id) ``` When validation fails, you can call {meth}`~lamindb.models.CanCurate.inspect` to figure out what to do. {meth}`~lamindb.models.CanCurate.inspect` applies the same definition of validation as {meth}`~lamindb.models.CanCurate.validate`, but returns a rich {class}`~lamindb.models.InspectResult` object. Most importantly, it logs recommended curation steps that would render the data validated. Note: you can use {meth}`~lamindb.models.CanCurate.standardize` to standardize synonyms. ```python bt.Disease.inspect(disease, field=bt.Disease.ontology_id) ``` Bulk creating records using {meth}`~lamindb.models.CanCurate.from_values` only returns validated records. ```python diseases = bt.Disease.from_values(disease, field=bt.Disease.ontology_id).save() ``` Repeat the process for more labels: ```python experiments = ln.Record.from_values( ["Experiment A", "Experiment B"], field=ln.Record.name, create=True, # create non-validated labels ).save() genes = bt.Gene.from_values( data["knockout_gene"][:], field=bt.Gene.ensembl_gene_id ).save() ``` ## Annotate the dataset Register the dataset as an artifact: ```python artifact = ln.Artifact("data.zarr", key="my_dataset.zarr").save() ``` Annotate with features: ```python ln.Feature(name="experiment", dtype=ln.Record).save() ln.Feature(name="disease", dtype=bt.Disease.ontology_id).save() ln.Feature(name="knockout_gene", dtype=bt.Gene.ensembl_gene_id).save() artifact.features.set_values( {"experiment": experiments, "knockout_gene": genes, "disease": diseases} ) artifact.describe() ``` ```python # clean up test instance !rm -r data.zarr !rm -r ./test-curate-any !lamin delete --force test-curate-any ``` ================================================ FILE: docs/faq/idempotency.md ================================================ --- execute_via: python --- # Will data get duplicated upon re-running code? LaminDB's operations are idempotent in the sense defined here, which allows you to re-run code without duplicating data.
:::{admonition} SQLRecords with `name` field When you instantiate {class}`~lamindb.models.SQLRecord` with a name, in case a name has an _exact match_ in a registry, the constructor returns it instead of creating a new record. In case records with _similar names_ exist, you'll see them in a table: you can then decide whether you want to save the new record or pick an existing record. If you set {attr}`~lamindb.core.subsettings.CreationSettings.search_names` to `False`, you bypass these checks. ::: :::{admonition} Artifacts & collections If you instantiate {class}`~lamindb.Artifact` from data that already exists as an artifact, the `Artifact()` constructor returns the existing artifact based on a hash lookup. ::: ```python # pip install lamindb !lamin init --storage ./test-idempotency ``` ```python import lamindb as ln ln.track("ANW20Fr4eZgM0000") ``` ## SQLRecords with name field ```python assert ln.settings.creation.search_names ``` Let us add a first record to the {class}`~lamindb.Record` registry: ```python label = ln.Record(name="My label 1").save() ``` If we create a new record, we'll automatically get search results that give clues on whether we are prone to duplicating an entry: ```python label = ln.Record(name="My label 1a") ``` Let's save the `1a` label, we actually intend to create it. ```python label.save() ``` In case we match an existing name directly, we'll get the existing object: ```python label = ln.Record(name="My label 1") ``` If we save it again, it will not create a new entry in the registry: ```python label.save() ``` Now, if we create a third record, we'll get two alternatives: ```python label = ln.Record(name="My label 1b") ``` If we prefer to not perform a search, e.g. for performance reasons, we can switch it off. ```python ln.settings.creation.search_names = False label = ln.Record(name="My label 1c") ``` Switch it back on: ```python ln.settings.creation.search_names = True ``` ## Artifacts & collections ```python filepath = ln.examples.datasets.file_fcs() ``` Create an `Artifact`: ```python artifact = ln.Artifact(filepath, key="my_fcs_file.fcs").save() ``` ```python assert artifact.hash == "rCPvmZB19xs4zHZ7p_-Wrg" assert artifact.run == ln.context.run assert not artifact.recreating_runs.exists() ``` Create an `Artifact` from the same path: ```python artifact2 = ln.Artifact(filepath, key="my_fcs_file.fcs") ``` It gives us the existing object: ```python assert artifact.id == artifact2.id assert artifact.run == artifact2.run assert not artifact.recreating_runs.exists() ``` If you save it again, nothing will happen (the operation is idempotent): ```python artifact2.save() ``` In the hidden cell below, you'll see how this interplays with data lineage. ```python ln.track(new_run=True) artifact3 = ln.Artifact(filepath, key="my_fcs_file.fcs") assert artifact3.id == artifact2.id assert artifact3.run == artifact2.run != ln.context.run # run is not updated assert artifact2.recreating_runs.first() == ln.context.run ``` ```python !rm -rf ./test-idempotency !lamin delete --force test-idempotency ``` ================================================ FILE: docs/faq/import-modules.md ================================================ --- execute_via: python --- # What happens if I import a schema module without lamindb? 
```python # !pip install 'lamindb[bionty]' !lamin init --storage testmodule --modules bionty ``` Upon `import`, nothing yet happens: ```python import bionty as bt ``` If you try to access an attribute (other than `model`), you'll load the instance in the same way as calling `import lamindb`. Under the hood, `lamindb` is imported! ```python assert bt.Organism(name="human") is not None ``` ```python !lamin delete --force testmodule ``` ================================================ FILE: docs/faq/keep-artifacts-local.md ================================================ --- execute_via: python --- # Keep artifacts local in a cloud instance If you want to default to keeping artifacts local in a cloud instance, enable {attr}`~lamindb.setup.core.InstanceSettings.keep_artifacts_local`. Let us first create a cloud instance that woul store artifacts exclusively on S3. ```python !lamin login testuser1 !lamin init --storage s3://lamindb-ci/keep-artifacts-local ``` Let's import lamindb and track the current notebook run. ```python # pip install lamindb import lamindb as ln ln.track("l9lFf83aPwRc") ``` ## Toggling setting "keep artifacts local" You can checkmark the "Keep artifacts local" box on the instance settings tab. Or toggle it through the following instance setting. ```python ln.setup.settings.instance.keep_artifacts_local = True ``` ## Create a local storage location Call the following for a -- potentially pre-existing -- root path and a unique host identifier. ```python ln.Storage(root="./our_local_storage", host="abc-institute-drive1").save() ``` Now, you have two storage locations: one in the S3 bucket, and the other locally. ```python ln.Storage.to_dataframe() ``` You can now set it as a local default storage location. Next time you connect to the instance, this won't be necessary and the location will be automatically detected as the local default. ```python ln.settings.local_storage = "./our_local_storage" ``` ## Use a local storage location If you save an artifact in keep-artifacts-local mode, by default, it's stored in local storage. ```python original_filepath = ln.examples.datasets.file_fcs() artifact = ln.Artifact(original_filepath, key="example_datasets/file1.fcs").save() local_path = artifact.path # local storage path local_path ``` You'll see the `.fcs` file named by the `uid` in your `.lamindb/` directory under `./our_local_storage/`: ```python assert artifact.path.exists() assert artifact.path.as_posix().startswith(ln.settings.local_storage.root.as_posix()) ln.settings.local_storage.root.view_tree() ``` ## Pre-existing artifacts Assume you already have a file in your local storage location: ```python file_in_local_storage = ln.examples.datasets.file_bam() file_in_local_storage.rename("./our_local_storage/output.bam") ln.UPath("our_local_storage/").view_tree() ``` When registering an artifact for it, it remains where it is. ```python my_existing_file = ln.Artifact("./our_local_storage/output.bam").save() ln.UPath("our_local_storage/").view_tree() ``` The storage path of the artifact matches the pre-existing file: ```python my_existing_file.path ``` ## Switching between local storage locations You might have several local storage locations. Here is how you can switch between them. ```python ln.Storage(root="./our_local_storage2", host="abc-institute-drive1").save() ln.settings.local_storage = "./our_local_storage2" # switch to the new storage location ``` Ingest a file into the new local storage location. 
```python filepath = ln.examples.datasets.file_fastq() artifact3 = ln.Artifact(filepath, key="example_datasets/file.fastq.gz").save() ``` Inspect where all the files are. ```python ln.Artifact.to_dataframe(include=["storage__root", "storage__region"]) ``` ## Upload a local artifact to the cloud If you'd like to upload an artifact to the cloud storage location to more easily share it or view it through web applications, you pass `upload=True` to the `save()` method. ```python artifact.save(upload=True) ``` You now see the artifact in the S3 bucket: ```python ln.settings.storage.root.view_tree() ``` And it's no longer present in local storage: ```python assert artifact.path.exists() assert not local_path.exists() assert artifact.path.as_posix().startswith(ln.settings.storage.root.as_posix()) ln.settings.local_storage.root.view_tree() ``` ## Upload directly to the cloud You can also directly upload via `upload=True`: ```python filepath = ln.examples.datasets.file_mini_csv() artifact2 = ln.Artifact(filepath, key="example_datasets/mini.csv").save(upload=True) artifact2.path ``` Now we have two files on S3: ```python ln.Artifact.to_dataframe(include="storage__root") ``` ## Update storage description You can add a description to the storage location by using the `description` field. ```python storage_record = ln.Storage.get(root__endswith="our_local_storage") storage_record.description = "Our shared directory for project X" storage_record.save() ln.Storage.to_dataframe() ``` ## Delete the test instance Delete the artifacts: ```python artifact.delete(permanent=True) artifact2.delete(permanent=True) artifact3.delete(permanent=True) my_existing_file.delete(permanent=True, storage=False) ``` Delete the instance: ```python ln.setup.delete("keep-artifacts-local", force=True) ``` ================================================ FILE: docs/faq/pydantic-pandera.md ================================================ --- execute_via: python --- # Pydantic & Pandera vs. LaminDB This doc explains conceptual differences between data validation with `pydantic`, `pandera`, and `LaminDB`. ```python !lamin init --storage test-pydantic-pandera --modules bionty ``` Let us work with a test dataframe. ```python import pandas as pd import pydantic import lamindb as ln import bionty as bt import pandera.pandas as pandera import pprint from typing import Literal, Any df = ln.examples.datasets.mini_immuno.get_dataset1() df ``` ## Define a schema ### pydantic ```python Perturbation = Literal["DMSO", "IFNG"] CellType = Literal["T cell", "B cell"] OntologyID = Literal["EFO:0008913"] class ImmunoSchema(pydantic.BaseModel): perturbation: Perturbation cell_type_by_model: CellType cell_type_by_expert: CellType assay_oid: OntologyID concentration: str treatment_time_h: int donor: str | None class Config: title = "My immuno schema" ``` ### pandera ```python pandera_schema = pandera.DataFrameSchema( { "perturbation": pandera.Column( str, checks=pandera.Check.isin(["DMSO", "IFNG"]) ), "cell_type_by_model": pandera.Column( str, checks=pandera.Check.isin(["T cell", "B cell"]) ), "cell_type_by_expert": pandera.Column( str, checks=pandera.Check.isin(["T cell", "B cell"]) ), "assay_oid": pandera.Column(str, checks=pandera.Check.isin(["EFO:0008913"])), "concentration": pandera.Column(str), "treatment_time_h": pandera.Column(int), "donor": pandera.Column(str, nullable=True), }, name="My immuno schema", ) ``` ### LaminDB Features & labels are defined on the level of the database instance. 
You can either define a schema with required (and optional) columns. ```python ln.Record(name="DMSO").save() ln.Record(name="IFNG").save() # leverage ontologies through types ln.Record, bt.CellType, bt.ExperimentalFactor lamindb_schema = ln.Schema( name="My immuno schema", features=[ ln.Feature(name="perturbation", dtype=ln.Record).save(), ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(), ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(), ln.Feature(name="assay_oid", dtype=bt.ExperimentalFactor.ontology_id).save(), ln.Feature(name="concentration", dtype=str).save(), ln.Feature(name="treatment_time_h", dtype=int).save(), ln.Feature(name="donor", dtype=str, nullable=True).save(), ], ).save() ``` Or merely define a constraint on the feature identifier. ```python lamindb_schema_only_itype = ln.Schema( name="Allow any valid features & labels", itype=ln.Feature ) ``` ## Validate a dataframe ### pydantic ```python class DataFrameValidationError(Exception): pass def validate_dataframe(df: pd.DataFrame, model: type[pydantic.BaseModel]): errors = [] for i, row in enumerate(df.to_dict(orient="records")): try: model(**row) except pydantic.ValidationError as e: errors.append(f"row {i} failed validation: {e}") if errors: error_message = "\n".join(errors) raise DataFrameValidationError( f"DataFrame validation failed with the following errors:\n{error_message}" ) ``` ```python try: validate_dataframe(df, ImmunoSchema) except DataFrameValidationError as e: print(e) ``` To fix the validation error, we need to update the `Literal` and re-run the model definition. ```python Perturbation = Literal["DMSO", "IFNG"] CellType = Literal[ "T cell", "B cell", "CD8-positive, alpha-beta T cell" # <-- updated ] OntologyID = Literal["EFO:0008913"] class ImmunoSchema(pydantic.BaseModel): perturbation: Perturbation cell_type_by_model: CellType cell_type_by_expert: CellType assay_oid: OntologyID concentration: str treatment_time_h: int donor: str | None class Config: title = "My immuno schema" ``` ```python validate_dataframe(df, ImmunoSchema) ``` ### pandera ```python try: pandera_schema.validate(df) except pandera.errors.SchemaError as e: print(e) ``` ### LaminDB Because the term `"CD8-positive, alpha-beta T cell"` is part of the public `CellType` ontology, validation passes the first time. If validation had not passed, we could have resolved the issue simply by adding a new term to the `CellType` registry rather than editing the code. This also puts downstream data scientists into a position to update ontologies. ```python curator = ln.curators.DataFrameCurator(df, lamindb_schema) curator.validate() ``` What was the cell type validation based on? Let's inspect the `CellType` registry. ```python bt.CellType.to_dataframe() ``` The `CellType` registry is hierarchical as it contains the Cell Ontology. ```python bt.CellType.get(name="CD8-positive, alpha-beta T cell").view_parents() ``` ## Overview of validation properties Importantly, LaminDB offers not only a `DataFrameCurator`, but also an `AnnDataCurator`, `MuDataCurator`, `SpatialDataCurator`, and `TiledbsomaCurator`. The overview below only concerns validating dataframes.
### Experience of data engineer | property | `pydantic` | `pandera` | `lamindb` | | ------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------- | ----------------------------------------------------- | ------------------------------------------------------------------------------------- | | define schema as code | yes, in form of a `pydantic.BaseModel` | yes, in form of a `pandera.DataFrameSchema` | yes, in form of a `lamindb.Schema` | | define schema as a set of constraints without the need of listing fields/columns/features; e.g. useful if validating 60k genes | no | no | yes | | update labels independent of code | not possible because labels are enums/literals | not possible because labels are hard-coded in `Check` | possible by adding new terms to a registry | | built-in validation from public ontologies | no | no | yes | | sync labels with ELN/LIMS registries without code change | no | no | yes | | can re-use fields/columns/features across schemas | limited via subclass | only in same Python session | yes because persisted in database | | schema modifications can invalidate previously validated datasets | yes | yes | no because LaminDB allows to query datasets that were validated with a schema version | | can use columnar organization of dataframe | no, need to iterate over potentially millions of rows | yes | yes | ### Experience of data consumer | property | `pydantic` | `pandera` | `lamindb` | | ------------------------------------------- | ----------------------------------------------------------------------------- | --------------------- | -------------------------------------- | | dataset is queryable / findable | no | no | yes, by querying for labels & features | | dataset is annotated | no | no | yes | | user knows what validation constraints were | no, because might not have access to code and doesn't know which code was run | no (same as pydantic) | yes, via `artifact.schema` | ## Annotation & queryability ### Engineer: annotate the dataset Either use the `Curator` object: ```python artifact = curator.save_artifact(key="our_datasets/dataset1.parquet") ``` If you don't expect a need for Curator functionality for updating ontologies and standardization, you can also use the `Artifact` constructor. ```python artifact = ln.Artifact.from_dataframe( df, key="our_datasets/dataset1.parquet", schema=lamindb_schema ).save() ``` ### Consumer: see annotations ```python artifact.describe() ``` ### Consumer: query the dataset ```python ln.Artifact.filter(perturbation="IFNG").to_dataframe() ``` ### Consumer: understand validation By accessing `artifact.schema`, the consumer can understand _how_ the dataset was validated. ```python artifact.schema ``` ```python artifact.schema.features.to_dataframe() ``` ## Nested data with dynamic keys We will now examine another more complex example where data is nested with potentially arbitrary (dynamic) keys. The example is inspired by the [CELLxGENE schema](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/6.0.0/schema.md#uns-dataset-metadata) where annotations are stored as dictionaries in the AnnData `.uns` slot. ```python uns_dict = ln.examples.datasets.dict_cellxgene_uns() pprint.pprint(uns_dict) ``` ### pydantic Pydantic is primed to deal with nested data. 
```python class Images(pydantic.BaseModel): fullres: str hires: str class Scalefactors(pydantic.BaseModel): spot_diameter_fullres: float tissue_hires_scalef: float class Library(pydantic.BaseModel): images: Images scalefactors: Scalefactors class Spatial(pydantic.BaseModel): is_single: bool model_config = {"extra": "allow"} def __init__(self, **data): libraries = {} other_fields = {} # store all libraries under a single key for validation for key, value in data.items(): if key.startswith("library_"): libraries[key] = Library(**value) else: other_fields[key] = value other_fields["libraries"] = libraries super().__init__(**other_fields) class SpatialDataSchema(pydantic.BaseModel): organism_ontology_term_id: str spatial: Spatial validated_data = SpatialDataSchema(**uns_dict) ``` However, pydantic either requires all dictionary keys to be known beforehand to construct the model classes, or it requires workarounds that collect all dynamic keys under a single model. ### pandera Pandera cannot validate dictionaries because it is designed for structured dataframe data. Therefore, we need to flatten the dictionary to transform it into a DataFrame: ```python def _flatten_dict(d: dict[Any, Any], parent_key: str = "", sep: str = "_"): items = [] for k, v in d.items(): new_key = f"{parent_key}{sep}{k}" if parent_key else k if isinstance(v, dict): items.extend(_flatten_dict(v, new_key, sep=sep).items()) else: items.append((new_key, v)) return dict(items) ``` ```python def create_dynamic_schema(flattened_data: dict[str, Any]): schema_dict = { "organism_ontology_term_id": pandera.Column(str), "spatial_is_single": pandera.Column(bool), } for key in flattened_data.keys(): if key.startswith("spatial_library_") and key.endswith("_images_fullres"): lib_prefix = key.replace("_images_fullres", "") schema_dict.update( { f"{lib_prefix}_images_fullres": pandera.Column(str), f"{lib_prefix}_images_hires": pandera.Column(str), f"{lib_prefix}_scalefactors_spot_diameter_fullres": pandera.Column( float ), f"{lib_prefix}_scalefactors_tissue_hires_scalef": pandera.Column( float ), } ) return pandera.DataFrameSchema(schema_dict) flattened = _flatten_dict(uns_dict) df = pd.DataFrame([flattened]) spatial_schema = create_dynamic_schema(flattened) validated_df = spatial_schema.validate(df) ``` Analogously to pydantic, pandera does not have out-of-the-box support for dynamically named keys. Therefore, it is necessary to dynamically construct a pandera schema. ### LaminDB Similarly, LaminDB currently requires constructing flattened dataframes to dynamically create features for the schema, which can then be used for validation with the `DataFrameCurator`. Future improvements are expected, including support for a dictionary-specific curator.
```python def create_dynamic_schema(flattened_data: dict[str, Any]) -> ln.Schema: features = [] for key, value in flattened_data.items(): if key == "organism_ontology_term_id": features.append(ln.Feature(name=key, dtype=bt.Organism.ontology_id).save()) elif isinstance(value, bool): features.append(ln.Feature(name=key, dtype=bool).save()) elif isinstance(value, (int, float)): features.append(ln.Feature(name=key, dtype=float).save()) else: features.append(ln.Feature(name=key, dtype=str).save()) return ln.Schema(name="Spatial data schema", features=features, coerce=True).save() flattened = _flatten_dict(uns_dict) flattened_df = pd.DataFrame([flattened]) spatial_schema = create_dynamic_schema(flattened) curator = ln.curators.DataFrameCurator(flattened_df, spatial_schema) curator.validate() ``` ```{note} Curators for scverse data structures allow for the specification of schema slots that access and validate dataframes in nested dictionary attributes like `.attrs` or `.uns`. These schema slots use colon-separated paths like `'attrs:sample'` or `'uns:spatial:images'` to target specific dataframes for validation. ``` ================================================ FILE: docs/faq/reference-field.md ================================================ --- execute_via: python --- # Where to store external links and IDs? When registering data in LaminDB, you might want to store a reference link or ID to indicate the source of the collection. We have `reference` and `reference_type` fields for this purpose; they are available for {class}`~lamindb.Collection`, {class}`~lamindb.Transform`, {class}`~lamindb.Run` and {class}`~lamindb.Record`. ```python # !pip install lamindb !lamin init --storage testreference ``` ```python import lamindb as ln ``` Let's say we have a few donor samples that came from Vendor X. To be able to trace the orders back, we'd like to keep track of the donor IDs provided by the vendor: ```python ln.Record( name="donor 001", reference="VX984545", reference_type="Donor ID from Vendor X" ) ``` ```python !lamin delete --force testreference ``` ================================================ FILE: docs/faq/search.md ================================================ --- execute_via: python --- # How does search work? ```python from laminci.db import setup_local_test_postgres pgurl = setup_local_test_postgres() !lamin init --name benchmark_search --db {pgurl} --modules bionty --storage ./benchmark_search ``` Here we show how to perform text search on `SQLRecord` and evaluate some search queries for the {class}`bionty.CellType` ontology. ```python import lamindb as ln import bionty as bt SEARCH_QUERIES_EXACT = ( "t cell", "stem cell", "b cell", "regulatory B cell", "Be2 cell", "adipocyte", ) SEARCH_QUERIES_CONTAINS = ("t cel", "t-cel", "neural", "kidney", "kidne") TOP_N = 20 bt.CellType.import_source() ``` ```python ln.Record(name="cat[*_*]").save() ``` ## Search the registry ```python for query in SEARCH_QUERIES_EXACT: print("Query:", query) qs = bt.CellType.search(query) display(qs.to_dataframe()) assert query.lower() == qs[0].name.lower() ``` ```python for query in SEARCH_QUERIES_CONTAINS: print("Query:", query) qs = bt.CellType.search(query) display(qs.to_dataframe()) top_record = qs[0] query = query.lower() assert query in top_record.name.lower() or query in top_record.synonyms.lower() ``` Check escaping of special characters.
```python assert len(ln.Record.search("cat[")) == 1 ``` ```python assert len(ln.Record.search("*_*")) == 1 ``` ## Search the public ontology ```python ct_public = bt.CellType.public() df = ct_public.search("b cell", limit=20) assert df.iloc[0]["name"] == "B cell" df ``` ```python !docker stop pgtest && docker rm pgtest !lamin delete --force benchmark_search ``` ================================================ FILE: docs/faq/symbol-mapping.md ================================================ --- execute_via: python --- # Why should I not index datasets with gene symbols? Gene symbols are widely used for readability, particularly for visualization. However, indexing datasets with gene symbols presents challenges: - A single gene may have multiple symbols or aliases. - Gene symbols change over time (e.g., _BRCA2_ was once _FACD_) without version tracking. - The same symbol can represent different genes across species. - Symbols may be misinterpreted by software (e.g., _SEPT9_ as "September 9" in Excel). - Formatting inconsistencies exist (e.g., case sensitivity, special characters). Using unique identifiers like ENSEMBL gene IDs addresses these issues by providing: - A direct, stable mapping to genomic coordinates. - Consistency across databases. - Species-specific prefixes to prevent cross-species confusion. - Unique, permanent identifiers with standardized formatting. Storing ENSEMBL gene IDs alongside gene symbols offers readability for visualization while maintaining robust data integrity. During curation, validating against ENSEMBL gene IDs ensures accurate mapping. If only symbols are available for a dataset, you can map them to ENSEMBL IDs using {meth}`~bionty.Gene.standardize`. ```python # !pip install 'lamindb[bionty]' !lamin init --storage test-symbols --modules bionty ``` ```python import lamindb as ln import bionty as bt import numpy as np import pandas as pd import anndata as ad # create example AnnData object with gene symbols rng = np.random.default_rng(42) X = rng.integers(0, 100, size=(5, 10)) var = pd.DataFrame( index=pd.Index( [ "BRCA1", "TP53", "EGFR", "KRAS", "PTEN", "MYC", "VEGFA", "IL6", "TNF", "GAPDH", ], name="symbol", ) ) adata = ad.AnnData(X=X, var=var) adata.var ``` ```python # map Gene symbols to ENSEMBL IDs gene_mapper = bt.Gene.standardize( adata.var.index, field=bt.Gene.symbol, return_field=bt.Gene.ensembl_gene_id, return_mapper=True, organism="human", ) adata.var["ensembl_id"] = adata.var.index.map( lambda gene_id: gene_mapper.get(gene_id, gene_id) ) adata.var ``` ```python standardized_genes = bt.Gene.from_values( [ "ENSG00000141510", "ENSG00000133703", "ENSG00000111640", "ENSG00000171862", "ENSG00000204490", "ENSG00000112715", "ENSG00000146648", "ENSG00000136997", "ENSG00000012048", "ENSG00000136244", ], field=bt.Gene.ensembl_gene_id, organism="human", ) ln.save(standardized_genes) ``` This allows for validating the the `ensembl_id` against the `Gene` registry using the `bt.Gene.ensembl_gene_id` field. ```python bt.Gene.validate(adata.var["ensembl_id"], field=bt.Gene.ensembl_gene_id) ``` ```{note} Gene symbols do not map one-to-one with ENSEMBL IDs. A single gene symbol may correspond to multiple ENSEMBL IDs due to: 1. **Gene Paralogs**: Similar symbols can be shared among paralogous genes within the same species, resulting in one symbol linking to multiple ENSEMBL IDs. 2. **Pseudogenes**: Some symbols represent both functional genes and their non-functional pseudogenes, each with distinct ENSEMBL IDs. 3. 
**Transcript Variants**: One symbol may map to multiple ENSEMBL transcript IDs, each representing different isoforms or splice variants. {meth}`~bionty.Gene.standardize` retrieves the first match in cases of multiple hits, which is generally sufficient but not perfectly accurate. ``` ```python !lamin delete --force test-symbols ``` ================================================ FILE: docs/faq/test_notebooks.py ================================================ from pathlib import Path import nbproject_test as test import lamindb as ln def test_notebooks(): nbdir = Path(__file__).parent ln.setup.login("testuser1") ln.setup.init(storage=nbdir / "mydata") test.execute_notebooks(nbdir, write=True) ================================================ FILE: docs/faq/track-run-inputs.md ================================================ --- execute_via: python --- # Can I disable tracking run inputs? Yes, if you switch {attr}`~lamindb.core.Settings.track_run_inputs` to `False`. ```python # pip install lamindb !lamin init --storage test-run-inputs ``` ```python import lamindb as ln ``` Some test artifacts: ```python ln.track(transform=ln.Transform(key="Dummy pipeline")) ln.Artifact(ln.examples.datasets.file_jpg_paradisi05(), description="My image").save() ln.Artifact(ln.examples.datasets.file_mini_csv(), description="My csv").save() ``` Call `ln.track()`: ```python ln.track("Rx2s9aPTMQLY0000") ``` ## Don't track artifact as run input ```python ln.settings.track_run_inputs = False ``` ```python artifact = ln.Artifact.get(description="My image") ``` ```python artifact.cache() ``` No run inputs are linked to the current notebook run: ```python ln.Run.get(id=ln.context.run.id).input_artifacts.all() ``` ```python artifact.view_lineage() ``` ```python assert len(ln.Run.get(id=ln.context.run.id).input_artifacts.all()) == 0 ``` ## Manually track artifact as run input Let us manually track an artifact by passing `is_run_input` to either `.cache()`, `.load()` or `.open()`: ```python artifact.cache(is_run_input=True) ``` You can see the image artifact is now added to the run inputs: ```python for input in ln.Run.get(id=ln.context.run.id).input_artifacts.all(): print(input) ``` ```python artifact.view_lineage() ``` ```python assert len(ln.Run.get(id=ln.context.run.id).input_artifacts.all()) == 1 ``` ## Automatically track artifacts as run input If you switch on the following setting, any call to `.load()`, `.cache()`, or `.open()` will track the artifact as a run input. ```python ln.settings.track_run_inputs = True ``` ```python artifact = ln.Artifact.get(description="My csv") ``` ```python artifact.load() ``` ```python for input in ln.Run.get(id=ln.context.run.id).input_artifacts.all(): print(input) ``` ```python artifact.view_lineage() ``` ```python assert len(ln.Run.get(id=ln.context.run.id).input_artifacts.all()) == 2 ``` ```python !lamin delete --force test-run-inputs ``` ================================================ FILE: docs/faq/trash-archive.md ================================================ # How do I trash or archive objects? Any object in LaminDB has the following 3 levels of visibility through 3 default branches: - `main`: visible - `archive`: excluded from query & search - `trash`: excluded from query & search, scheduled for deletion Let's look at an example for an `Artifact` object while noting that the same applies to any other `SQLRecord`.
```python import lamindb as ln import pandas as pd df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) artifact = ln.Artifact.from_dataframe(df, key="dataset.parquet").save() ``` An artifact is by default created on the `main` branch. ```python assert artifact.branch.name == "main" ln.Artifact.filter(key="dataset.parquet").to_dataframe() # the artifact shows up ``` If you delete an artifact, it gets moved into the `trash` branch. ```python artifact.delete() assert artifact.branch.name == "trash" ``` Artifacts in trash won't show up in queries with default arguments: ```python ln.Artifact.filter(key="dataset.parquet").to_dataframe() # the artifact does not show up ``` You can query for them by adding the `trash` branch to the filter. ```python ln.Artifact.filter(key="dataset.parquet", branch__name="trash").to_dataframe() # the artifact shows up ``` You can restore an artifact from trash: ```python artifact.restore() ln.Artifact.filter(key="dataset.parquet").to_dataframe() # the artifact shows up ``` ================================================ FILE: docs/faq/validate-fields.md ================================================ --- execute_via: python --- # Django field validation [Django field validation](https://docs.djangoproject.com/en/5.1/ref/validators/) are enabled for models that inherit the `ValidateFields` class. ```python # pip install lamindb !lamin init --storage ./test-django-validation ``` ```python import lamindb as ln from lamindb.core.exceptions import FieldValidationError ``` ```python try: ln.Reference(name="my ref", doi="abc.ef", url="myurl.com") except FieldValidationError as e: print(e) ``` ```python !lamin delete --force test-django-validation ``` ================================================ FILE: docs/faq.md ================================================ # FAQ ```{toctree} :maxdepth: 1 faq/pydantic-pandera faq/idempotency faq/acid faq/track-run-inputs faq/curate-any faq/import-modules faq/reference-field faq/trash-archive faq/keep-artifacts-local faq/validate-fields faq/symbol-mapping faq/search ``` ================================================ FILE: docs/guide.md ================================================ # Guide ```{toctree} :hidden: :caption: "Overview" README ``` ```{toctree} :hidden: :caption: "How to" query-search track organize manage-changes manage-ontologies sync ``` ```{toctree} :hidden: :caption: Use cases lightning ``` ```{toctree} :hidden: :caption: Other topics faq storage ``` ================================================ FILE: docs/index.md ================================================ ```{include} ../README.md :start-line: 0 :end-line: 5 ``` ```{toctree} :maxdepth: 1 :hidden: guide api changelog ``` ================================================ FILE: docs/lightning.md ================================================ # Lightning This guide offers more context on the {class}`lamindb.integrations.lightning.Checkpoint` callback. For end-to-end examples, see the following guides: - {doc}`docs:clearml` - {doc}`docs:wandb` - {doc}`docs:mlflow` ## Quickstart Pass `ll.Checkpoint` and a logger into `Trainer`. The logger is what gives checkpoints meaningful, namespaced artifact keys — without it, keys fall back to a bare `checkpoints/` prefix (or just the run UID when `ln.track()` is active). Any logger implementing Lightning's `Logger` interface works (`TensorBoardLogger`, `WandbLogger`, `MLFlowLogger`, `CSVLogger`, etc.). We use `TensorBoardLogger` in the examples below. 
```python import lamindb as ln import lightning.pytorch as pl from lightning.pytorch.loggers import TensorBoardLogger from lamindb.integrations import lightning as ll ln.track() logger = TensorBoardLogger(save_dir="logs") checkpoint = ll.Checkpoint(monitor="val_loss", mode="min", save_top_k=3) trainer = pl.Trainer( max_epochs=10, callbacks=[checkpoint], logger=logger, ) trainer.fit(model, datamodule=datamodule) ``` After training, each saved checkpoint file is a LaminDB artifact: ```python checkpoint.last_checkpoint_artifact checkpoint.last_checkpoint_artifact.key # e.g. "logs/lightning_logs/2r5pIRnK7z0q/checkpoints/epoch=0-step=100.ckpt" checkpoint.checkpoint_key_prefix # e.g. "logs/lightning_logs/2r5pIRnK7z0q/checkpoints" ``` ### How is a run organized? A Lightning `Trainer` coordinates three concerns during training: 1. **Logger** — writes metrics (loss curves, learning rate, etc.) to a dashboard directory. The logger determines the local directory layout: `{save_dir}/{name}/{version}/`. 2. **ModelCheckpoint** — saves model snapshots (`.ckpt` files) into a `checkpoints/` subdirectory underneath the logger's directory. 3. **SaveConfigCallback** — when using `LightningCLI`, writes the fully resolved `config.yaml` into the logger's directory so you can reproduce exactly which hyperparameters were used. All three share the same directory tree. The logger creates it, the checkpoint callback writes into it, and the config callback stores beside it: ``` logs/ # logger save_dir lightning_logs/ # logger name version_0/ # logger version (local filesystem) events.out.tfevents.* # ← logger output (TensorBoard) config.yaml # ← SaveConfigCallback checkpoints/ epoch=0-step=100.ckpt # ← ModelCheckpoint epoch=1-step=200.ckpt hparams.yaml # ← auto-generated by Lightning ``` LaminDB's integration replaces `ModelCheckpoint` with `ll.Checkpoint` and Lightning's `SaveConfigCallback` with `ll.SaveConfigCallback`. Checkpoint files, the config, and `hparams.yaml` become `lamindb.Artifact` records with lineage tracking and optional feature annotations. Note that artifact keys in LaminDB do **not** mirror the local directory layout exactly — the callback uses the LaminDB run UID instead of Lightning's auto-incrementing `version_N` directory by default. See [How artifact keys are derived](#how-artifact-keys-are-derived) for details. ### Which kind of artifacts? `Checkpoint` saves three kinds of artifacts: | Kind | Example key | When | | ------------ | ------------------------------------- | ---------------------------------------- | | `checkpoint` | `…/checkpoints/epoch=0-step=100.ckpt` | Every time Lightning writes a checkpoint | | `config` | `…/config.yaml` | When using `ll.SaveConfigCallback` | | `hparams` | `…/checkpoints/hparams.yaml` | When Lightning generates it | Checkpoints and `hparams.yaml` live under the `checkpoints/` subdirectory, while the config sits directly under the base prefix. The callback tracks the latest artifact of each kind: ```python checkpoint.last_checkpoint_artifact checkpoint.last_config_artifact checkpoint.last_hparams_artifact checkpoint.last_artifact_event ``` ### How is data lineage tracked? When a run is being tracked with `ln.track()`: - `checkpoint` artifacts are recorded as **run outputs** — they are produced by the training run. - `config` artifacts are recorded as **run inputs** — the resolved config is part of the run specification. - `hparams.yaml` is saved as an artifact but not linked as a run input. ## How are artifact keys derived? 
LaminDB artifact keys are **not** necessarily a mirror of the local filesystem layout. Lightning uses auto-incrementing version directories (`version_0`, `version_1`, …) on disk, but these are meaningless as artifact identifiers — they depend on what already exists locally and cannot reliably distinguish runs across machines. Instead, when `ln.track()` is active, the callback uses the **LaminDB run UID** as the version segment by default (`run_uid_is_version=True`). This guarantees that every tracked run produces unique artifact keys regardless of local state. The base prefix is determined by priority: | Scenario | Base prefix | | ------------------------ | -------------------------------------- | | `dirpath` set (± logger) | `{dirpath}/{run_uid}` | | No `dirpath` + logger | `{save_dir_basename}/{name}/{run_uid}` | | No `dirpath` + no logger | `{run_uid}` | `run_uid` above refers to the active LaminDB run UID (from `ln.context.run.uid`). When no run is tracked or `run_uid_is_version=False`, the callback falls back to the logger's own version (e.g. `version_0`) or omits the segment entirely. **Checkpoint & hparams keys:** | Scenario | LaminDB key pattern | | ----------------------------- | ------------------------------------------------------------- | | Logger present (recommended) | `{save_dir_basename}/{name}/{run_uid}/checkpoints/{filename}` | | No logger, explicit `dirpath` | `{dirpath}/{run_uid}/checkpoints/{filename}` | | No logger, no `dirpath` | `{run_uid}/checkpoints/{filename}` | **Config keys:** | Scenario | Key pattern | | ----------------------------- | -------------------------------------------------- | | Logger present | `{save_dir_basename}/{name}/{run_uid}/config.yaml` | | No logger, explicit `dirpath` | `{dirpath}/{run_uid}/config.yaml` | | No logger, no `dirpath` | `{run_uid}/config.yaml` | For example, with `TensorBoardLogger(save_dir="logs")` and a tracked run: ``` logs/lightning_logs/2r5pIRnK7z0q/ # base prefix ({save_dir_basename}/{name}/{run_uid}) config.yaml # ← config artifact checkpoints/ epoch=0-step=100.ckpt # ← checkpoint artifact hparams.yaml # ← hparams artifact ``` ### Opting out of run UID keys Pass `run_uid_is_version=False` to fall back to the logger-managed version directory, matching Lightning's local layout more closely: ```python checkpoint = ll.Checkpoint( monitor="val_loss", run_uid_is_version=False, ) ``` With this setting, the key uses the logger's version (`version_0`, etc.) instead of the run UID. This is mainly useful when you don't call `ln.track()` or when you want artifact keys that exactly mirror the local directory tree. ### Why run UIDs instead of `version_N`? Lightning's auto-incrementing `version_N` depends on what directories already exist at `save_dir`. Two runs on different machines — or the same machine after clearing `logs/` — can both produce `version_0`. With `run_uid_is_version=True` (the default), each tracked run gets a unique prefix derived from the Lamin run, so artifact keys never collide. ## Use with the Lightning CLI The Lightning CLI resolves a YAML config into concrete model and data module arguments. To also store that resolved config as a LaminDB artifact, pass `ll.SaveConfigCallback` in your training script and declare the trainer, logger, callbacks, model, and data in a config file. 
**`config.yaml`** ```yaml trainer: max_epochs: 10 logger: class_path: lightning.pytorch.loggers.TensorBoardLogger init_args: save_dir: logs callbacks: - class_path: lamindb.integrations.lightning.Checkpoint init_args: monitor: val/loss mode: min save_top_k: 3 model: learning_rate: 1.0e-3 data: batch_size: 64 ``` **`train.py`** ```python import lamindb as ln from lightning.pytorch.cli import LightningCLI from lamindb.integrations.lightning import SaveConfigCallback ln.track() def cli_main() -> None: LightningCLI( model_class=MyModel, datamodule_class=MyDataModule, save_config_callback=SaveConfigCallback, ) if __name__ == "__main__": cli_main() ``` ```bash python train.py fit --config config.yaml ``` `ll.SaveConfigCallback` extends Lightning's built-in version: it writes the local file as usual and then delegates to whichever `ArtifactPublishingModelCheckpoint` is registered on the trainer to persist the config as an artifact. ## Annotating with features Attach custom run-level and artifact-level feature values through `features=`: ```python logger = TensorBoardLogger(save_dir="logs") checkpoint = ll.Checkpoint( monitor="val_loss", features={ "run": {"training_framework": "lightning"}, "artifact": {"dataset_version": "2026-03"}, }, ) trainer = pl.Trainer(callbacks=[checkpoint], logger=logger) ``` Feature names must already exist in Lamin. The callback can also auto-track standard Lightning fields. Create the corresponding LaminDB features once: ```python ll.save_lightning_features() ``` This enables auto-features: - Artifact-level: `is_best_model`, `is_last_model`, `score`, `model_rank`, `save_weights_only`, `monitor`, `mode` - Run-level: `logger_name`, `logger_version`, `max_epochs`, `max_steps`, `precision`, `accumulate_grad_batches`, `gradient_clip_val`, `monitor`, `mode` ## Extending the callback ### Subclass `Checkpoint` Subclass when you want to keep LaminDB persistence and additionally notify an external system after each artifact is saved: ```python from lamindb.integrations import lightning as ll from my_model_registry import ModelRegistry class ModelRegistryCheckpoint(ll.Checkpoint): """Register each checkpoint in an external model registry.""" def __init__(self, *args, registry_project: str, **kwargs): super().__init__(*args, **kwargs) self.registry_project = registry_project self.model_registry = ModelRegistry() def on_artifact_saved(self, event: ll.ArtifactSavedEvent) -> None: if event.kind == "checkpoint": # register the model in your external system self.model_registry.register( project=self.registry_project, model_uri=event.storage_uri, metadata={"lamin_key": event.key}, ) logger = TensorBoardLogger(save_dir="logs") checkpoint = ModelRegistryCheckpoint( registry_project="my-project", monitor="val_loss", save_top_k=3, ) trainer = pl.Trainer(callbacks=[checkpoint], logger=logger) trainer.fit(model, datamodule=datamodule) ``` Each event gives you: - `event.kind`: `"checkpoint"`, `"config"`, or `"hparams"` - `event.artifact`: the persisted LaminDB artifact - `event.key`: the LaminDB artifact key - `event.local_path`: the local file path Lightning wrote - `event.storage_uri`: the stable storage URI for downstream systems ### Attach an observer Observers are useful when you want composition instead of inheritance: ```python from lamindb.integrations import lightning as ll class ArtifactLogger: def on_artifact_saved(self, event: ll.ArtifactSavedEvent) -> None: print(event.kind, event.storage_uri) def on_artifact_removed(self, event: ll.ArtifactRemovedEvent) -> None: print("removed", 
event.key) logger = TensorBoardLogger(save_dir="logs") checkpoint = ll.Checkpoint( monitor="val_loss", artifact_observers=[ArtifactLogger()], ) trainer = pl.Trainer(callbacks=[checkpoint], logger=logger) trainer.fit(model, datamodule=datamodule) ``` Observers receive the same events that subclasses see. ## Integrating other systems To register checkpoints in another system (e.g. ClearML, Weights & Biases, MLflow, Neptune, or Comet), use the artifact lifecycle events rather than re-deriving paths from Lightning internals. The key hand-off value is `event.storage_uri`, which resolves to the persisted artifact location. `event.artifact` gives you the full LaminDB record when you need metadata beyond the URI. ================================================ FILE: docs/manage-changes.md ================================================ # Manage changes Managing changes in LaminDB is largely analogous to managing code changes via branching in git and Pull Requests in GitHub. For usage examples, read the `Examples` section of the {class}`~lamindb.Branch` class. ================================================ FILE: docs/manage-ontologies.md ================================================ --- execute_via: python --- # Manage biological ontologies This guide shows how to manage ontologies for basic biological entities. ```{raw} html ``` If instead you're interested in - accessing public ontologies, see {doc}`docs:public-ontologies` - flexible bio registries for the wetlab (a LIMS), see {class}`~lamindb.Record` and {doc}`docs:records` ```python # pip install lamindb !lamin init --storage ./test-ontologies --modules bionty ``` ## Import records from public ontologies Let's first populate our {class}`~bionty.CellType` registry with the default public ontology (Cell Ontology). ```python import lamindb as ln import bionty as bt # inspect the available public ontology versions bt.Source.to_dataframe() ``` ```python # inspect which ontology version we're about to import bt.Source.get(entity="bionty.CellType", currently_used=True) ``` ```python # populate the database with a public ontology bt.CellType.import_source() ``` This is now your in-house cell type ontology in which you can add & modify records as you like. It's a registry just like `Artifact` or `Record`. ```python # all public cell types are now available in LaminDB bt.CellType.to_dataframe() ``` ```python # let's also populate the Gene registry with human and mouse genes bt.Gene.import_source(organism="human") bt.Gene.import_source(organism="mouse") ``` ## Access records in in-house registries Search key words: ```python bt.CellType.search("gamma-delta T").to_dataframe().head(2) ``` Or look up with auto-complete: ```python cell_types = bt.CellType.lookup() hsc_record = cell_types.hematopoietic_stem_cell hsc_record ``` Filter by fields and relationships: ```python gdt_cell = bt.CellType.get(ontology_id="CL:0000798", created_by__handle="testuser1") gdt_cell ``` View the ontological hierarchy: ```python gdt_cell.view_parents() # pass with_children=True to also view children ``` Or access the parents and children directly: ```python gdt_cell.parents.to_dataframe() ``` ```python gdt_cell.children.to_dataframe() ``` It is also possible to recursively query parents or children, getting direct parents (children), their parents, and so forth. 
```python gdt_cell.query_parents().to_dataframe() ``` ```python gdt_cell.query_children().to_dataframe() ``` ## Construct custom hierarchies of records You can register a new record and add it as a child of an existing parent record: ```python # register a new cell type my_celltype = bt.CellType(name="my new T-cell subtype").save() # specify "gamma-delta T cell" as a parent my_celltype.parents.add(gdt_cell) # visualize hierarchy my_celltype.view_parents(distance=3) ``` ## Create new records When accessing datasets, one often encounters bulk references to entities that might be corrupted or standardized according to different standardization schemes. Let's consider an example based on an `AnnData` object. In its `cell_type` annotations, we find 4 references to cell types: ```python adata = ln.examples.datasets.anndata_with_obs() adata.obs.cell_type.value_counts() ``` We'd like to load the corresponding records in our in-house registry to annotate a dataset. To this end, you'll typically use {class}`~lamindb.models.CanCurate.from_values`, which will both validate & retrieve records that match the values. ```python cell_types = bt.CellType.from_values(adata.obs.cell_type) cell_types ``` Logging informed us that 3 cell types were validated. Since we loaded these records at the same time, we could readily use them to annotate a dataset. :::{dropdown} What happened under-the-hood? `.from_values()` performs the following lookups: 1. If registry records match the values, load these records 2. If values match synonyms of registry records, load these records 3. If no record in the registry matches, attempt to load records from a public ontology 4. Same as 3. but based on synonyms No records will be returned if all 4 lookups are unsuccessful. Sometimes, it's useful to treat validated records differently from non-validated records. Here is a way: ``` original_values = ["gut", "gut2"] inspector = bt.Tissue.inspect(original_values) records_from_validated_values = bt.Tissue.from_values(inspector.validated) ``` ::: Alternatively, we can retrieve records based on ontology ids: ```python adata.obs.cell_type_id.unique().tolist() ``` ```python bt.CellType.from_values(adata.obs.cell_type_id, field=bt.CellType.ontology_id) ``` ## Validate & standardize Simple validation of an iterable of values works like so: ```python bt.CellType.validate(["fat cell", "blood forming stem cell"]) ``` Because these values don't comply with the registry, they're not validated!
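If you need to handle the failing values programmatically, note that `validate()` returns a boolean mask aligned with the input values. Here is a minimal sketch (the variable names are just illustrative):

```python
values = ["fat cell", "blood forming stem cell"]
validated_mask = bt.CellType.validate(values)  # boolean array, one entry per input value
non_validated = [value for value, ok in zip(values, validated_mask) if not ok]
non_validated  # the values that still need to be standardized or registered
```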
You can easily convert these values to validated standardized names based on synonyms like so: ```python bt.CellType.standardize(["fat cell", "blood forming stem cell"]) ``` Alternatively, you can use `.from_values()`, which will only ever return validated records and automatically standardize under-the-hood: ```python bt.CellType.from_values(["fat cell", "blood forming stem cell"]) ``` If you are not sure what to do, use `.inspect()` to get instructions: ```python bt.CellType.inspect(["fat cell", "blood forming stem cell"]); ``` We can also add new synonyms to a record: ```python hsc_record.add_synonym("HSC") ``` And when we encounter this synonym as a value, it will now be standardized via synonym lookup and mapped onto the correct registry record: ```python bt.CellType.standardize(["HSC"]) ``` A special synonym is `.abbr` (short for abbreviation), which has its own field and can be assigned via: ```python hsc_record.set_abbr("HSC") ``` You can create a lookup object from the `.abbr` field: ```python cell_types = bt.CellType.lookup("abbr") cell_types.hsc ``` The same workflow works for all of `bionty`'s registries. ## Manage ontologies across organisms Several registries are organism-aware (they have an `.organism` field), for instance, {class}`~bionty.Gene`. In this case, API calls that interact with multi-organism registries require an `organism` argument when there's ambiguity. For instance, when validating gene symbols: ```python bt.Gene.validate(["TCF7", "ABC1"], organism="human") ``` In contrast, working with Ensembl Gene IDs doesn't require passing `organism`, as there's no ambiguity: ```python bt.Gene.validate( ["ENSG00000000419", "ENSMUSG00002076988"], field=bt.Gene.ensembl_gene_id ) ``` When working with the same organism throughout your analysis/workflow, you can omit the `organism` argument by configuring it globally: ```python bt.settings.organism = "mouse" bt.Gene.from_source(symbol="Ap5b1") ``` ## Track ontology versions Under-the-hood, source ontology versions are automatically tracked for each registry: ```python bt.Source.filter(currently_used=True).to_dataframe() ``` Each record is linked to a versioned public source (if it was created from public): ```python hepatocyte = bt.CellType.get(name="hepatocyte") hepatocyte.source ``` ## Create records from a specific ontology version By default, new records are imported or created from the `"currently_used"` public sources which are configured during instance initialization, e.g.: ```python bt.Source.filter(entity="bionty.Phenotype", currently_used=True).to_dataframe() ``` Sometimes, the default source doesn't contain the ontology term you are looking for. You can then create a record from a non-default source.
For instance, we can use the `ncbitaxon` ontology: ```python source = bt.Source.get(entity="bionty.Organism", name="ncbitaxon") source ``` ```python # validate against the NCBI Taxonomy bt.Organism.validate( ["iris setosa", "iris versicolor", "iris virginica"], source=source ) ``` ```python # since we didn't seed the Organism registry with the NCBITaxon public ontology # we need to save the records to the database records = bt.Organism.from_values( ["iris setosa", "iris versicolor", "iris virginica"], source=source ).save() # now we can query a iris organism and view its parents and children bt.Organism.get(name="iris").view_parents(with_children=True) ``` ## Access any Ensembl genes Genes from all Ensembl versions and organisms can be accessed, even though they are not yet present in the `bt.Source` registry. For instance, if you want to use `rabbit` genes from Ensembl version `release-103`: ```python # pip install pymysql import bionty as bt # automatically download genes for a new organism gene_ontology = bt.base.Gene(source="ensembl", organism="rabbit", version='release-103') # register the new source in lamindb gene_ontology.register_source_in_lamindb() # now you can start using this source # import all genes from this source to your Gene registry source = bt.Source.get(entity="bionty.Gene", name="ensembl", organism="rabbit", version="release-103") bt.Gene.import_source(source=source) ``` ================================================ FILE: docs/organize.md ================================================ # Organize datasets ```{toctree} :maxdepth: 1 :hidden: curate ``` This guide walks through organizing datasets using files & folders, database relationships, and versioned collections. ## Via files & folders You can use LaminDB like a file system. Similar to AWS S3, you organize artifacts into virtual folders using `/`-separated keys. To ingest a single file into a `project1/` folder, you'd call: ```python artifact1 = ln.Artifact("./dataset.csv", key="project1/dataset1.csv").save() ``` For convenience, if you want to create an artifact for every file in a directory, use {meth}`~lamindb.Artifact.from_dir`: ```python artifacts = ln.Artifact.from_dir("./project1/").save() ``` You can then query for all artifacts in the `"./project1/"` folder via: ```python artifacts = ln.Artifact.filter(key__startswith="project1/") ``` Unlike a regular file system, every artifact is versioned and comes with rich metadata. :::{dropdown} What if I do not care about the metadata and version of every file in a folder? In some cases a folder _is_ the dataset and you don't need fine-grained information for every file. In this scenario, save the entire directory as a single artifact: ```python ln.Artifact("./folder_abc", key="folder_abc").save() ``` ::: ## Via relationships in the database ### Annotating with projects What if an artifact is relevant to multiple projects? A dataset that's in the `project1/` folder cannot **also** reside in a `project2/` folder. 
You can solve this problem with the `artifact.projects` relationship that links the {class}`~lamindb.Project` to {class}`~lamindb.Artifact`. Here is how to annotate one artifact with two projects: ```python project1 = ln.Project(name="Project 1").save() # create project 1 project2 = ln.Project(name="Project 2").save() # create project 2 artifact1.projects.add(project1, project2) # annotate artifact1 ``` This allows you to retrieve `artifact1` by querying any project it belongs to: ```python artifacts_in_project1 = ln.Artifact.filter(projects=project1) artifacts_in_project2 = ln.Artifact.filter(projects=project2) ``` Here, `artifact1` is part of both query results. :::{dropdown} Three additional advantages to using related registries rather than folder structures. 1. Projects can be richly annotated (e.g., with start/end dates, parent projects, or member roles). 2. You no longer need to rely on fragile file paths. If a folder is renamed, path-based retrieval breaks, but a project query by `uid` will always work.[^protectproject] 3. You can run a constrained query or search against all projects in your database rather than trying to narrow a search to folder names. ::: ### Annotating with labels You can annotate with other entity types, not just projects. LaminDB offers two main classes for this: {class}`~lamindb.Record` for metadata records and {class}`~lamindb.ULabel` for simple labels, which both link to artifacts. Here is how to annotate with a ulabel and with a sample record: ```python ulabel1 = ln.ULabel(name="raw_data").save() # create a ulabel artifact1.ulabels.add(ulabel1) # annotate artifact1 sample_type = ln.Record( # create a record type "Samples" name="Samples", is_type=True ).save() record1 = ln.Record( # create a sample record of type "Samples" name="My sample", type=sample_type, features={"gc_content": 0.5} ).save() artifact1.records.add(record1) # annotate artifact1 ``` You can use records and ulabels alongside entity types in modules such as {mod}`bionty`: ```python import bionty as bt cell_type1 = bt.CellType.from_source( name="T cell" # create a cell type from a public ontology ).save() artifact1.cell_types.add(cell_type1) # annotate artifact1 ``` ### Annotating with features To annotate with non-categorical data types or to disambiguate categorical annotations, use {class}`~lamindb.Feature` objects. Here is how to define features and annotate an artifact with feature values: ```python exp_type = ln.Record.get(name="Experiments") # query the entity type `Experiments` ln.Feature(name="gc_content", dtype=float).save() # define a feature with dtype float ln.Feature(name="experiment", dtype=exp_type).save() # define a feature with dtype `Experiments` artifact.features.set_values({ "gc_content": 0.55, # validated to be a float "experiment": "Experiment 1", # validated to exist under the `Experiments` record type }) ``` When you work with structured data formats like `DataFrame` or `AnnData`, it often makes sense to validate the content of their features. After validation, the parsed feature values are automatically used for annotation. The easiest way to use validation and auto-annotation is the built-in schema `"valid_features"`: ```python # validate columns in the dataframe and map them onto features # auto-annotate with parsed metadata ln.Artifact.from_dataframe(df, schema="valid_features").save() ``` Below is an example from the {doc}`docs:tutorial` illustrating how you get e.g. cell type, treatment, and assay annotations based on a dataframe's content.
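As a minimal, self-contained sketch of this pattern (using the mini-immuno example dataset that ships with `lamindb`, analogous to `docs/scripts/curate_dataframe_flexible.py` in this repository):

```python
import lamindb as ln

# register the features & labels that the mini-immuno example dataset uses
ln.examples.datasets.mini_immuno.define_features_labels()
df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame")

# validate the dataframe columns against registered features and
# auto-annotate the artifact with the parsed values
artifact = ln.Artifact.from_dataframe(
    df, key="examples/dataset1.parquet", schema="valid_features"
).save()
artifact.describe()  # displays the features & labels parsed from the dataframe
```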
You can read more on this in {doc}`/curate`. ### Annotating with data-lineage When you call {func}`~lamindb.track` or decorate a function with {func}`~lamindb.flow`, you automatically annotate artifacts with {class}`~lamindb.Run` and {class}`~lamindb.Transform` objects. Here is how: ```{eval-rst} .. literalinclude:: scripts/run_track_and_finish.py :language: python ``` Note that you can pass `project` to {func}`~lamindb.track` to auto-annotate all objects that are created in a run with a project label. Read more in {doc}`/track`. ### Overview of auto-generated annotations The {class}`~lamindb.Artifact` registry has simple fields (such as `description`, `created_at`, `size`) and related fields (such as `projects`, `created_by`, `storage`). Many of these fields are automatically populated and you can use them to retrieve sets of artifacts. All other registries link to {class}`~lamindb.Artifact` to provide context for finding, querying, validating, and managing artifacts.[^starsnowflake] :::{dropdown} Can you give me some example queries? Here are examples leveraging auto-populated fields. ```python artifacts = ln.Artifact.filter( created_at__gt="2023-06-24", # created after June 24th, 2023 size__lt=1e9, # smaller than 1GB suffix=".parquet", # with a .parquet suffix n_observations__gt=1000, # with more than 1000 observations n_files__gt=1000, # folder-like artifacts with more than 1000 files otype="DataFrame", # that are DataFrames created_on__name="my-branch", # created on a specific branch or environment created_by__handle="falexwolf", # created by user with handle falexwolf run=run, # created by a specific run transform__name="my-script.py", # created by a specific script/notebook ) ``` ::: ## Versioned collections of artifacts If you want to group artifacts by metadata and version the entire set, use {class}`~lamindb.Collection`. Unlike during annotation, you have to pass an entire group of artifacts to a `Collection` constructor: ```python collection = ln.Collection([artifact1, artifact2], key="my_data_release").save() ``` And unlike the folder-based or annotation-based sets of artifacts — which can change as artifacts are added or removed — a collection guarantees an exact, immutable set of artifacts. Artifacts are versioned based on the hash of their content. Collections are versioned based on the top-level hash of their artifact hashes. If you use the {meth}`~lamindb.Collection.append` method, a new version of the collection is created, and the old version is left unchanged: ```python collection_v2 = collection.append(artifact3) ``` While collections are indirectly annotated through the annotations of the artifacts they contain, you can also add collection-level annotations. Like artifacts, collections link to projects, runs, ulabels, records, and most other registries. [^starsnowflake]: You can consider the SQL table underlying {class}`~lamindb.Artifact` your _fact table_ and all other tables for other entities your _dimension tables_ in a star or snowflake schema ([see Wikipedia](https://en.wikipedia.org/wiki/Fact_table)). [^protectproject]: The project annotation of the artifact is protected against the deletion of the project. If a user with the necessary rights attempts to delete the project, they will get an error. ================================================ FILE: docs/pertdb.md ================================================ # `pertdb` ```{eval-rst} ..
automodule:: pertdb ``` ================================================ FILE: docs/query-search.md ================================================ # Query, search & stream ```{toctree} :maxdepth: 1 registries arrays ``` ================================================ FILE: docs/registries.md ================================================ --- execute_via: python --- # Query & search registries This guide walks through different ways of querying & searching LaminDB registries. To understand the underlying cross-linking of objects in the SQL database, read {doc}`organize`. If you already have a set of artifacts, e.g. in the form of parquet files, and you'd like to now query/stream their (validated) content, read {doc}`arrays`. ```python # initialize a test database to run examples !lamin init --storage ./test-registries --modules bionty ``` Let's start by creating a few exemplary datasets: ```python import lamindb as ln ln.Artifact(ln.examples.datasets.file_fastq(), key="raw/my_fastq.fastq.gz").save() ln.Artifact(ln.examples.datasets.file_jpg_paradisi05(), key="my_image.jpg").save() ln.Artifact.from_dataframe(ln.examples.datasets.df_iris(), key="iris.parquet").save() ln.examples.datasets.mini_immuno.save_mini_immuno_datasets() ``` ## Get an overview The easiest way to get an overview over all artifacts is by typing {meth}`~lamindb.Artifact.to_dataframe`, which returns the 100 latest artifacts in the {class}`~lamindb.Artifact` registry. ```python ln.Artifact.to_dataframe() ``` You can include features. ```python ln.Artifact.to_dataframe(include="features") ``` You can include fields from other registries. ```python ln.Artifact.to_dataframe( include=[ "created_by__name", "records__name", "cell_types__name", "schemas__itype", ] ) ``` You can also get an overview of the entire database. ```python ln.view() ``` ## Auto-complete objects For registries with less than 100k objects, auto-completing a `Lookup` object is the most convenient way of finding a record. ```python records = ln.Record.lookup() ``` With auto-complete, we find a record: ```python experiment_1 = records.experiment_1 experiment_1 ``` This works for any {class}`~lamindb.models.BaseSQLRecord` class, e.g., also for plugin `bionty`. ```python import bionty as bt cell_types = bt.CellType.lookup() ``` ## Get one object {meth}`~lamindb.models.BaseSQLRecord.get` errors if none or more than one matching objects are found. ```python ln.Record.get(experiment_1.uid) # by uid ln.Record.get(name="Experiment 1") # by field ``` ## Query objects by fields Use {meth}`~lamindb.models.BaseSQLRecord.filter` to query all artifacts by the `suffix` field: ```python qs = ln.Artifact.filter(suffix=".h5ad") qs ``` This returns a {class}`~lamindb.models.QuerySet`, which lazily references the set of {class}`~lamindb.models.BaseSQLRecord` objects that matches the filter statement. You can iteratively filter a queryset: ```python qs = qs.filter(records__name="Experiment 1") ``` To access the results encoded in a queryset, call: - {meth}`~lamindb.models.BasicQuerySet.to_dataframe`: A pandas `DataFrame` with each record in a row. - {meth}`~lamindb.models.BasicQuerySet.one`: Exactly one record. Will raise an error if there is none. Is equivalent to the `.get()` method shown above. - {meth}`~lamindb.models.BasicQuerySet.one_or_none`: Either one record or `None` if there is no query result. 
Alternatively, - use the `QuerySet` as an iterator - get individual objects via `qs[0]`, `qs[1]` For example: ```python qs.to_dataframe() ``` Note that the `SQLRecord` classes in LaminDB are Django models and any [Django query](https://docs.djangoproject.com/en/stable/topics/db/queries/) works. ## Query objects by features The `Artifact`, `Record`, and `Run` registries can be queried by features. ```python ln.Artifact.filter(perturbation="DMSO").to_dataframe(include="features") ``` You can also query by passing a `Feature` object, which is useful to disambiguate feature names. ```python perturbation = ln.Feature.get(name="perturbation") # can optionally pass a feature type to disambiguate ln.Artifact.filter(perturbation == "DMSO") # note this is now an expression using the == syntax ``` Just like for fields holding dictionary values, you can query for dictionary keys in features whose `dtype` is `dict`: ```python ln.Artifact.filter(study_metadata__detail1="123").to_dataframe(include="features") ``` ```python ln.Artifact.filter(study_metadata__detail2=2).to_dataframe(include="features") ``` You can query for whether a dataset is annotated or not annotated by a feature: ```python ln.Artifact.filter(perturbation__isnull=True).to_dataframe(include="features") ``` ```python ln.Artifact.filter(perturbation__isnull=False).to_dataframe(include="features") ``` ## Query runs by parameters Here is an example for querying by parameters: {ref}`track-run-parameters`. ## Search for objects You can search every registry via {meth}`~lamindb.models.SQLRecord.search`. For example, the `Artifact` registry: ```python ln.Artifact.search("iris").to_dataframe() ``` Here is more background on search and examples for searching the entire cell type ontology: {doc}`/faq/search` ## Query related registries Django has a double-underscore syntax to filter based on related tables. This syntax enables you to traverse several layers of relations and leverage different comparators. ```python ln.Artifact.filter(created_by__handle__startswith="testuse").to_dataframe() ``` The filter selects all artifacts based on the users who ran the generating notebook. Under the hood, in the SQL database, it's joining the artifact table with the user table. Another typical example is querying all datasets that measure a particular feature, for instance `"CD8A"`. Here is how to do it: ```python cd8a = bt.Gene.get(symbol="CD8A") # query for all feature sets that contain CD8A schemas_with_cd8a = ln.Schema.filter(genes=cd8a) # get all artifacts ln.Artifact.filter(schemas__in=schemas_with_cd8a).to_dataframe() ``` Instead of splitting this across three queries, the double-underscore syntax allows you to define a path for one query. ```python ln.Artifact.filter(schemas__genes__symbol="CD8A").to_dataframe() ``` ## Filter operators You can qualify the type of comparison in a query by using a comparator. Below follows a list of the most important ones, but Django supports about [two dozen field comparators](https://docs.djangoproject.com/en/stable/ref/models/querysets/#field-lookups) of the form `field__comparator=value`. ### and ```python ln.Artifact.filter(suffix=".h5ad", records=experiment_1).to_dataframe() ``` ### less than/ greater than Subset to artifacts greater than 10kB. Here, plain keyword equality isn't enough; we qualify the field with the `__gt` comparator.
```python ln.Artifact.filter(records=experiment_1, size__gt=1e4).to_dataframe() ``` ### in ```python ln.Artifact.filter(suffix__in=[".jpg", ".fastq.gz"]).to_dataframe() ``` ### order by ```python ln.Artifact.filter().order_by("created_at").to_dataframe() ``` ```python # reverse ordering ln.Artifact.filter().order_by("-created_at").to_dataframe() ``` ```python ln.Artifact.filter().order_by("key").to_dataframe() ``` ```python # reverse ordering ln.Artifact.filter().order_by("-key").to_dataframe() ``` ### contains ```python ln.Transform.filter(description__contains="search").to_dataframe().head(5) ``` And case-insensitive: ```python ln.Transform.filter(description__icontains="Search").to_dataframe().head(5) ``` ### startswith ```python ln.Transform.filter(description__startswith="Query").to_dataframe() ``` ### or ```python ln.Artifact.filter(ln.Q(suffix=".jpg") | ln.Q(suffix=".fastq.gz")).to_dataframe() ``` ### negate/ unequal ```python ln.Artifact.filter(~ln.Q(suffix=".jpg")).to_dataframe() ``` ================================================ FILE: docs/scripts/curate_anndata_flexible.py ================================================ import lamindb as ln ln.examples.datasets.mini_immuno.define_features_labels() adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") artifact = ln.Artifact.from_anndata( adata, key="examples/mini_immuno.h5ad", schema="ensembl_gene_ids_and_valid_features_in_obs", ).save() artifact.describe() ================================================ FILE: docs/scripts/curate_anndata_uns.py ================================================ import lamindb as ln ln.examples.datasets.mini_immuno.define_features_labels() adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") schema = ln.Schema.get(name="Study metadata schema") artifact = ln.Artifact.from_anndata( adata, schema=schema, key="examples/mini_immuno_uns.h5ad" ) artifact.describe() ================================================ FILE: docs/scripts/curate_dataframe_attrs.py ================================================ import lamindb as ln from .define_schema_df_metadata import study_metadata_schema df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame") schema = ln.Schema( features=[ln.Feature(name="perturbation", dtype="str").save()], slots={"attrs": study_metadata_schema}, otype="DataFrame", ).save() curator = ln.curators.DataFrameCurator(df, schema=schema) curator.validate() artifact = curator.save_artifact(key="examples/df_with_attrs.parquet") artifact.describe() ================================================ FILE: docs/scripts/curate_dataframe_external_features.py ================================================ import lamindb as ln from datetime import date df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame") temperature = ln.Feature(name="temperature", dtype=float).save() date_of_study = ln.Feature(name="date_of_study", dtype=date).save() external_schema = ln.Schema(features=[temperature, date_of_study]).save() concentration = ln.Feature(name="concentration", dtype=str).save() donor = ln.Feature(name="donor", dtype=str, nullable=True).save() schema = ln.Schema( features=[concentration, donor], slots={"__external__": external_schema}, otype="DataFrame", ).save() artifact = ln.Artifact.from_dataframe( df, key="examples/dataset1.parquet", features={"temperature": 21.6, "date_of_study": date(2024, 10, 1)}, schema=schema, ).save() artifact.describe() ================================================ FILE: docs/scripts/curate_dataframe_flexible.py 
================================================ import lamindb as ln ln.examples.datasets.mini_immuno.define_features_labels() df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame") artifact = ln.Artifact.from_dataframe( df, key="examples/dataset1.parquet", schema="valid_features" ).save() artifact.describe() ================================================ FILE: docs/scripts/curate_dataframe_minimal_errors.py ================================================ import lamindb as ln schema = ln.examples.datasets.mini_immuno.define_mini_immuno_schema_flexible() df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame") df.pop("donor") # remove donor column to trigger validation error try: artifact = ln.Artifact.from_dataframe( df, key="examples/dataset1.parquet", schema=schema ).save() except ln.errors.ValidationError as error: print(error) ================================================ FILE: docs/scripts/curate_dataframe_union_features.py ================================================ import lamindb as ln import pandas as pd union_feature = ln.Feature( name="mixed_feature", dtype="cat[bionty.Tissue.ontology_id|bionty.CellType.ontology_id]", ).save() df_mixed = pd.DataFrame({"mixed_feature": ["UBERON:0000178", "CL:0000540"]}) schema = ln.Schema(features=[union_feature], coerce=True).save() curator = ln.curators.DataFrameCurator(df_mixed, schema) curator.validate() ================================================ FILE: docs/scripts/curate_mudata.py ================================================ import lamindb as ln import bionty as bt from docs.scripts.define_schema_df_metadata import study_metadata_schema # define labels perturbation = ln.Record(name="Perturbation", is_type=True).save() ln.Record(name="Perturbed", type=perturbation).save() ln.Record(name="NT", type=perturbation).save() replicate = ln.Record(name="Replicate", is_type=True).save() ln.Record(name="rep1", type=replicate).save() ln.Record(name="rep2", type=replicate).save() ln.Record(name="rep3", type=replicate).save() # define the global obs schema obs_schema = ln.Schema( name="mudata_papalexi21_subset_obs_schema", features=[ ln.Feature(name="perturbation", dtype="cat[Record[Perturbation]]").save(), ln.Feature(name="replicate", dtype="cat[Record[Replicate]]").save(), ], ).save() # define the ['rna'].obs schema obs_schema_rna = ln.Schema( name="mudata_papalexi21_subset_rna_obs_schema", features=[ ln.Feature(name="nCount_RNA", dtype=int).save(), ln.Feature(name="nFeature_RNA", dtype=int).save(), ln.Feature(name="percent.mito", dtype=float).save(), ], ).save() # define the ['hto'].obs schema obs_schema_hto = ln.Schema( name="mudata_papalexi21_subset_hto_obs_schema", features=[ ln.Feature(name="nCount_HTO", dtype=float).save(), ln.Feature(name="nFeature_HTO", dtype=int).save(), ln.Feature(name="technique", dtype=bt.ExperimentalFactor).save(), ], ).save() # define ['rna'].var schema var_schema_rna = ln.Schema( name="mudata_papalexi21_subset_rna_var_schema", itype=bt.Gene.symbol, dtype=float, ).save() # define composite schema mudata_schema = ln.Schema( name="mudata_papalexi21_subset_mudata_schema", otype="MuData", slots={ "obs": obs_schema, "rna:obs": obs_schema_rna, "hto:obs": obs_schema_hto, "rna:var": var_schema_rna, "uns:study_metadata": study_metadata_schema, }, ).save() # curate a MuData mdata = ln.examples.datasets.mudata_papalexi21_subset(with_uns=True) bt.settings.organism = "human" # set the organism to map gene symbols curator = ln.curators.MuDataCurator(mdata, mudata_schema) artifact = 
curator.save_artifact(key="examples/mudata_papalexi21_subset.h5mu") assert artifact.schema == mudata_schema ================================================ FILE: docs/scripts/curate_soma_experiment.py ================================================ import lamindb as ln import bionty as bt import tiledbsoma as soma import tiledbsoma.io adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") tiledbsoma.io.from_anndata("small_dataset.tiledbsoma", adata, measurement_name="RNA") obs_schema = ln.Schema( name="soma_obs_schema", features=[ ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(), ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(), ], ).save() var_schema = ln.Schema( name="soma_var_schema", features=[ ln.Feature(name="var_id", dtype=bt.Gene.ensembl_gene_id).save(), ], coerce=True, ).save() soma_schema = ln.Schema( name="soma_experiment_schema", otype="tiledbsoma", slots={ "obs": obs_schema, "ms:RNA.T": var_schema, }, ).save() with soma.Experiment.open("small_dataset.tiledbsoma") as experiment: curator = ln.curators.TiledbsomaExperimentCurator(experiment, soma_schema) curator.validate() artifact = curator.save_artifact( key="examples/soma_experiment.tiledbsoma", description="SOMA experiment with schema validation", ) assert artifact.schema == soma_schema artifact.describe() ================================================ FILE: docs/scripts/curate_spatialdata.py ================================================ import lamindb as ln spatialdata = ln.examples.datasets.spatialdata_blobs() sdata_schema = ln.Schema.get(name="spatialdata_blobs_schema") curator = ln.curators.SpatialDataCurator(spatialdata, sdata_schema) try: curator.validate() except ln.errors.ValidationError: pass spatialdata.tables["table"].var.drop(index="ENSG00000999999", inplace=True) # validate again (must pass now) and save artifact artifact = ln.Artifact.from_spatialdata( spatialdata, key="examples/spatialdata1.zarr", schema=sdata_schema ).save() artifact.describe() ================================================ FILE: docs/scripts/define_schema_anndata_uns.py ================================================ import lamindb as ln from define_schema_df_metadata import study_metadata_schema anndata_uns_schema = ln.Schema( otype="AnnData", slots={ "uns:study_metadata": study_metadata_schema, }, ).save() ================================================ FILE: docs/scripts/define_schema_df_metadata.py ================================================ import lamindb as ln study_metadata_schema = ln.Schema( name="Study metadata schema", features=[ ln.Feature(name="temperature", dtype=float).save(), ln.Feature(name="experiment", dtype=str).save(), ], ).save() ================================================ FILE: docs/scripts/define_schema_spatialdata.py ================================================ import lamindb as ln import bionty as bt # a very comprehensive schema for different slots of a SpatialData object # define or query features bio_dict = ln.Feature(name="bio", dtype=dict).save() tech_dict = ln.Feature(name="tech", dtype=dict).save() disease = ln.Feature(name="disease", dtype=bt.Disease, coerce=True).save() developmental_stage = ln.Feature( name="developmental_stage", dtype=bt.DevelopmentalStage, coerce=True, ).save() assay = ln.Feature(name="assay", dtype=bt.ExperimentalFactor, coerce=True).save() sample_region = ln.Feature(name="sample_region", dtype=str).save() analysis = ln.Feature(name="analysis", dtype=str).save() # define or query schema components attrs_schema = 
ln.Schema([bio_dict, tech_dict]).save() sample_schema = ln.Schema([disease, developmental_stage]).save() tech_schema = ln.Schema([assay]).save() obs_schema = ln.Schema([sample_region]).save() uns_schema = ln.Schema([analysis]).save() # enforces only registered Ensembl Gene IDs pass validation (maximal_set=True) varT_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id, maximal_set=True).save() # compose the SpatialData schema sdata_schema = ln.Schema( name="spatialdata_blobs_schema", otype="SpatialData", slots={ "attrs:bio": sample_schema, "attrs:tech": tech_schema, "attrs": attrs_schema, "tables:table:obs": obs_schema, "tables:table:var.T": varT_schema, }, ).save() ================================================ FILE: docs/scripts/my_workflow.py ================================================ import lamindb as ln @ln.flow() def ingest_dataset(key: str) -> ln.Artifact: df = ln.examples.datasets.mini_immuno.get_dataset1() artifact = ln.Artifact.from_dataframe(df, key=key).save() return artifact if __name__ == "__main__": ingest_dataset(key="my_analysis/dataset.parquet") ================================================ FILE: docs/scripts/my_workflow_with_click.py ================================================ import click import lamindb as ln @click.command() @click.option("--key", required=True) @ln.flow() def main(key: str): df = ln.examples.datasets.mini_immuno.get_dataset2() ln.Artifact.from_dataframe(df, key=key).save() if __name__ == "__main__": main() ================================================ FILE: docs/scripts/my_workflow_with_step.py ================================================ import lamindb as ln @ln.step() def subset_dataframe( artifact: ln.Artifact, subset_rows: int = 2, subset_cols: int = 2, ) -> ln.Artifact: df = artifact.load() new_data = df.iloc[:subset_rows, :subset_cols] new_key = artifact.key.replace(".parquet", "_subsetted.parquet") return ln.Artifact.from_dataframe(new_data, key=new_key).save() @ln.flow() def ingest_dataset(key: str, subset: bool = False) -> ln.Artifact: df = ln.examples.datasets.mini_immuno.get_dataset1() artifact = ln.Artifact.from_dataframe(df, key=key).save() if subset: artifact = subset_dataframe(artifact) return artifact if __name__ == "__main__": ingest_dataset(key="my_analysis/dataset.parquet", subset=True) ================================================ FILE: docs/scripts/run_script_with_step.py ================================================ import argparse import lamindb as ln @ln.step() def subset_dataframe( artifact: ln.Artifact, subset_rows: int = 2, subset_cols: int = 2, run: ln.Run | None = None, ) -> ln.Artifact: dataset = artifact.load(is_run_input=run) new_data = dataset.iloc[:subset_rows, :subset_cols] new_key = artifact.key.replace(".parquet", "_subsetted.parquet") return ln.Artifact.from_dataframe(new_data, key=new_key, run=run).save() if __name__ == "__main__": p = argparse.ArgumentParser() p.add_argument("--subset", action="store_true") args = p.parse_args() params = {"is_subset": args.subset} ln.track(params=params) if args.subset: df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame") artifact = ln.Artifact.from_dataframe( df, key="my_analysis/dataset.parquet" ).save() subsetted_artifact = subset_dataframe(artifact) ln.finish() ================================================ FILE: docs/scripts/run_track_and_finish.py ================================================ import lamindb as ln ln.track() # initiate a tracked notebook/script run # your code automatically tracks inputs & outputs ln.finish() # 
mark run as finished, save execution report, source code & environment ================================================ FILE: docs/scripts/run_track_with_features_and_params.py ================================================ import argparse import lamindb as ln if __name__ == "__main__": p = argparse.ArgumentParser() p.add_argument("--s3-folder", type=str) p.add_argument("--experiment", type=str) args = p.parse_args() features = { "s3_folder": args.s3_folder, "experiment": args.experiment, } ln.track(features=features, params={"example_param": 42}) # your code ln.finish() ================================================ FILE: docs/scripts/run_track_with_params.py ================================================ import argparse import lamindb as ln if __name__ == "__main__": p = argparse.ArgumentParser() p.add_argument("--input-dir", type=str) p.add_argument("--downsample", action="store_true") p.add_argument("--learning-rate", type=float) args = p.parse_args() params = { "input_dir": args.input_dir, "learning_rate": args.learning_rate, "preprocess_params": { "downsample": args.downsample, "normalization": "the_good_one", }, } ln.track(params=params) # your code ln.finish() ================================================ FILE: docs/scripts/synced_with_git.py ================================================ import lamindb as ln ln.settings.sync_git_repo = "https://github.com/..." ln.track() # your code ln.finish() ================================================ FILE: docs/storage/add-replace-cache.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "0", "metadata": {}, "source": [ "# Add, replace, cache and delete artifacts" ] }, { "cell_type": "code", "execution_count": null, "id": "1", "metadata": {}, "outputs": [], "source": [ "import pytest\n", "import shutil\n", "import lamindb as ln" ] }, { "cell_type": "code", "execution_count": null, "id": "2", "metadata": {}, "outputs": [], "source": [ "ln.setup.login(\"testuser1\")" ] }, { "cell_type": "code", "execution_count": null, "id": "3", "metadata": { "tags": [ "hide-output", "hide-cell" ] }, "outputs": [], "source": [ "try:\n", " root_path = ln.UPath(\"s3://lamindb-ci/test-add-replace-cache\")\n", " if root_path.exists():\n", " root_path.rmdir()\n", " ln.setup.delete(\"testuser1/test-add-replace-cache\", force=True)\n", "except BaseException: # noqa: S110\n", " pass" ] }, { "cell_type": "code", "execution_count": null, "id": "4", "metadata": {}, "outputs": [], "source": [ "ln.setup.init(storage=\"s3://lamindb-ci/test-add-replace-cache\")" ] }, { "cell_type": "markdown", "id": "5", "metadata": {}, "source": [ "## Save with auto-managed (`key=None`)" ] }, { "cell_type": "code", "execution_count": null, "id": "6", "metadata": {}, "outputs": [], "source": [ "AUTO_KEY_PREFIX = ln.core.storage.paths.AUTO_KEY_PREFIX\n", "root = ln.settings.storage.root" ] }, { "cell_type": "code", "execution_count": null, "id": "7", "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact(\"./test-files/iris.csv\", description=\"iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "8", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "9", "metadata": {}, "outputs": [], "source": [ "key_path = root / f\"{AUTO_KEY_PREFIX}{artifact.uid}{artifact.suffix}\"\n", "assert key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "10", "metadata": {}, "outputs": [], "source": [ "cache_csv_path = 
artifact.cache()\n", "print(cache_csv_path)\n", "assert cache_csv_path.suffix == \".csv\"" ] }, { "cell_type": "code", "execution_count": null, "id": "11", "metadata": {}, "outputs": [], "source": [ "artifact.replace(\"./test-files/iris.data\")\n", "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "12", "metadata": {}, "outputs": [], "source": [ "old_key_path = key_path\n", "new_key_path = root / f\"{AUTO_KEY_PREFIX}{artifact.uid}{artifact.suffix}\"" ] }, { "cell_type": "markdown", "id": "13", "metadata": {}, "source": [ "The suffix changed:" ] }, { "cell_type": "code", "execution_count": null, "id": "14", "metadata": {}, "outputs": [], "source": [ "print(old_key_path)\n", "print(new_key_path)\n", "assert not old_key_path.exists()\n", "assert new_key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "15", "metadata": {}, "outputs": [], "source": [ "cache_data_path = artifact.cache()\n", "print(cache_data_path)\n", "assert cache_data_path.suffix == \".data\"\n", "assert cache_data_path.stat().st_mtime >= cache_csv_path.stat().st_mtime" ] }, { "cell_type": "code", "execution_count": null, "id": "16", "metadata": {}, "outputs": [], "source": [ "artifact.delete(permanent=True)" ] }, { "cell_type": "markdown", "id": "17", "metadata": {}, "source": [ "## Save with manually passed real `key`" ] }, { "cell_type": "code", "execution_count": null, "id": "18", "metadata": {}, "outputs": [], "source": [ "ln.settings.creation._artifact_use_virtual_keys = False" ] }, { "cell_type": "code", "execution_count": null, "id": "19", "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact(\"./test-files/iris.csv\", key=\"iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "20", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "21", "metadata": {}, "outputs": [], "source": [ "key_path = root / \"iris.csv\"" ] }, { "cell_type": "code", "execution_count": null, "id": "22", "metadata": {}, "outputs": [], "source": [ "assert key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "23", "metadata": {}, "outputs": [], "source": [ "artifact.replace(\"./test-files/new_iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "24", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "markdown", "id": "25", "metadata": {}, "source": [ "Check paths: no changes here, as the suffix didn't change." 
] }, { "cell_type": "code", "execution_count": null, "id": "26", "metadata": {}, "outputs": [], "source": [ "old_key_path = key_path\n", "new_key_path = root / \"new_iris.csv\"" ] }, { "cell_type": "code", "execution_count": null, "id": "27", "metadata": {}, "outputs": [], "source": [ "old_key_path" ] }, { "cell_type": "code", "execution_count": null, "id": "28", "metadata": {}, "outputs": [], "source": [ "new_key_path" ] }, { "cell_type": "code", "execution_count": null, "id": "29", "metadata": {}, "outputs": [], "source": [ "assert old_key_path.exists()\n", "assert not new_key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "30", "metadata": {}, "outputs": [], "source": [ "artifact.replace(\"./test-files/iris.data\")" ] }, { "cell_type": "code", "execution_count": null, "id": "31", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "32", "metadata": {}, "outputs": [], "source": [ "new_key_path = root / \"iris.data\"" ] }, { "cell_type": "code", "execution_count": null, "id": "33", "metadata": {}, "outputs": [], "source": [ "old_key_path" ] }, { "cell_type": "code", "execution_count": null, "id": "34", "metadata": {}, "outputs": [], "source": [ "new_key_path" ] }, { "cell_type": "code", "execution_count": null, "id": "35", "metadata": {}, "outputs": [], "source": [ "assert not old_key_path.exists()\n", "assert new_key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "36", "metadata": {}, "outputs": [], "source": [ "artifact.delete(permanent=True, storage=True)" ] }, { "cell_type": "markdown", "id": "37", "metadata": {}, "source": [ "## Save from memory" ] }, { "cell_type": "code", "execution_count": null, "id": "38", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "id": "39", "metadata": {}, "outputs": [], "source": [ "iris = pd.read_csv(\"./test-files/iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "40", "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact.from_dataframe(\n", " iris, description=\"iris_store\", key=\"iris.parquet\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "41", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "42", "metadata": {}, "outputs": [], "source": [ "key_path = root / \"iris.parquet\"" ] }, { "cell_type": "code", "execution_count": null, "id": "43", "metadata": {}, "outputs": [], "source": [ "assert key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "44", "metadata": {}, "outputs": [], "source": [ "artifact.replace(data=iris[:-1])" ] }, { "cell_type": "code", "execution_count": null, "id": "45", "metadata": {}, "outputs": [], "source": [ "assert artifact.key == \"iris.parquet\"" ] }, { "cell_type": "code", "execution_count": null, "id": "46", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "47", "metadata": {}, "outputs": [], "source": [ "assert key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "48", "metadata": {}, "outputs": [], "source": [ "artifact.replace(\"./test-files/new_iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "49", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "50", "metadata": {}, "outputs": [], "source": 
[ "old_key_path = key_path\n", "new_key_path = root / \"iris.csv\"" ] }, { "cell_type": "code", "execution_count": null, "id": "51", "metadata": {}, "outputs": [], "source": [ "old_key_path" ] }, { "cell_type": "code", "execution_count": null, "id": "52", "metadata": {}, "outputs": [], "source": [ "new_key_path" ] }, { "cell_type": "code", "execution_count": null, "id": "53", "metadata": {}, "outputs": [], "source": [ "assert not old_key_path.exists()\n", "assert new_key_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "54", "metadata": {}, "outputs": [], "source": [ "# we use the path in the next sections\n", "path_in_storage = artifact.path\n", "artifact.delete(permanent=True, storage=False)" ] }, { "cell_type": "markdown", "id": "55", "metadata": {}, "source": [ "## Save with manually passed virtual `key`" ] }, { "cell_type": "code", "execution_count": null, "id": "56", "metadata": {}, "outputs": [], "source": [ "ln.settings.creation._artifact_use_virtual_keys = True" ] }, { "cell_type": "code", "execution_count": null, "id": "57", "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact(\"./test-files/iris.csv\", key=\"iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "58", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "59", "metadata": {}, "outputs": [], "source": [ "with pytest.raises(ValueError) as err:\n", " artifact.replace(path_in_storage)\n", "assert err.exconly().startswith(\n", " \"ValueError: Can only replace with a local path not in any Storage.\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "60", "metadata": {}, "outputs": [], "source": [ "# return an existing artifact if the hash is the same\n", "assert artifact == artifact.replace(\"./test-files/iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "61", "metadata": {}, "outputs": [], "source": [ "fpath = artifact.path\n", "assert fpath.suffix == \".csv\" and fpath.stem == artifact.uid" ] }, { "cell_type": "code", "execution_count": null, "id": "62", "metadata": {}, "outputs": [], "source": [ "artifact.replace(\"./test-files/iris.data\")" ] }, { "cell_type": "code", "execution_count": null, "id": "63", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "64", "metadata": {}, "outputs": [], "source": [ "assert artifact.key == \"iris.data\"" ] }, { "cell_type": "code", "execution_count": null, "id": "65", "metadata": {}, "outputs": [], "source": [ "assert not fpath.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "66", "metadata": {}, "outputs": [], "source": [ "fpath = artifact.path\n", "assert fpath.suffix == \".data\" and fpath.stem == artifact.uid" ] }, { "cell_type": "code", "execution_count": null, "id": "67", "metadata": {}, "outputs": [], "source": [ "artifact.delete(permanent=True, storage=True)" ] }, { "cell_type": "markdown", "id": "68", "metadata": {}, "source": [ "## Save in existing storage with a virtual `key`" ] }, { "cell_type": "code", "execution_count": null, "id": "69", "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact(path_in_storage, key=\"iris_test.csv\").save()" ] }, { "cell_type": "code", "execution_count": null, "id": "70", "metadata": {}, "outputs": [], "source": [ "assert artifact._real_key.endswith(\"iris.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "71", "metadata": {}, "outputs": [], "source": [ 
"artifact.replace(\"./test-files/iris.data\")" ] }, { "cell_type": "code", "execution_count": null, "id": "72", "metadata": {}, "outputs": [], "source": [ "assert artifact._real_key.endswith(\"iris.data\")\n", "assert artifact._clear_storagekey.endswith(\"iris.csv\")\n", "assert artifact.key == \"iris_test.data\"" ] }, { "cell_type": "code", "execution_count": null, "id": "73", "metadata": {}, "outputs": [], "source": [ "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "74", "metadata": {}, "outputs": [], "source": [ "path = artifact.path\n", "\n", "assert path.name == \"iris.data\"\n", "assert path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "75", "metadata": {}, "outputs": [], "source": [ "assert not path_in_storage.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "76", "metadata": {}, "outputs": [], "source": [ "artifact.delete(permanent=True, storage=True)" ] }, { "cell_type": "markdown", "id": "77", "metadata": {}, "source": [ "## Replace with folder artifacts" ] }, { "cell_type": "code", "execution_count": null, "id": "78", "metadata": {}, "outputs": [], "source": [ "adata = ln.examples.datasets.anndata_pbmc68k_reduced()\n", "\n", "adata.write_zarr(\"./test-files/pbmc68k.zarr\")" ] }, { "cell_type": "code", "execution_count": null, "id": "79", "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact(\"./test-files/pbmc68k.zarr\", key=\"pbmc68k.zarr\").save()\n", "save_hash = artifact.hash\n", "save_n_files = artifact.n_files" ] }, { "cell_type": "code", "execution_count": null, "id": "80", "metadata": {}, "outputs": [], "source": [ "with pytest.raises(ValueError) as err:\n", " artifact.replace(\"./test-files/iris.csv\")\n", "assert err.exconly().endswith(\"It is not allowed to replace a folder with a file.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "81", "metadata": {}, "outputs": [], "source": [ "assert save_hash is not None\n", "assert artifact.path.is_dir()" ] }, { "cell_type": "code", "execution_count": null, "id": "82", "metadata": {}, "outputs": [], "source": [ "adata.obs[\"add_new_col\"] = \"new\"\n", "\n", "adata.write_zarr(\"./test-files/pbmc68k_new.zarr\")" ] }, { "cell_type": "code", "execution_count": null, "id": "83", "metadata": {}, "outputs": [], "source": [ "artifact.replace(\"./test-files/pbmc68k_new.zarr\")\n", "artifact.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "84", "metadata": {}, "outputs": [], "source": [ "assert artifact.key == \"pbmc68k.zarr\"\n", "assert artifact.hash != save_hash\n", "assert artifact.n_files != save_n_files\n", "assert artifact.path.is_dir()" ] }, { "cell_type": "code", "execution_count": null, "id": "85", "metadata": {}, "outputs": [], "source": [ "shutil.rmtree(artifact.cache())" ] }, { "cell_type": "code", "execution_count": null, "id": "86", "metadata": {}, "outputs": [], "source": [ "with artifact.open() as store:\n", " assert \"add_new_col\" in store.obs" ] }, { "cell_type": "code", "execution_count": null, "id": "87", "metadata": {}, "outputs": [], "source": [ "# checks that .open above opened the cloud path without syncing\n", "assert not artifact._cache_path.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "88", "metadata": {}, "outputs": [], "source": [ "shutil.rmtree(\"./test-files/pbmc68k.zarr\")\n", "shutil.rmtree(\"./test-files/pbmc68k_new.zarr\")" ] }, { "cell_type": "code", "execution_count": null, "id": "89", "metadata": {}, "outputs": [], "source": [ 
"artifact.delete(permanent=True, storage=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "90", "metadata": { "tags": [] }, "outputs": [], "source": [ "ln.setup.delete(\"test-add-replace-cache\", force=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" }, "nbproject": { "id": "uBQMCcdYwEjA", "parent": null, "pypackage": null, "time_init": "2023-04-04T16:26:17.675023+00:00", "user_handle": "Koncopd", "user_id": "qTQ5q0ar", "user_name": "Sergei Rybakov", "version": "0" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: docs/storage/anndata-accessor.ipynb ================================================ { "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Test `AnnDataAccessor`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import lamindb as ln\n", "\n", "ln.setup.init(storage=\"s3://lamindb-ci/test-anndata\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We'll need some test data:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [] }, "outputs": [], "source": [ "ln.Artifact(\"s3://lamindb-ci/test-anndata/pbmc68k.h5ad\").save()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "An `h5ad` artifact stored on s3:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact.filter(key=\"pbmc68k.h5ad\").one()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "artifact.path" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata = artifact.open()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "It is possible to access `AnnData` attributes without loading them into memory" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-cell" ] }, "outputs": [], "source": [ "print(adata.obsm)\n", "print(adata.varm)\n", "print(adata.obsp)\n", "print(adata.varm)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "However, `.obs`, `.var` and `.uns` are always loaded fully into memory on `AnnDataAccessor` initialization" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata.obs.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata.var.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata.uns.keys()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Without subsetting, the `AnnDataAccessor` object gives references to underlying lazy `h5` or `zarr` arrays:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata.X" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata.obsm[\"X_pca\"]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "And to a lazy 
`SparseDataset` from the `anndata` package:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata.obsp[\"distances\"]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Get a subset of the object, attributes are loaded only on explicit access:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "obs_idx = adata.obs.cell_type.isin([\"Dendritic cells\", \"CD14+ Monocytes\"]) & (\n", " adata.obs.percent_mito <= 0.05\n", ")\n", "adata_subset = adata[obs_idx]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata_subset" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Check shapes of the subset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-cell" ] }, "outputs": [], "source": [ "num_idx = sum(obs_idx)\n", "assert adata_subset.shape == (num_idx, adata.shape[1])\n", "assert (adata_subset.obs.cell_type == \"CD34+\").sum() == 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata_subset.obs.cell_type.value_counts()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Subsets load the arrays into memory only on direct access" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(adata_subset.X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(adata_subset.obsm[\"X_pca\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-cell" ] }, "outputs": [], "source": [ "assert adata_subset.obsp[\"distances\"].shape[0] == num_idx" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "To load the entire subset into memory as an actual `AnnData` object, use `to_memory()`:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "adata_subset.to_memory()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-cell" ] }, "outputs": [], "source": [ "!lamin delete --force test-anndata" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" }, "nbproject": { "id": "YVUCtH4GfQOy", "parent": null, "pypackage": null, "time_init": "2023-01-23T08:28:32.097943+00:00", "user_handle": "testuser1", "user_id": "DzTjkKse", "user_name": "Test User1", "version": "0" }, "vscode": { "interpreter": { "hash": "ae1fefc8646a06dd2e75004cd934adda7c5727b046986a772e3b44b0ffba9754" } } }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: docs/storage/prepare-sync-local-to-cloud.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Prepare sync artifacts from a local instance to a cloud instance" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!lamin disconnect\n", "# need to add pertdb to environment in order to import it\n", "!lamin settings modules set bionty,pertdb" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": 
[], "source": [ "import lamindb as ln\n", "import bionty as bt\n", "import pertdb\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ln.setup.init(storage=\"./test-sync-to-cloud\", modules=\"bionty,pertdb\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact.from_dataframe(\n", " pd.DataFrame({\"a\": [1, 2, 3]}), description=\"test-sync-to-cloud\"\n", ").save()\n", "features = bt.CellMarker.from_values(\n", " [\"PD1\", \"CD21\"], field=bt.CellMarker.name, organism=\"human\"\n", ").save()\n", "artifact.features._add_schema(ln.Schema(features), slot=\"var\")\n", "organism = bt.Organism.from_source(name=\"human\").save()\n", "artifact.labels.add(organism)\n", "compound = pertdb.Compound(name=\"compound-test-sync-to-cloud\").save()\n", "artifact.compounds.add(compound)\n", "artifact.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "assert artifact.features.slots[\"var\"].members.count() == 2" ] } ], "metadata": { "kernelspec": { "display_name": "py312", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: docs/storage/sync-local-to-cloud.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sync artifacts from a local instance to a cloud instance" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import lamindb as ln\n", "\n", "ln.connect(\"laminlabs/lamin-dev\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-cell" ] }, "outputs": [], "source": [ "def cleanup(artifact: ln.Artifact):\n", " features_sets = artifact.schemas.all()\n", " compounds = artifact.compounds.all()\n", " artifact.delete(permanent=True, storage=False)\n", " features_sets.delete()\n", " compounds.delete()\n", "\n", "\n", "artifacts = ln.Artifact.filter(description=\"test-sync-to-cloud\")\n", "for artifact in artifacts:\n", " cleanup(artifact)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "artifact = ln.Artifact.connect(\"testuser1/test-sync-to-cloud\").get(\n", " description=\"test-sync-to-cloud\"\n", ")\n", "artifact.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "artifact.save(transfer=\"annotations\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "artifact.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "assert artifact._state.db == \"default\"\n", "assert artifact.organisms.get().name == \"human\"\n", "assert artifact.compounds.get().name == \"compound-test-sync-to-cloud\"\n", "assert artifact.features.slots[\"var\"].members.count() == 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-cell" ] }, "outputs": [], "source": [ "!rm -r ./test-sync-to-cloud\n", "!lamin delete --force test-sync-to-cloud" ] } ], "metadata": { "kernelspec": { "display_name": "py312", "language": "python", "name": "python3" }, 
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: docs/storage/test-files/iris.data ================================================ 5.1,3.5,1.4,0.2,Iris-setosa 4.9,3.0,1.4,0.2,Iris-setosa 7.0,3.2,4.7,1.4,Iris-versicolor 6.4,3.2,4.5,1.5,Iris-versicolor 6.2,3.4,5.4,2.3,Iris-virginica 5.9,3.0,5.1,1.8,Iris-virginica ================================================ FILE: docs/storage/test_notebooks.py ================================================ from pathlib import Path import nbproject_test as test import lamindb as ln def test_notebooks(): nbdir = Path(__file__).parent ln.setup.login("testuser1") test.execute_notebooks(nbdir, write=True) ================================================ FILE: docs/storage/upload.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "0", "metadata": {}, "source": [ "# Track artifacts, in-memory objects & folders [S3 storage]" ] }, { "cell_type": "code", "execution_count": null, "id": "1", "metadata": {}, "outputs": [], "source": [ "import lamindb as ln\n", "import pytest" ] }, { "cell_type": "code", "execution_count": null, "id": "2", "metadata": {}, "outputs": [], "source": [ "ln.setup.login(\"testuser1\")" ] }, { "cell_type": "code", "execution_count": null, "id": "3", "metadata": { "tags": [ "hide-output", "hide-cell" ] }, "outputs": [], "source": [ "try:\n", " root_path = ln.UPath(\"s3://lamindb-ci/test-upload\")\n", " if root_path.exists():\n", " root_path.rmdir()\n", " ln.setup.delete(\"testuser1/test-upload\", force=True)\n", "except BaseException: # noqa: S110\n", " pass" ] }, { "cell_type": "code", "execution_count": null, "id": "4", "metadata": {}, "outputs": [], "source": [ "ln.setup.init(storage=\"s3://lamindb-ci/test-upload\")" ] }, { "cell_type": "markdown", "id": "5", "metadata": {}, "source": [ "## Local artifacts" ] }, { "cell_type": "markdown", "id": "6", "metadata": {}, "source": [ "Some test data." 
] }, { "cell_type": "code", "execution_count": null, "id": "7", "metadata": {}, "outputs": [], "source": [ "pbmc68k = ln.examples.datasets.anndata_pbmc68k_reduced()" ] }, { "cell_type": "markdown", "id": "8", "metadata": {}, "source": [ "Subset to a mini artifact to speed up the run time of this notebook:" ] }, { "cell_type": "code", "execution_count": null, "id": "9", "metadata": {}, "outputs": [], "source": [ "pbmc68k = pbmc68k[:5, :5].copy()" ] }, { "cell_type": "code", "execution_count": null, "id": "10", "metadata": {}, "outputs": [], "source": [ "pbmc68k" ] }, { "cell_type": "markdown", "id": "11", "metadata": {}, "source": [ "### Upload from memory using explicit semantic `key`" ] }, { "cell_type": "markdown", "id": "12", "metadata": {}, "source": [ "#### Upload h5ad" ] }, { "cell_type": "code", "execution_count": null, "id": "13", "metadata": {}, "outputs": [], "source": [ "pbmc68k_h5ad = ln.Artifact.from_anndata(pbmc68k, key=\"test-upload/pbmc68k.h5ad\")" ] }, { "cell_type": "code", "execution_count": null, "id": "14", "metadata": {}, "outputs": [], "source": [ "pbmc68k_h5ad.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "15", "metadata": {}, "outputs": [], "source": [ "pbmc68k_h5ad.delete(permanent=True)" ] }, { "cell_type": "markdown", "id": "16", "metadata": {}, "source": [ "#### Upload zarr" ] }, { "cell_type": "code", "execution_count": null, "id": "17", "metadata": {}, "outputs": [], "source": [ "# Runs too long, should be tested elsewhere\n", "# pbmc68k_zarr = ln.Artifact(pbmc68k, key=\"test-upload/pbmc68k.zarr\", format=\"zarr\")\n", "# ln.save(pbmc68k_zarr)\n", "# pbmc68k_zarr.delete(permanent=True, storage=True)" ] }, { "cell_type": "markdown", "id": "18", "metadata": {}, "source": [ "### Upload using `id` with implicit `key`" ] }, { "cell_type": "markdown", "id": "19", "metadata": {}, "source": [ "#### Upload h5ad" ] }, { "cell_type": "code", "execution_count": null, "id": "20", "metadata": {}, "outputs": [], "source": [ "pbmc68k_h5ad = ln.Artifact.from_anndata(pbmc68k, description=\"pbmc68k.h5ad\")" ] }, { "cell_type": "code", "execution_count": null, "id": "21", "metadata": {}, "outputs": [], "source": [ "pbmc68k_h5ad.save()" ] }, { "cell_type": "code", "execution_count": null, "id": "22", "metadata": {}, "outputs": [], "source": [ "pbmc68k_h5ad.delete(permanent=True, storage=True)" ] }, { "cell_type": "markdown", "id": "23", "metadata": {}, "source": [ "#### Upload zarr" ] }, { "cell_type": "code", "execution_count": null, "id": "24", "metadata": {}, "outputs": [], "source": [ "# Runs too long, should be tested elsewhere\n", "# pbmc68k_zarr = ln.Artifact(pbmc68k, name=\"pbmc68k.zarr\", format=\"zarr\")\n", "# ln.save(pbmc68k_zarr)\n", "# pbmc68k_zarr.delete(permanent=True, storage=True)" ] }, { "cell_type": "markdown", "id": "25", "metadata": { "tags": [] }, "source": [ "### Error behaviors" ] }, { "cell_type": "markdown", "id": "26", "metadata": {}, "source": [ "Specified file does not exist." ] }, { "cell_type": "code", "execution_count": null, "id": "27", "metadata": {}, "outputs": [], "source": [ "with pytest.raises(FileNotFoundError):\n", " non_existent_h5ad = ln.Artifact(\n", " \"s3://lamindb-ci/test-upload/non_existent_file.h5ad\"\n", " )" ] }, { "cell_type": "markdown", "id": "28", "metadata": {}, "source": [ "Specified buket does not exist. Normally non-existent bucket raises `FileNotFoundError`, but sometimes strarts to raise `PermissionError`." 
] }, { "cell_type": "code", "execution_count": null, "id": "29", "metadata": {}, "outputs": [], "source": [ "with pytest.raises((FileNotFoundError, PermissionError)):\n", " non_existent_h5ad = ln.Artifact(\n", " \"s3://non_existent_bucket_6612366/non_existent_file.h5ad\"\n", " )" ] }, { "cell_type": "markdown", "id": "30", "metadata": {}, "source": [ "## Test existing zarr" ] }, { "cell_type": "markdown", "id": "31", "metadata": {}, "source": [ "See `test_artifact.py` for other artifact types." ] }, { "cell_type": "markdown", "id": "32", "metadata": {}, "source": [ "This should probably go elsewhere:" ] }, { "cell_type": "code", "execution_count": null, "id": "33", "metadata": {}, "outputs": [], "source": [ "# temporarily comment out because of head bucket permission error when\n", "# attempting to get region\n", "# artifact = ln.Artifact(\"s3://lamindb-ci/lndb-storage/pbmc68k.zarr\")\n", "# artifact.save()\n", "# artifact.open()" ] }, { "cell_type": "code", "execution_count": null, "id": "34", "metadata": {}, "outputs": [], "source": [ "ln.setup.delete(\"test-upload\", force=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" }, "nbproject": { "id": "psZgub4FOmzS", "parent": null, "pypackage": null, "time_init": "2023-04-09T20:01:57.780053+00:00", "user_handle": "testuser1", "user_id": "DzTjkKse", "user_name": "Test User1", "version": "0" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: docs/storage/vitessce.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Vitessce integration" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For more comprehensive integration tests, see: https://github.com/laminlabs/lamin-spatial" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!lamin login testuser1\n", "!lamin init --storage \"s3://lamindb-ci/test-vitessce\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import lamindb as ln\n", "import pytest\n", "from vitessce import (\n", " VitessceConfig,\n", " AnnDataWrapper,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Set up test data:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pbmc68k = ln.examples.datasets.anndata_pbmc68k_reduced()[:100, :200].copy()\n", "zarr_filepath = \"my_test.zarr\"\n", "# write the anndata to a local zarr path\n", "pbmc68k.write_zarr(zarr_filepath)\n", "# create an artifact from the path\n", "dataset_artifact = ln.Artifact(zarr_filepath, description=\"Test dataset\").save()\n", "# this is the where the zarr folder is located on a public S3 bucket\n", "dataset_artifact.path.to_url()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a `VitessceConfig` object: " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [ "hide-output" ] }, "outputs": [], "source": [ "vc = VitessceConfig(schema_version=\"1.0.15\")\n", "vc.add_dataset(name=\"test1\").add_object(\n", " AnnDataWrapper(\n", " adata_artifact=dataset_artifact,\n", " obs_embedding_paths=[\"obsm/X_umap\"],\n", " ),\n", ")\n", 
"vc.to_dict()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vitessce_config_artifact = ln.integrations.save_vitessce_config(\n", " vc, description=\"View testdata in Vitessce\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# different equivalent ways of testing that the action is attached\n", "assert dataset_artifact._actions.get() == vitessce_config_artifact\n", "assert vitessce_config_artifact._action_targets.get() == dataset_artifact\n", "assert vitessce_config_artifact._actions.first() is None\n", "assert vitessce_config_artifact.kind == \"__lamindb_config__\"\n", "assert ln.Artifact.get(_actions=vitessce_config_artifact) == dataset_artifact" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset_artifact.delete(permanent=True)\n", "vitessce_config_artifact.delete(permanent=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Test validation within `save_vitessce_config`:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# pass an artifact URL instead of the artifact object itself\n", "vc = VitessceConfig(schema_version=\"1.0.15\")\n", "with pytest.raises(AttributeError) as error:\n", " vc.add_dataset(name=\"test1\").add_object(\n", " AnnDataWrapper(\n", " adata_artifact=dataset_artifact.path.to_url(),\n", " obs_embedding_paths=[\"obsm/X_umap\"],\n", " ),\n", " )\n", "print(error.exconly())\n", "assert error.exconly().startswith(\n", " \"AttributeError: 'str' object has no attribute 'path'\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!rm -rf test-vitessce\n", "!lamin delete --force test-vitessce" ] } ], "metadata": { "kernelspec": { "display_name": "py312", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: docs/storage.md ================================================ # Storage ```{toctree} :maxdepth: 1 storage/upload storage/add-replace-cache storage/anndata-accessor storage/prepare-sync-local-to-cloud storage/sync-local-to-cloud storage/vitessce ``` ================================================ FILE: docs/sync.md ================================================ --- execute_via: python --- # Sync data across databases This guide shows how to sync objects from a source database to your default database. We need a target database: ```python !lamin init --storage ./test-sync --modules bionty ``` Import `lamindb` and optionally run `ln.track()`: ```python import lamindb as ln ln.track() ``` Syncing works for any object type (`Artifact`, `Record`, `Transform`, `ULabel`, etc.). 
Let's sync an artifact to our current default database: ```python db = ln.DB("laminlabs/lamindata") # query the artifact on the source database artifact = db.Artifact.get(key="example_datasets/mini_immuno/dataset1.h5ad") # sync the artifact to the current database artifact.save() ``` If you also want to sync feature & label annotations, pass `transfer="annotations"`: ```python # query again so that `artifact` holds the object on the source database artifact = db.Artifact.get(key="example_datasets/mini_immuno/dataset1.h5ad") # sync the artifact to the current database, including transfer of annotations where necessary artifact.save(transfer="annotations") ``` The artifact now has all feature & label annotations: ```python artifact.describe() ``` The sync is zero-copy, which means that the data itself remained in the original storage location: ```python artifact.path ``` Data lineage indicates the source database of the sync: ```python artifact.view_lineage() ``` The run that initiated the sync is linked via `initiated_by_run`: ```python artifact.run.initiated_by_run.transform ``` Upon calling `.save()` again, `lamindb` identifies that the object already exists in the target database and simply maps it: ```python artifact = db.Artifact.get(key="example_datasets/mini_immuno/dataset1.h5ad") artifact.save() ``` ```{dropdown} How do I know if an object is in the default database or elsewhere? Every `SQLRecord` object has an attribute `._state.db` which can take the following values: - `None`: the object has not yet been saved to any database - `"default"`: the object is saved on the default database instance - `"account/name"`: the object is saved on a non-default database instance referenced by `account/name` (e.g., `laminlabs/lamindata`) ``` ```python tags=["hide-cell"] # test the last 3 cells here assert artifact.transform.description == "Transfer from `laminlabs/lamindata`" assert artifact.transform.key == "__lamindb_transfer__/4XIuR0tvaiXM" assert artifact.transform.uid == "4XIuR0tvaiXM0000" assert artifact.run.initiated_by_run.transform.description.startswith("Sync data") ``` ================================================ FILE: docs/test_notebooks.py ================================================ import sys from pathlib import Path import nbproject_test as test sys.path[:0] = [str(Path(__file__).parent.parent)] from noxfile import GROUPS DOCS = Path(__file__).parents[1] / "docs/" def test_tutorial(): for artifactname in GROUPS["tutorial"]: test.execute_notebooks(DOCS / artifactname, write=True) def test_guide(): for artifactname in GROUPS["guide"]: test.execute_notebooks(DOCS / artifactname, write=True) def test_tiledbsoma(): for artifactname in GROUPS["tiledbsoma"]: test.execute_notebooks(DOCS / artifactname, write=True) def test_biology(): for artifactname in GROUPS["biology"]: test.execute_notebooks(DOCS / artifactname, write=True) ================================================ FILE: docs/track.md ================================================ --- execute_via: python --- # Track notebooks, scripts & workflows This guide walks from tracking data lineage in a notebook to tracking parameters in workflows. ```{raw} html ``` **Note:** To run examples, if you don't have a `lamindb` instance, create one: ```python !lamin init --storage ./test-track ``` ## Manage notebooks and scripts Call {meth}`~lamindb.track` to save your notebook or script as a `transform` and start tracking inputs & outputs of a run. ```{eval-rst} .. 
literalinclude:: scripts/run_track_and_finish.py :language: python ``` You find your notebooks and scripts in the {class}`~lamindb.Transform` registry along with pipelines & functions: ```python transform = ln.Transform.get(key="my_analyses/my_notebook.ipynb") transform.source_code # source code transform.runs.to_dataframe() # all runs in a dataframe transform.latest_run.report # report of latest run transform.latest_run.environment # environment of latest run ``` You can use the CLI to load a transform into your current (development) directory: ```bash lamin load --key my_analyses/my_notebook.ipynb ``` Here is how you'd load the [notebook from the video](https://lamin.ai/laminlabs/lamindata/transform/F4L3oC6QsZvQ) into your local directory: ```bash lamin load https://lamin.ai/laminlabs/lamindata/transform/F4L3oC6QsZvQ ``` (sync-code-with-git)= ### Organize local development If no development directory is set, script & notebook keys equal their filenames. Otherwise, they represent the relative path in the development directory. The exception is packaged source code, whose keys have the form `pypackages/{package_name}/path/to/file.py`. To set the development directory to your shell's current working directory, run: ```bash lamin settings set dev-dir . ``` You can see the current status by running: ```bash lamin info ``` When you `cd` into that directory, you will now auto-connect to the configured lamindb instance. To sync scripts or workflows with their corresponding files in a git repo, either export an environment variable: ```shell export LAMINDB_SYNC_GIT_REPO = <url-of-your-git-repo> ``` Or set the following setting: ```python ln.settings.sync_git_repo = "<url-of-your-git-repo>" ``` If you work on a single project in your lamindb instance, it makes sense to set LaminDB's `dev-dir` to the root of the local git repo clone. ```bash dbs/ project1/ .git/ .lamin/ script1.py notebook1.ipynb ... ``` If you work on multiple projects in your lamindb instance, you can use the `dev-dir` as the local root and nest git repositories in it. ```bash dbs/ database1/ .lamin/ repo1/ .git/ repo2/ .git/ ... ``` ### Use projects You can link the entities created during a run to a project. ```python import lamindb as ln my_project = ln.Project(name="My project").save() # create & save a project ln.track(project="My project") # pass project open("sample.fasta", "w").write(">seq1\nACGT\n") # create a dataset ln.Artifact("sample.fasta", key="sample.fasta").save() # auto-labeled by project ``` Filter entities by project, e.g., artifacts: ```python ln.Artifact.filter(projects=my_project).to_dataframe() ``` Access entities linked to a project: ```python my_project.artifacts.to_dataframe() ``` The same works for `my_project.transforms` or `my_project.runs`. ### Use spaces You can write the entities created during a run into a space that you configure on LaminHub. This is particularly useful if you want to restrict access to a space. Note that this doesn't affect bionty entities, which should typically be commonly accessible.
```python ln.track(space="Our team space") ``` ### Track agent plans Saving an agent plan automatically tags it with `artifact.kind = "plan"` and infers a `key` starting with `.plans/`: ```bash lamin save /path/to/.cursor/plans/my_task.plan.md lamin save /path/to/.claude/plans/my_task.md ``` Link an agent plan against a run: ```python ln.track(plan=".plans/my-agent-plan.md") ``` This links the `plan` artifact to a run in the same way that `transform`, an initiating run (`initiated_by_run`), and `report` / `environment` artifacts are linked to the run. While `transform` acts as the deterministic source code for the run and `initiated_by_run` enables higher-level runs in workflow orchestration, the agent `plan` complements these by linking a plan that steers a non-deterministic agent. (manage-workflows)= ## Manage workflows Here we'll manage workflows with `lamindb`'s {func}`~lamindb.flow` and {func}`~lamindb.step` decorators, which work out-of-the-box with the majority of Python workflow managers:

| tool | workflow decorator | step/task decorator | notes |
| --------- | ------------------ | ------------------- | ---------------------------------------------- |
| `lamindb` | `@flow` | `@step` | inspired by `prefect` |
| `prefect` | `@flow` | `@task` | two decorators |
| `redun` | `@task` (on main) | `@task` | single decorator for everything |
| `dagster` | `@job` or `@asset` | `@op` or `@asset` | asset-centric; `@asset` is primary |
| `flyte` | `@workflow` | `@task` | also `@dynamic` for runtime DAGs |
| `airflow` | `@dag` | `@task` | TaskFlow API (modern); also supports operators |
| `zenml` | `@pipeline` | `@step` | inspired by `prefect` |

If you're looking for more in-depth examples or for integrating with non-decorator-based workflow managers such as Nextflow or Snakemake, see {doc}`docs:pipelines`.

| tool | workflow | step/task | notes |
| ----------- | ------------------ | ----------------- | ---------------- |
| `nextflow` | `workflow` keyword | `process` keyword | groovy-based DSL |
| `snakemake` | `rule` keyword | `rule` keyword | file-based DSL |
| `metaflow` | `FlowSpec` | `@step` | class-based |
| `kedro` | `Pipeline()` | `node()` | function-based |

### A one-step workflow Decorate a function with {func}`~lamindb.flow` to track it as a workflow: ```{eval-rst} .. 
literalinclude:: scripts/my_workflow_with_step.py :language: python :caption: my_workflow_with_step.py ``` Let's run the workflow: ```python !python scripts/my_workflow_with_step.py ``` The lineage of the subsetted artifact resolves the subsetting step: ```python subsetted_artifact = ln.Artifact.get(key="my_analysis/dataset_subsetted.parquet") subsetted_artifact.view_lineage() ``` This is the run that created the `subsetted_artifact`: ```python subsetted_artifact.run ``` This is the initiating run that triggered the function call: ```python subsetted_artifact.run.initiated_by_run ``` These are the parameters of the run: ```python subsetted_artifact.run.params ``` These are the input artifacts: ```python subsetted_artifact.run.input_artifacts.to_dataframe() ``` These are the output artifacts: ```python subsetted_artifact.run.output_artifacts.to_dataframe() ``` ### A workflow with CLI arguments Let's use `click` to parse CLI arguments: ```{eval-rst} .. literalinclude:: scripts/my_workflow_with_click.py :language: python :caption: my_workflow_with_click.py ``` Let's run the workflow: ```python !python scripts/my_workflow_with_click.py --key my_analysis/dataset2.parquet ``` CLI arguments are tracked and accessible via `run.cli_args`: ```python run = ln.Run.filter(transform__key="my_workflow_with_click.py").first() run.describe() ``` Note that it doesn't matter whether you use `click`, `argparse`, or any other CLI argument parser. (track-run-parameters)= ## Track parameters & features We just saw that the function decorators `@ln.flow()` and `@ln.step()` track parameter values automatically. Here is how to pass parameters to `ln.track()`: ```{eval-rst} .. literalinclude:: scripts/run_track_with_params.py :language: python :caption: run_track_with_params.py ``` Run the script. ```python !python scripts/run_track_with_params.py --input-dir ./mydataset --learning-rate 0.01 --downsample ``` Query for all runs that match certain parameters: ```python ln.Run.filter( params__learning_rate=0.01, params__preprocess_params__downsample=True, ).to_dataframe() ``` Describe & get parameters: ```python run = ln.Run.filter(params__learning_rate=0.01).order_by("-started_at").first() run.describe() run.params ``` You can also access the CLI arguments used to start the run directly: ```python run.cli_args ``` You can also track run features in analogy to artifact features. In contrast to params, features are validated against the `Feature` registry and allow you to express relationships with entities in your registries. Let's first define labels & features. ```python experiment_type = ln.Record(name="Experiment", is_type=True).save() experiment_label = ln.Record(name="Experiment1", type=experiment_type).save() ln.Feature(name="s3_folder", dtype=str).save() ln.Feature(name="experiment", dtype=experiment_type).save() ``` ```python !python scripts/run_track_with_features_and_params.py --s3-folder s3://my-bucket/my-folder --experiment Experiment1 ``` ```python ln.Run.filter(s3_folder="s3://my-bucket/my-folder").to_dataframe() ``` Describe & get feature values. ```python run2 = ln.Run.filter( s3_folder="s3://my-bucket/my-folder", experiment="Experiment1" ).last() run2.describe() run2.features.get_values() ``` ## Manage functions in scripts and notebooks If you want more fine-grained data lineage tracking in a script or notebook where you called `ln.track()`, you can also use the `step()` decorator. 
### In a notebook ```python @ln.step() def subset_dataframe( input_artifact_key: str, output_artifact_key: str, subset_rows: int = 2, subset_cols: int = 2, ) -> None: artifact = ln.Artifact.get(key=input_artifact_key) dataset = artifact.load() new_data = dataset.iloc[:subset_rows, :subset_cols] ln.Artifact.from_dataframe(new_data, key=output_artifact_key).save() ``` Prepare a test dataset: ```python df = ln.examples.datasets.mini_immuno.get_dataset1(otype="DataFrame") input_artifact_key = "my_analysis/dataset.parquet" artifact = ln.Artifact.from_dataframe(df, key=input_artifact_key).save() ``` Run the function with default params: ```python output_artifact_key = input_artifact_key.replace(".parquet", "_subsetted.parquet") subset_dataframe(input_artifact_key, output_artifact_key, subset_rows=1) ``` Query for the output: ```python subsetted_artifact = ln.Artifact.get(key=output_artifact_key) subsetted_artifact.view_lineage() ``` Re-run the function with a different parameter: ```python subsetted_artifact = subset_dataframe( input_artifact_key, output_artifact_key, subset_cols=3 ) subsetted_artifact = ln.Artifact.get(key=output_artifact_key) subsetted_artifact.view_lineage() ``` We created a new run: ```python subsetted_artifact.run ``` With new parameters: ```python subsetted_artifact.run.params ``` And a new version of the output artifact: ```python subsetted_artifact.run.output_artifacts.to_dataframe() ``` ### In a script ```{eval-rst} .. literalinclude:: scripts/run_script_with_step.py :language: python :caption: run_script_with_step.py ``` ```python !python scripts/run_script_with_step.py --subset ``` ```python ln.view() ``` ## The database See the state of the database after we ran these different examples: ```python ln.view() ``` ## Using transform versions as templates A transform acts like a template when you use `lamin load` to load it. Say you run: ```bash lamin load https://lamin.ai/account/instance/transform/Akd7gx7Y9oVO0000 ``` Upon running the returned notebook or script, you'll automatically create a new version and be able to browse it via the version dropdown on the UI. Additionally, you can: - label using `ULabel` or `Record`, e.g., `transform.records.add(template_label)` - tag with an indicative `version` string, e.g., `transform.version = "T1"; transform.save()` :::{dropdown} Saving a notebook as an artifact Sometimes you might want to save a notebook as an artifact. This is how you can do it: ```bash lamin save template1.ipynb --key templates/template1.ipynb --description "Template for analysis type 1" --registry artifact ``` ::: A few checks at the end of this notebook: ```python assert run.params == { "input_dir": "./mydataset", "learning_rate": 0.01, "preprocess_params": {"downsample": True, "normalization": "the_good_one"}, }, run.params assert my_project.artifacts.exists() assert my_project.transforms.exists() assert my_project.runs.exists() ``` ================================================ FILE: lamindb/__init__.py ================================================ """A data framework for biology. Installation:: pip install lamindb If you just want to *read* data from a LaminDB instance, use :class:`~lamindb.DB`:: import lamindb as ln db = ln.DB("laminlabs/cellxgene") To *write* data, connect to a writable instance:: lamin login lamin connect account/name You can create an instance at `lamin.ai <https://lamin.ai>`__ and invite collaborators. 
If you prefer to work with a local database (no login required), run:: lamin init --storage ./quickstart-data --modules bionty LaminDB will then auto-connect upon import and you can then create & save objects like this:: import lamindb as ln # → connected lamindb: account/instance ln.Artifact("./my_dataset.parquet", key="datasets/my_dataset.parquet").save() Lineage ======= Track inputs, outputs, parameters, and environments of notebooks, scripts, and functions. .. autosummary:: :toctree: . track finish flow step Artifacts ========= The central `Artifact` registry holds files, folders & arrays across any number of storage locations. .. autosummary:: :toctree: . Artifact All other registries link to `Artifact` to provide context for finding, querying, validating, and managing artifacts. Here is an overview of the core data model: .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/HMfWLa1rFkxcxQEN0000.svg :width: 800px Transforms & runs ================= Data transformations and their executions. .. autosummary:: :toctree: . Transform Run Records, labels, features & schemas =================================== Create labels and manage flexible records, e.g., for samples or donors. .. autosummary:: :toctree: . Record ULabel Define features & schemas to validate artifacts & records. .. autosummary:: :toctree: . Feature Schema Managing operations =================== .. autosummary:: :toctree: . Project Storage User Branch Space Collection Reference Basic utilities =============== Connecting, viewing database content, accessing settings & run context. .. autosummary:: :toctree: . DB connect view save UPath settings context Curators and integrations ========================= .. autosummary:: :toctree: . curators integrations Examples, errors & setup ======================== .. autosummary:: :toctree: . examples errors setup Developer API ============= .. autosummary:: :toctree: . base core models """ # ruff: noqa: I001 # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc. __version__ = "2.4.2" import warnings as _warnings # through SpatialData _warnings.filterwarnings( "ignore", message="The legacy Dask DataFrame implementation is deprecated" ) from lamindb_setup._check_setup import _check_instance_setup from lamindb_setup._connect_instance import connect from lamindb_setup.core.upath import UPath from . import base, errors, setup _check_instance_setup(from_module="lamindb") from .core._functions import flow, step, tracked from ._view import view from .core._context import context from .core._settings import settings from .models import ( Artifact, Collection, Feature, Project, Reference, Run, Schema, Storage, Transform, ULabel, User, Space, Branch, Record, DB, ) from .models.save import save from . import core from . import integrations from . import curators from . 
import examples track = context._track finish = context._finish settings.__doc__ = """Global live settings (:class:`~lamindb.core.Settings`).""" context.__doc__ = """Global run context (:class:`~lamindb.core.Context`).""" from django.db.models import Q Param = Feature # backward compat __all__ = [ # data lineage "track", "finish", "step", "flow", # registries "Artifact", "Storage", "Transform", "Run", "Feature", "ULabel", "Schema", "Record", "User", "Collection", "Project", "Space", "Branch", "Reference", # other "connect", "view", "save", "UPath", "settings", "context", "DB", # curators and integrations "curators", "integrations", # examples, errors, setup "examples", "errors", "setup", # low-level functionality "base", "core", "models", ] ================================================ FILE: lamindb/_finish.py ================================================ from __future__ import annotations import builtins import re from datetime import datetime, timezone from time import sleep from typing import TYPE_CHECKING import lamindb_setup as ln_setup from lamin_utils import logger from lamin_utils._logger import LEVEL_TO_COLORS, LEVEL_TO_ICONS, RESET_COLOR from lamindb_setup.core.hashing import hash_dir, hash_file from lamindb.models import Artifact, Run, Transform is_run_from_ipython = getattr(builtins, "__IPYTHON__", False) if TYPE_CHECKING: from pathlib import Path def get_save_notebook_message() -> str: # do not add bold() or any other complicated characters as then we can't match this # easily anymore in an html to strip it out return f"please hit {get_shortcut()} to save the notebook in your editor" def get_save_notebook_message_retry() -> str: return f"{get_save_notebook_message()} and re-run finish()" # this code was originally in nbproject by the same authors def check_consecutiveness( nb, calling_statement: str = None, silent_success: bool = True ) -> bool: """Check whether code cells have been executed consecutively. Needs to be called in the last code cell of a notebook. Otherwise raises `RuntimeError`. Returns cell transitions that violate execution at increments of 1 as a list of tuples. Args: nb: Notebook content. calling_statement: The statement that calls this function. 
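        silent_success: If `True` (the default), do not log a success message when all code cells were executed consecutively; warnings about violations are logged either way.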
""" cells = nb.cells violations = [] prev = 0 ccount = 0 # need to initialize because notebook might note have code cells # and below, we check if ccount is None for cell in cells: cell_source = "".join(cell["source"]) if cell["cell_type"] != "code" or cell_source == "": continue if calling_statement is not None and calling_statement in cell_source: continue ccount = cell["execution_count"] if ccount is None or prev is None or ccount - prev != 1: violations.append((prev, ccount)) prev = ccount # ignore the very last code cell of the notebook # `check_consecutiveness` is being run during publish if `last_cell`` is True # hence, that cell has ccount is None if ccount is None: violations.pop() any_violations = len(violations) > 0 if any_violations: logger.warning(f"cells {violations} were not run consecutively") elif not silent_success: logger.success("cell execution numbers increase consecutively") return not any_violations def get_shortcut() -> str: import platform return "CMD + s" if platform.system() == "Darwin" else "CTRL + s" def get_seconds_since_modified(filepath) -> float: return datetime.now().timestamp() - filepath.stat().st_mtime def save_run_logs(run: Run, save_run: bool = False) -> None: logs_path = ln_setup.settings.cache_dir / f"run_logs_{run.uid}.txt" if logs_path.exists(): if run.report is not None: logger.important("overwriting run.report") artifact = Artifact( # type: ignore logs_path, description=f"log streams of run {run.uid}", kind="__lamindb_run__", run=False, ) artifact.save(upload=True, print_progress=False) run.report = artifact if save_run: # defaults to false because is slow run.save() # this is from the get_title function in nbproject # should be moved into lamindb sooner or later def prepare_notebook( nb, strip_title: bool = False, ) -> str | None: title_found = False for cell in nb.cells: cell.metadata.clear() # strip cell metadata if not title_found and cell["cell_type"] == "markdown": lines = cell["source"].split("\n") for i, line in enumerate(lines): if line.startswith("# "): line.lstrip("#").strip(" .").strip() title_found = True if strip_title: lines.pop(i) cell["source"] = "\n".join(lines) # strip logging message about saving notebook in editor # this is normally the last cell if cell["cell_type"] == "code" and ".finish(" in cell["source"]: for output in cell["outputs"]: if "to save the notebook in your editor" in output.get("text", ""): cell["outputs"] = [] break return None def notebook_to_report(notebook_path: Path, output_path: Path) -> None: import nbformat import traitlets.config as config from nbconvert import HTMLExporter with open(notebook_path, encoding="utf-8") as f: notebook = nbformat.read(f, as_version=4) prepare_notebook(notebook, strip_title=True) notebook.metadata.clear() # strip notebook metadata # if we were to export as ipynb, the following two lines would do it # with open(output_path, "w", encoding="utf-8") as f: # nbformat.write(notebook, f) # instead we need all this code c = config.Config() c.HTMLExporter.preprocessors = [] c.HTMLExporter.exclude_input_prompt = True c.HTMLExporter.exclude_output_prompt = True c.HTMLExporter.anchor_link_text = " " html_exporter = HTMLExporter(config=c) html, _ = html_exporter.from_notebook_node(notebook) output_path.write_text(html, encoding="utf-8") def notebook_to_script( # type: ignore title: str, notebook_path: Path, script_path: Path | None = None ) -> None | str: import jupytext notebook = jupytext.read(notebook_path) notebook.metadata.clear() py_content = jupytext.writes(notebook, 
fmt="py:percent") # remove global metadata header py_content = re.sub(r"^# ---\n.*?# ---\n\n", "", py_content, flags=re.DOTALL) # replace title py_content = py_content.replace(f"# # {title}", "#") if script_path is None: return py_content else: script_path.write_text(py_content, encoding="utf-8") def clean_r_notebook_html(file_path: Path) -> tuple[str | None, Path]: import re cleaned_content = file_path.read_text() # remove title from content pattern_title = r"(.*?)" title_match = re.search(pattern_title, cleaned_content) title_text = None if title_match: title_text = title_match.group(1) pattern_h1 = f"]*>{re.escape(title_text)}" cleaned_content = re.sub(pattern_title, "", cleaned_content) cleaned_content = re.sub(pattern_h1, "", cleaned_content) # remove error message from content if "to save the notebook in your editor" in cleaned_content: orig_error_message = f"! {get_save_notebook_message_retry()}" # coming up with the regex for this is a bit tricky due to all the # escape characters we'd need to insert into the message; hence, # we do this with a replace() instead cleaned_content = cleaned_content.replace(orig_error_message, "") if "to save the notebook in your editor" in cleaned_content: orig_error_message = orig_error_message.replace( " finish()", "\nfinish()" ) # RStudio might insert a newline cleaned_content = cleaned_content.replace(orig_error_message, "") cleaned_path = file_path.parent / (f"{file_path.stem}.cleaned{file_path.suffix}") cleaned_path.write_text(cleaned_content, encoding="utf-8") return title_text, cleaned_path def check_filepath_recently_saved(filepath: Path, is_finish_retry: bool) -> bool: # the recently_saved_time needs to be very low for the first check # because an accidental save (e.g. via auto-save) might otherwise lead # to upload of an outdated notebook # also see implementation for R notebooks below offset_saved_time = 0.3 if not is_finish_retry else 20 for retry in range(30): recently_saved_time = offset_saved_time + retry # sleep time is 1 sec if get_seconds_since_modified(filepath) > recently_saved_time: if retry == 0: prefix = f"{LEVEL_TO_COLORS[20]}{LEVEL_TO_ICONS[20]}{RESET_COLOR}" print(f"{prefix} {get_save_notebook_message()}", end=" ") elif retry == 9: print(".", end="\n") elif retry == 4: print(". 
still waiting ", end="") else: print(".", end="") sleep(1) else: if retry > 0: prefix = f"{LEVEL_TO_COLORS[25]}{LEVEL_TO_ICONS[25]}{RESET_COLOR}" print(f" {prefix}") # filepath was recently saved, return True return True # if we arrive here, no save event occured, return False return False def save_context_core( *, run: Run | None, transform: Transform, filepath: Path, finished_at: bool = False, skip_save_report: bool = False, ignore_non_consecutive: bool | None = None, from_cli: bool = False, is_retry: bool = False, notebook_runner: str | None = None, message_prefix: str = "go to", ) -> str | None: import lamindb as ln from lamindb.models import ( format_field_value, # needs to come after lamindb was imported because of CLI use ) ln.settings.verbosity = "success" # for scripts, things are easy is_consecutive = True is_ipynb = filepath.suffix == ".ipynb" is_r_notebook = filepath.suffix in {".qmd", ".Rmd"} source_code_path = filepath report_path: Path | None = None save_source_code_and_report = filepath.exists() if ( is_run_from_ipython and notebook_runner != "nbconvert" and filepath.exists() ): # python notebooks in interactive session if is_ipynb: # ignore this for py:percent notebooks import nbproject # it might be that the user modifies the title just before ln.finish() if (nbproject_title := nbproject.meta.live.title) != transform.description: transform.description = nbproject_title transform.save() if not ln_setup._TESTING: save_source_code_and_report = check_filepath_recently_saved( filepath, is_retry ) if not save_source_code_and_report and not is_retry: logger.warning(get_save_notebook_message_retry()) return "retry" elif not save_source_code_and_report: logger.warning( "the notebook on disk wasn't saved within the last 10 sec" ) if is_ipynb and filepath.exists(): # could be from CLI outside interactive session try: import jupytext # noqa: F401 from nbproject.dev import ( read_notebook, ) except ImportError: logger.error("install nbproject & jupytext: pip install nbproject jupytext") return None notebook_content = read_notebook(filepath) # type: ignore if not ignore_non_consecutive: # ignore_non_consecutive is None or False is_consecutive = check_consecutiveness( notebook_content, calling_statement=".finish(" ) if not is_consecutive: response = "n" # ignore_non_consecutive == False if ignore_non_consecutive is None: # only print warning response = "y" # we already printed the warning else: # ask user to confirm response = input( " Do you still want to proceed with finishing? 
(y/n) " ) if response != "y": return "aborted-non-consecutive" # write the report report_path = ln_setup.settings.cache_dir / filepath.name.replace( ".ipynb", ".html" ) notebook_to_report(filepath, report_path) # write the source code source_code_path = ln_setup.settings.cache_dir / filepath.name.replace( ".ipynb", ".py" ) notebook_to_script(transform.description, filepath, source_code_path) elif is_ipynb and not filepath.exists(): logger.warning("notebook file does not exist in compute environment") elif is_r_notebook: if filepath.with_suffix(".nb.html").exists(): report_path = filepath.with_suffix(".nb.html") elif filepath.with_suffix(".html").exists(): report_path = filepath.with_suffix(".html") else: logger.warning( f"no html report found; to attach one, create an .html export for your {filepath.suffix} file and then run: lamin save {filepath}" ) if report_path is not None and is_r_notebook and not from_cli: # R notebooks # see comment above in check_filepath_recently_saved recently_saved_time = 0.3 if not is_retry else 20 if get_seconds_since_modified(report_path) > recently_saved_time: # the automated retry solution of Jupyter notebooks does not work in RStudio because the execution of the notebook cell # seems to block the event loop of the frontend if not is_retry: logger.warning(get_save_notebook_message_retry()) return "retry" else: logger.warning( "the notebook on disk hasn't been saved within the last 20 sec" ) save_source_code_and_report = False ln.settings.creation.artifact_silence_missing_run_warning = True # save source code if save_source_code_and_report: return_code = transform._update_source_code_from_path(source_code_path) if return_code == "rerun-the-notebook": return "rerun-the-notebook" if run is not None: base_path = ln_setup.settings.cache_dir / "environments" / f"run_{run.uid}" paths = [base_path / "run_env_pip.txt", base_path / "r_environment.txt"] existing_paths = [path for path in paths if path.exists()] if len(existing_paths) == 2: # let's not store the python environment for an R session for now existing_paths = [base_path / "r_environment.txt"] if existing_paths: overwrite_env = True if run.environment_id is not None and from_cli: logger.important("run.environment is already saved, ignoring") overwrite_env = False if overwrite_env: # Use directory if multiple files exist, otherwise use the single file artifact_path: Path = ( base_path if len(existing_paths) > 1 else existing_paths[0] ) # Set description based on what we're saving if len(existing_paths) == 1: if existing_paths[0].name == "run_env_pip.txt": description = "requirements.txt" elif existing_paths[0].name == "r_environment.txt": description = "r_environment.txt" size, env_hash, _ = hash_file(artifact_path) else: description = "environments" size, env_hash, _, _ = hash_dir(artifact_path) artifact = ( ln.Artifact.objects.filter(hash=env_hash) .exclude( size=0 ) # exclude empty files, which may occur for one reason or another .one_or_none() ) new_env_artifact = artifact is None if new_env_artifact: if size > 0: artifact = ln.Artifact( artifact_path, description=description, kind="__lamindb_run__", run=False, ) artifact.save(upload=True, print_progress=False) else: logger.warning( "environment file is empty, skipping linking an environment" ) run.environment = artifact if new_env_artifact: logger.debug(f"saved run.environment: {run.environment}") # set finished_at if finished_at and run is not None: if not from_cli: update_finished_at = True else: update_finished_at = run.finished_at is None if 
update_finished_at: run.finished_at = datetime.now(timezone.utc) # track report and set is_consecutive if save_source_code_and_report and not skip_save_report: if run is not None: # do not save a run report if executing through nbconvert if report_path is not None and notebook_runner != "nbconvert": if is_r_notebook: title_text, report_path = clean_r_notebook_html(report_path) if title_text is not None: transform.description = title_text if run.report_id is not None: _, hash, _ = hash_file(report_path) # ignore hash_type for now if hash != run.report.hash: response = input( f"You are about to overwrite an existing report (hash '{run.report.hash}') for Run('{run.uid}'). Proceed? (y/n) " ) if response == "y": run.report.replace(report_path) run.report.save(upload=True, print_progress=False) else: logger.important("keeping old report") else: logger.important("report is already saved") else: report_file = ln.Artifact( # type: ignore report_path, description=f"Report of run {run.uid}", kind="__lamindb_run__", # hidden file run=False, ) report_file.save(upload=True, print_progress=False) run.report = report_file if is_r_notebook: # this is the "cleaned" report report_path.unlink() logger.debug( f"saved transform.latest_run.report: {transform.latest_run.report}" ) run._is_consecutive = is_consecutive if report_path is not None and notebook_runner == "nbconvert": logger.important(f"to save the notebook html, run: lamin save {filepath}") # save both run & transform records if we arrive here if run is not None: run.save() transform_id_prior_to_save = transform.id transform.save() # this in-place updates the state of transform upon hash collision if transform.id != transform_id_prior_to_save: # the hash existed and we're actually back to the previous version # hence, this was in fact a run of the previous transform rather than of # the new transform # this can happen in interactively executed notebooks with a pro-active version bump in case it turns out that the user didn't make a change to the notebook run.transform = transform run.save() ln.Transform.get(transform_id_prior_to_save).delete(permanent=True) # finalize if finished_at and not from_cli and run is not None: run_time = run.finished_at - run.started_at days = run_time.days seconds = run_time.seconds hours = seconds // 3600 minutes = (seconds % 3600) // 60 secs = seconds % 60 formatted_run_time = ( f"{days}d" if days != 0 else "" + f"{hours}h" if hours != 0 else "" + f"{minutes}m" if minutes != 0 else "" + f"{secs}s" ) logger.important( f"finished Run('{run.uid}') after {formatted_run_time} at {format_field_value(run.finished_at)}" ) if ln_setup.settings.instance.is_on_hub: instance_slug = ln_setup.settings.instance.slug if save_source_code_and_report: ui_url = ln_setup.settings.instance.ui_url logger.important( f"{message_prefix}: {ui_url}/{instance_slug}/transform/{transform.uid}" ) if finished_at and not from_cli and save_source_code_and_report: thing = "notebook" if (is_ipynb or is_r_notebook) else "script" logger.important( f"to update your {thing} from the CLI, run: lamin save {filepath}" ) if not save_source_code_and_report: logger.warning( f"did *not* save source code and report -- to do so, run: lamin save {filepath}" ) return None ================================================ FILE: lamindb/_secret_redaction.py ================================================ from __future__ import annotations import re REDACTED_SECRET_VALUE = "***REDACTED***" # noqa: S105 SENSITIVE_PARAM_KEY_PATTERN = re.compile( 
r"(^|[_\-.])(api[_-]?key|access[_-]?key|secret|token|password|passwd|private[_-]?key|client[_-]?secret)($|[_\-.])" ) # Match only quoted literals in assignments, e.g.: # - my_secret = "value" # - my.secret: "value" # - mySecret := "value" # We intentionally do not match unquoted RHS values to avoid false positives like # type annotations (`api_key: str`) or variable forwarding (`api_key=api_key`). _KEY_VALUE_ASSIGNMENT_PATTERN = re.compile( r"(?P(?P[A-Za-z_][A-Za-z0-9_.\-]*)\s*(?P:=|=|:)\s*)" r"(?P(?P['\"`])(?P.*?)(?P=quote))" ) # Match: os.environ["API_KEY"] = "value" _ENV_ASSIGNMENT_PATTERN = re.compile( r"(?Pos\.environ\[\s*(?P['\"])(?P[^'\"]+)(?P=kquote)\s*\]\s*=\s*)" r"(?P(?P['\"`])(?P.*?)(?P=quote))" ) # Match: {"client_secret": "value"} _QUOTED_KEY_ASSIGNMENT_PATTERN = re.compile( r"(?P(?P['\"])(?P[^'\"]+)(?P=kquote)\s*:\s*)" r"(?P(?P['\"`])(?P.*?)(?P=quote))" ) # We intentionally treat env lookups as safe/re-runnable references, not embedded secrets. # Examples that should remain unchanged: # - api_key = os.getenv("OPENAI_API_KEY") # - api_key = getenv("OPENAI_API_KEY") # - api_key = os.environ["OPENAI_API_KEY"] # - api_key = os.environ.get("OPENAI_API_KEY") _ENV_REFERENCE_VALUE_PATTERN = re.compile( r"^(os\.getenv\(.+\)|getenv\(.+\)|os\.environ\[[^\]]+\]|os\.environ\.get\(.+\))$" ) # Match PostgreSQL URLs that include inline credentials: # - postgresql://user:password@host:5432/dbname # - postgres://user:password@host/dbname?sslmode=require _POSTGRES_CREDENTIALS_URL_PATTERN = re.compile( r"^postgres(?:ql)?://[^:@/\s]+:[^@/\s]+@[^/\s]+(?:/[^\s]*)?$", re.IGNORECASE, ) def normalize_sensitive_key_name(key: str) -> str: normalized_key = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", key) normalized_key = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", normalized_key).lower() return normalized_key def is_sensitive_param_key(key: str) -> bool: return bool(SENSITIVE_PARAM_KEY_PATTERN.search(normalize_sensitive_key_name(key))) def is_sensitive_param_value(value: object) -> bool: if not isinstance(value, str): return False return bool(_POSTGRES_CREDENTIALS_URL_PATTERN.match(value.strip())) def _redact_assignment_match(match: re.Match[str]) -> str: key = match.group("key") quoted_value = match.group("quoted") if not is_sensitive_param_key(key) and not is_sensitive_param_value(quoted_value): return match.group(0) # Redact only hardcoded values, not environment-based references. # This preserves reproducibility for source code that reads secrets from env vars. 
raw_value = match.group("value") if _ENV_REFERENCE_VALUE_PATTERN.match(raw_value): return match.group(0) quote = match.group("quote") redacted_value = ( f"{quote}{REDACTED_SECRET_VALUE}{quote}" if quote is not None else REDACTED_SECRET_VALUE ) return f"{match.group('prefix')}{redacted_value}" def redact_secrets_in_source_code(source_code: str) -> tuple[str, int]: redaction_count = 0 def replace_with_count(match: re.Match[str]) -> str: nonlocal redaction_count replaced = _redact_assignment_match(match) if replaced != match.group(0): redaction_count += 1 return replaced redacted = _ENV_ASSIGNMENT_PATTERN.sub(replace_with_count, source_code) redacted = _KEY_VALUE_ASSIGNMENT_PATTERN.sub(replace_with_count, redacted) redacted = _QUOTED_KEY_ASSIGNMENT_PATTERN.sub(replace_with_count, redacted) return redacted, redaction_count ================================================ FILE: lamindb/_view.py ================================================ from __future__ import annotations import builtins import importlib import inspect from typing import TYPE_CHECKING from lamin_utils import colors, logger from lamindb_setup import settings from lamindb_setup._init_instance import get_schema_module_name from lamindb.models import Feature, JsonValue, SQLRecord from .models.feature import serialize_pandas_dtype if TYPE_CHECKING: import pandas as pd is_run_from_ipython = getattr(builtins, "__IPYTHON__", False) def display_df_with_descriptions( df: pd.DataFrame, descriptions: dict[str, str] | None = None ): from IPython.display import HTML, display if descriptions is None: display(df) return None # Start building HTML table html = '' # Create header with title and description rows html += "" # Column names row html += "" html += '' # Index header for col in df.columns: html += f'' html += "" # Descriptions row html += "" html += f'' # Index column for col in df.columns: desc = descriptions.get(col, "") html += f'' html += "" html += "" # Add body rows html += "" for idx, row in df.iterrows(): html += "" html += f'' # Index value for col in df.columns: html += f"" html += "" html += "" html += "
{col}
{df.index.name or ""}{desc}
{idx}{row[col]}
" # Add CSS styles styled_html = f""" {html} """ return display(HTML(styled_html)) def view( *, limit: int = 7, modules: str | None = None, registries: list[str] | None = None, df: pd.DataFrame | None = None, ) -> None: """View metadata. Args: limit: Display the latest `n` records modules: schema module to view. Default's to `None` and displays all registry modules. registries: List of SQLRecord names. Defaults to `None` and lists all registries. df: A DataFrame to display. """ if df is not None: descriptions = { col_name: serialize_pandas_dtype(dtype) for col_name, dtype in df.dtypes.to_dict().items() } feature_dtypes = dict(Feature.objects.values_list("name", "dtype")) descriptions.update(feature_dtypes) display_df_with_descriptions(df, descriptions) return None if is_run_from_ipython: from IPython.display import display as show else: show = logger.print if modules is not None: module_names = [modules] else: module_names = ["core"] + list(settings.instance.modules) for module_name in module_names: schema_module = importlib.import_module(get_schema_module_name(module_name)) # the below is necessary because a schema module might not have been # explicitly accessed importlib.reload(schema_module) all_registries = { registry for registry in schema_module.__dict__.values() if inspect.isclass(registry) and issubclass(registry, SQLRecord) and registry is not SQLRecord } if module_name == "core": all_registries.update({JsonValue}) if registries is not None: filtered_registries = { registry for registry in all_registries if registry.__name__ in registries } else: filtered_registries = all_registries if len(module_names) > 1: section = f"* module: {colors.green(colors.bold(module_name))} *" section_no_color = f"* module: {module_name} *" logger.print("*" * len(section_no_color)) logger.print(section) logger.print("*" * len(section_no_color)) for registry in sorted(filtered_registries, key=lambda x: x.__name__): df = registry.to_dataframe(limit=limit) if df.shape[0] > 0: logger.print(colors.blue(colors.bold(registry.__name__))) show(df) ================================================ FILE: lamindb/base/__init__.py ================================================ """Base library. Is available also when no instance is setup. Modules ------- .. autosummary:: :toctree: . uids types fields dtypes utils """ from . import dtypes, fields, types, uids, utils from .utils import deprecated, doc_args __all__ = ["dtypes", "fields", "types", "uids", "utils"] ================================================ FILE: lamindb/base/dtypes.py ================================================ """Dtype utils. .. autofunction:: check_dtype """ from datetime import datetime from typing import Any, Callable, Iterable import numpy as np def is_list_of_type(value: Any, expected_type: Any) -> bool: """Helper function to check if a value is either of expected_type or a list of that type, or a mix of both in a nested structure.""" if isinstance(value, Iterable) and not isinstance(value, (str, bytes)): # handle nested lists recursively return all(isinstance(item, expected_type) for item in value) return False def check_dtype(expected_type: Any, nullable: bool) -> Callable: """Creates a check function for Pandera that validates a column's dtype. Supports both standard dtype checking and mixed list/single values for the same type. For example, a column with expected_type 'float' would also accept a mix of float values and lists of floats. 
Args: expected_type: String identifier for the expected type ('int', 'float', 'num', 'str') Returns: A function that checks if a series has the expected dtype or contains mixed types """ import pandas as pd from lamindb.models.query_set import SQLRecordList def check_function(series): # empty series are considered valid if feature is nullable # the issue is that nullable in Pandera controls whether None/NaN values are allowed in the column, not whether the column can be empty (0 rows). # so "col": [1, 2, None, 4] is correctly handled by pandera nullable=True, but an empty column "col": [] is not. if nullable and series.isnull().all(): return True # first check if the series is entirely of the expected dtype (fast path) if expected_type == "int" and pd.api.types.is_integer_dtype(series.dtype): return True elif expected_type == "float" and pd.api.types.is_float_dtype(series.dtype): return True elif expected_type == "num" and pd.api.types.is_numeric_dtype(series.dtype): return True elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype): return True elif expected_type == "path" and pd.api.types.is_string_dtype(series.dtype): return True elif expected_type == "url" and pd.api.types.is_string_dtype(series.dtype): return True elif expected_type == "bool" and pd.api.types.is_bool_dtype(series.dtype): return True # if we're here, it might be a mixed column with object dtype # need to check each value individually if series.dtype == "object" and expected_type.startswith("list"): expected_type_member = expected_type.replace("list[", "").removesuffix("]") if expected_type_member == "int": return series.apply(lambda x: is_list_of_type(x, int)).all() elif expected_type_member == "float": return series.apply(lambda x: is_list_of_type(x, float)).all() elif expected_type_member == "bool": return series.apply(lambda x: is_list_of_type(x, bool)).all() elif expected_type_member == "num": # for numeric, accept either int or float return series.apply(lambda x: is_list_of_type(x, (int, float))).all() elif ( expected_type_member == "str" or expected_type_member == "path" or expected_type_member == "url" or expected_type_member.startswith("cat[") ): return series.apply(lambda x: is_list_of_type(x, str)).all() elif expected_type_member == "list": return series.apply( lambda x: isinstance(x, (list, np.ndarray, SQLRecordList)) ).all() # if we get here, the validation failed return False return check_function def is_valid_datetime_str(date_string: str) -> bool | str: try: dt = datetime.fromisoformat(date_string) return dt.isoformat() except ValueError: return False def is_iterable_of_sqlrecord(value: Any): from lamindb.models import SQLRecord return isinstance(value, Iterable) and isinstance(next(iter(value)), SQLRecord) ================================================ FILE: lamindb/base/fields.py ================================================ """Fields. Django fields with modified default arguments. .. autoclass:: CharField .. autoclass:: TextField .. autoclass:: ForeignKey .. autoclass:: BooleanField .. autoclass:: DateField .. autoclass:: DateTimeField .. autoclass:: BigIntegerField .. autoclass:: IntegerField .. autoclass:: OneToOneField .. autoclass:: FloatField .. autoclass:: DecimalField .. autoclass:: BinaryField .. autoclass:: JSONField .. autoclass:: EmailField .. autoclass:: TimeField .. autoclass:: SlugField .. autoclass:: URLField .. autoclass:: UUIDField .. autoclass:: PositiveIntegerField .. autoclass:: PositiveSmallIntegerField .. autoclass:: SmallIntegerField .. 
autoclass:: GenericIPAddressField .. autoclass:: DurationField """ from django.db import models class CharField(models.CharField): """Custom `CharField` with default values for `blank`, `default`, and `max_length`. Django default values for `CharField` are `blank=False`, `default=""`, undefined `max_length`. """ def __init__(self, max_length: int = 255, **kwargs): kwargs["max_length"] = max_length # Set max_length in kwargs kwargs.setdefault("blank", True) kwargs.setdefault("default", None) super().__init__(**kwargs) # Pass all arguments as kwargs class TextField(models.TextField): """Custom `TextField` with default values for `blank` and `default`. Django default values for `TextField` are `blank=False`, `default=''`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) kwargs.setdefault("default", None) super().__init__(*args, **kwargs) class ForeignKey(models.ForeignKey): """Custom `ForeignKey` with default values for `blank`. Django default value for `ForeignKey` `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) # fix doc string that otherwise errors ForeignKey.get_extra_descriptor_filter.__doc__ = ( ForeignKey.get_extra_descriptor_filter.__doc__.replace( ".filter(**kwargs)", "`.filter(**kwargs)`" ) ) class BooleanField(models.BooleanField): """Custom `BooleanField` with default values for `blank` and `default`. Django default values for `BooleanField` are `blank=False`, `default=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) kwargs.setdefault("default", None) super().__init__(*args, **kwargs) class DateField(models.DateField): """Custom `DateField` with default values for `blank`. Django default values for `DateField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class DateTimeField(models.DateTimeField): """Custom `DateTimeField` with default values for `blank`. Django default values for `DateTimeField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class BigIntegerField(models.BigIntegerField): """Custom `BigIntegerField` with default values for `blank`. Django default values for `BigIntegerField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) kwargs.setdefault("default", None) super().__init__(*args, **kwargs) class IntegerField(models.IntegerField): """Custom `IntegerField` with default values for `blank`. Django default values for `IntegerField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class OneToOneField(models.OneToOneField): """Custom `OneToOneField` with default values for `blank`. Django default values for `OneToOneField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class FloatField(models.FloatField): """Custom `FloatField` with default values for `blank`. Django default values for `FloatField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class DecimalField(models.DecimalField): """Custom `DecimalField` with default values for `blank`. Django default values for `DecimalField` are `blank=False`. 
""" def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class JSONField(models.JSONField): """Custom `JSONField` with default values for `blank`. Django default values for `JSONField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class DurationField(models.DurationField): """Custom `DurationField` with default values for `blank`. Django default values for `DurationField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class URLField(models.URLField): """Custom `URLField` with default values for `blank`. Django default values for `URLField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class EmailField(models.EmailField): """Custom `EmailField` with default values for `blank`. Django default values for `EmailField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class TimeField(models.TimeField): """Custom `TimeField` with default values for `blank`. Django default values for `TimeField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class SlugField(models.SlugField): """Custom `SlugField` with default values for `blank`. Django default values for `SlugField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class UUIDField(models.UUIDField): """Custom `UUIDField` with default values for `blank`. Django default values for `UUIDField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class PositiveIntegerField(models.PositiveIntegerField): """Custom `PositiveIntegerField` with default values for `blank`. Django default values for `PositiveIntegerField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class PositiveSmallIntegerField(models.PositiveSmallIntegerField): """Custom `PositiveSmallIntegerField` with default values for `blank`. Django default values for `PositiveSmallIntegerField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class SmallIntegerField(models.SmallIntegerField): """Custom `SmallIntegerField` with default values for `blank`. Django default values for `SmallIntegerField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class BinaryField(models.BinaryField): """Custom `BinaryField` with default values for `blank`. Django default values for `BinaryField` are `blank=False`. """ def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) class GenericIPAddressField(models.GenericIPAddressField): """Custom `GenericIPAddressField` with default values for `blank`. Django default values for `GenericIPAddressField` are `blank=False`. 
""" def __init__(self, *args, **kwargs): kwargs.setdefault("blank", True) super().__init__(*args, **kwargs) ================================================ FILE: lamindb/base/ids.py ================================================ from .uids import * # noqa: F403 ================================================ FILE: lamindb/base/types.py ================================================ """Base types. Central object types -------------------- .. autoclass:: ArtifactKind .. autoclass:: TransformKind .. autoclass:: BlockKind .. autoclass:: BranchStatus .. autoclass:: RunStatus .. autoclass:: DtypeStr Basic types ----------- .. autoclass:: AnyPathStr .. autoclass:: StrField .. autoclass:: ListLike .. autoclass:: FieldAttr """ from __future__ import annotations import datetime from typing import TYPE_CHECKING, Literal, Union import numpy as np from django.db.models.query_utils import DeferredAttribute as FieldAttr from lamindb_setup.types import AnyPathStr # noqa: F401 if TYPE_CHECKING: import pandas as pd # need to use Union because __future__.annotations doesn't do the job here <3.10 # typing.TypeAlias, >3.10 on but already deprecated # pd.Series as string to avoid importing pandas at runtime ListLike = Union[list[str], "pd.Series", np.ndarray] StrField = Union[str, FieldAttr] # typing.TypeAlias TransformKind = Literal["pipeline", "notebook", "script", "function"] TransformType = TransformKind # backward compat ArtifactKind = Literal[ "dataset", "model", "plan", "__lamindb_run__", "__lamindb_config__" ] BlockKind = Literal["readme", "comment"] """Block kind, a `README.md`-type page or comment. Any block expects Markdown as the formatting language. """ BranchStatus = Literal["standalone", "draft", "review", "merged", "closed"] """Branch status. ============= ===== ================================================== status code description ============= ===== ================================================== `closed` -2 Change Request was closed without merging. `merged` -1 The branch was merged into another branch. `standalone` 0 A standalone branch without Change Request. `draft` 1 Change Request exists but is not ready for review. `review` 2 Change Request is ready for review. ============= ===== ================================================== The database stores the branch status as an integer code in field `_status_code`. """ RunStatus = Literal[ "scheduled", "restarted", "started", "completed", "errored", "aborted" ] """Run status. =========== ===== =========================== status code description =========== ===== =========================== `scheduled` -3 The run is scheduled. `restarted` -2 The run was restarted. `started` -1 The run has started. `completed` 0 The run completed successfully. `errored` 1 The run ended with an error. `aborted` 2 The run was aborted. =========== ===== =========================== The database stores the run status as an integer code in field `_status_code`. 
""" RUN_STATUS_TO_CODE: dict[RunStatus, int] = { "scheduled": -3, "restarted": -2, "started": -1, "completed": 0, "errored": 1, "aborted": 2, } RUN_CODE_TO_STATUS: dict[int, RunStatus] = { code: status for status, code in RUN_STATUS_TO_CODE.items() } BRANCH_STATUS_TO_CODE: dict[BranchStatus, int] = { "closed": -2, "merged": -1, "standalone": 0, "draft": 1, "review": 2, } BRANCH_CODE_TO_STATUS: dict[int, BranchStatus] = { code: status for status, code in BRANCH_STATUS_TO_CODE.items() } DtypeObject = int | float | str | bool | datetime.date | datetime.datetime | dict DtypeStr = Literal[ "num", # numericals "int", # integer / numpy.integer "float", # float "str", # string "bool", # boolean "datetime", # datetime "date", # date "dict", # dictionary "path", # path, validated as str, but specially treated in the UI "url", # URL, validated as str, but specially treated in the UI "object", # this is a pandas input dtype, we're only using it for complicated types, not for strings; consciously currently not documented ] """String-serialized representations of common data types. ============ ============ ================================================= description lamindb pandas ============ ============ ================================================= numerical `"num"` `int | float` integer `"int"` `int64 | int32 | int16 | int8 | uint | ...` float `"float"` `float64 | float32 | float16 | float8 | ...` string `"str"` `object` boolean `"bool"` `boolean | bool` datetime `"datetime"` `datetime` date `"date"` `object` (pandera requires an ISO-format string, convert with `df["date"] = df["date"].dt.date`) dictionary `"dict"` `object` path `"path"` `str` (pandas does not have a dedicated path type, validated as `str`) url `"url"` `str` (pandas does not have a dedicated url type, validated as `str`) ============ ============ ================================================= .. admonition:: Categorical and relational data types These are **not** contained in the `DTypeStr` `Literal`. For any categorical, you can restrict the permissible values to the values defined in a registry. When serializing this to a string, then `'cat[ULabel]'` or `'cat[bionty.CellType]'` indicate that permissible values are stored in the `name` field of the `ULabel` or `CellType` registry, respectively. You can also restrict to sub-types defined in registries via the `type` field, e.g., `'cat[ULabel[123456ABCDEFG]]'` indicates that values must be of the type with `uid="123456ABCDEFG"` within the `ULabel` registry. In LaminDB, categoricals define relationships with registries. See :class:`~lamindb.Feature` for more details. """ Dtype = DtypeStr # backward compat RegistryId = Literal[ "__lamindb_artifact__", "__lamindb_block__", "__lamindb_collection__", "__lamindb_feature__", "__lamindb_jsonvalue__", "__lamindb_project__", "__lamindb_record__", "__lamindb_run__", "__lamindb_schema__", "__lamindb_storage__", "__lamindb_transform__", "__lamindb_ulabel__", ] ================================================ FILE: lamindb/base/uids.py ================================================ """Universal IDs. Base generators =============== .. autofunction:: base26 .. autofunction:: base62 .. autofunction:: base64 UID generators ================ .. autofunction:: base62_8 .. autofunction:: base62_12 .. autofunction:: base62_16 .. 
autofunction:: base62_20 Collision probabilities ======================= 8 base62 characters (`62**8=2e+14`): ======= =========== n p_collision ======= =========== 100k 2e-05 1M 2e-03 ======= =========== 12 base62 characters (`62**12=3e+21`): ======= =========== n p_collision ======= =========== 100M 2e-06 1B 2e-04 ======= =========== 16 base62 characters (`62**16=5e+28`): ======= =========== n p_collision ======= =========== 1e12 7e-05 1e13 7e-03 ======= =========== 20 base62 characters (`62**20=7e+35`) roughly matches UUID (`2**122=5e+36`): ======= =========== n p_collision ======= =========== 1e16 7e-05 1e17 7e-03 ======= =========== See `source `__. """ import secrets import string def base64(n_char: int) -> str: """Random Base64 string.""" alphabet = string.digits + string.ascii_letters.swapcase() + "_" + "-" uid = "".join(secrets.choice(alphabet) for i in range(n_char)) return uid def base62(n_char: int) -> str: """Random Base62 string.""" alphabet = string.digits + string.ascii_letters.swapcase() uid = "".join(secrets.choice(alphabet) for i in range(n_char)) return uid def base26(n_char: int): """ASCII lowercase.""" alphabet = string.ascii_lowercase uid = "".join(secrets.choice(alphabet) for i in range(n_char)) return uid def base62_4() -> str: return base62(4) def base62_8() -> str: """Random Base62 string of length 8.""" return base62(8) def base62_12() -> str: """Random Base62 string of length 12.""" return base62(12) def base62_16() -> str: """Random Base62 string of length 16.""" return base62(16) def base62_20() -> str: """Random Base62 string of length 20.""" return base62(20) def base62_24() -> str: """Random Base62 string of length 24.""" return base62(24) ================================================ FILE: lamindb/base/users.py ================================================ user_id_cache = {} def _user_has_write_access() -> bool: from django.db import connection with connection.cursor() as cursor: cursor.execute(""" SELECT EXISTS ( SELECT 1 FROM check_access() chk WHERE chk.role in ('write', 'admin') ) """) return cursor.fetchone()[0] def current_user_id() -> int: import lamindb_setup as ln_setup from lamindb_setup import settings from lamindb_setup._init_instance import register_user from lamindb.errors import NoWriteAccess from lamindb.models import User def query_user_id(): if ln_setup.core.django.IS_MIGRATING: return 1 else: user = settings.user user_uid = user.uid try: user_id = User.objects.get(uid=user_uid).id except User.DoesNotExist: register_user(user) try: user_id = User.objects.get(uid=user_uid).id except User.DoesNotExist as e: isettings = settings.instance if isettings.is_read_only_connection: raise NoWriteAccess( "Unable to register a new user in the instance database " "because you have a read-only connection." ) from e if ( isettings._db_permissions == "jwt" and not _user_has_write_access() ): raise NoWriteAccess( "Unable to register a new user in the instance database " "because you don't have write access to any space or registry." ) from e raise e return user_id if settings._instance_exists: slug = settings.instance.slug if slug not in user_id_cache: user_id_cache[slug] = query_user_id() return user_id_cache[slug] else: return query_user_id() ================================================ FILE: lamindb/base/utils.py ================================================ """Utilities. .. autodecorator:: doc_args .. autodecorator:: deprecated .. autodecorator:: class_and_instance_method .. 
autodecorator:: strict_classmethod """ from functools import wraps from types import MethodType from lamindb_setup.core import deprecated, doc_args class class_and_instance_method: """Decorator to define a method that works both as class and instance method.""" def __init__(self, func): self.func = func wraps(func)(self) def __get__(self, instance, owner): if instance is None: # Called on the class return MethodType(self.func, owner) else: # Called on an instance return MethodType(self.func, instance) class strict_classmethod: """Decorator for a classmethod that raises an error when called on an instance.""" def __init__(self, func): self.func = func wraps(func)(self) def __get__(self, instance, owner): if instance is not None: # Called on an instance - raise immediately raise TypeError( f"{owner.__name__}.{self.func.__name__}() is a class method and must be called on the {owner.__name__} class, not on a {owner.__name__} object" ) # Called on the class - return bound method using MethodType return MethodType(self.func, owner) __all__ = [ "doc_args", "deprecated", "class_and_instance_method", "strict_classmethod", ] ================================================ FILE: lamindb/core/__init__.py ================================================ """Core library. Settings & context: .. autosummary:: :toctree: . Settings subsettings Context Artifact loaders: .. autosummary:: :toctree: . loaders Data loaders: .. autosummary:: :toctree: . MappedCollection Modules: .. autosummary:: :toctree: . storage logger """ from lamin_utils import logger from lamin_utils._inspect import InspectResult from .. import errors as exceptions # backward compat from ..base import types # backward compat from ..examples import datasets # backward compat from . import subsettings from ._context import Context from ._settings import Settings def __getattr__(name: str): # need to lazy import a few auxliary modules to maintain backward compatibility # none of them should have been eagerly imported in the first place import importlib if name == "loaders": loaders = importlib.import_module(".loaders", package=__name__) globals()[name] = loaders return loaders if name == "storage": storage = importlib.import_module(".storage", package=__name__) globals()[name] = storage return storage if name == "MappedCollection": from ._mapped_collection import MappedCollection globals()[name] = MappedCollection return MappedCollection raise AttributeError(f"module {__name__!r} has no attribute {name!r}") ================================================ FILE: lamindb/core/_compat.py ================================================ import importlib.util from typing import Any, Callable, TypeVar T = TypeVar("T") def is_package_installed(package_name: str) -> bool: spec = importlib.util.find_spec(package_name) return spec is not None def with_package(package_name: str, operation: Callable[[Any], T]) -> T: """Execute an operation that requires a specific package. Args: package_name: Package name (e.g., "mudata") operation: Function that takes the imported module and returns a result Examples: # For direct package functions result = with_package("mudata", lambda mod: mod.read_zarr(path)) """ try: module = importlib.import_module(package_name) except ImportError: raise ImportError( f"Package '{package_name}' is required but not installed. 
" f"Please install with: pip install {package_name}" ) from None return operation(module) def with_package_obj( obj: Any, class_name: str, package_name: str, operation: Callable[[Any], T] ) -> tuple[bool, T | None]: """Handle operations on objects that require specific packages. Args: obj: The object to operate on class_name: Expected class name (e.g., "MuData") package_name: Package that provides the class (e.g., "mudata") operation: Function to call with the object if package is available. Examples: # For instance methods handled, res = apply_class_func(dmem, "MuData", "mudata", lambda obj: obj.write(filepath)) """ if obj.__class__.__name__ == class_name: try: importlib.import_module(package_name) except ImportError: raise ImportError( f"Object appears to be {class_name} but '{package_name}' package is not installed. " f"Please install with: pip install {package_name}" ) from None result = operation(obj) return True, result return False, None ================================================ FILE: lamindb/core/_context.py ================================================ from __future__ import annotations import builtins import hashlib import os import signal import sys import threading import traceback from datetime import datetime, timezone from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, TextIO import lamindb_setup as ln_setup from django.db.models import Func, IntegerField, Q from lamin_utils._logger import logger from lamindb_setup.core.hashing import hash_file, hash_string from .._secret_redaction import ( REDACTED_SECRET_VALUE, is_sensitive_param_key, is_sensitive_param_value, redact_secrets_in_source_code, ) from ..base.uids import base62_12 from ..errors import InvalidArgument, TrackNotCalled, UpdateContext from ..models import Run, SQLRecord, Transform, format_field_value from ..models._feature_manager import infer_convert_dtype_key_value from ..models._is_versioned import bump_version as bump_version_function from ..models._is_versioned import ( increment_base62, ) from ._settings import settings from ._sync_git import get_transform_reference_from_git_repo from ._track_environment import track_python_environment if TYPE_CHECKING: from types import FrameType, TracebackType from lamindb.base.types import TransformKind from lamindb.models import Artifact, Branch, Project, Space is_run_from_ipython = getattr(builtins, "__IPYTHON__", False) msg_path_failed = "failed to infer notebook path.\nfix: pass `path` to `ln.track()`" def get_key_from_module(caller_module: str) -> str: if "." in caller_module: key_from_module = f"pypackages/{caller_module.replace('.', '/')}.py" else: key_from_module = None return key_from_module def detect_and_process_source_code_file( *, path: str | Path | None, transform_kind: TransformKind | None = None, ) -> tuple[Path, TransformKind, str, str, str | None]: """Track source code file and determine transform metadata. For `.py` files, classified as "script". For `.Rmd` and `.qmd` files, classified as "notebook" because they typically come with an .html run report. Package vs script criterion: source code is part of a **package** if the caller's module name contains at least one `.` (module nesting goes beyond the filename). Otherwise it is a **script** (module nesting stops at the filename, e.g. `__main__`, `__mp_main__`, or a single top-level name). Args: path: Path to the source code file. If None, infers from call stack. Returns: Tuple of (path, transform_kind, reference, reference_type, key_from_module). 
- path: Path object to the source file - transform_kind: "script" or "notebook" - reference: Git reference URL if sync_git_repo is set, else None - reference_type: "url" if reference exists, else None - key_from_module: If caller is part of a package (`.` in __name__), `pypackages/module/path/to/file.py`; else None (key will be computed from dev_dir or path.name). Raises: NotImplementedError: If path cannot be determined from call stack. """ # for `.py` files, classified as "script" # for `.Rmd` and `.qmd` files, which we classify # as "notebook" because they typically come with an .html run report key_from_module: str | None = None if path is None: import inspect frame = inspect.stack()[2] path_str = frame[1] if not path_str or path_str.startswith("<"): raise NotImplementedError( "Cannot determine valid file path, pass manually via path (interactive sessions not yet supported)" ) path = Path(path_str) # package vs script: nesting beyond filename makes the file part of a python package caller_module = frame[0].f_globals.get("__name__", "__main__") key_from_module = get_key_from_module(caller_module) else: path = Path(path) # for Rmd and qmd, we could also extract the title # we don't do this for now as we're setting the title upon `ln.finish()` or `lamin save` # by extracting it from the html while cleaning it: see clean_r_notebook_html() # also see the script_to_notebook() in the CLI _load.py where the title is extracted # from the source code YAML and updated with the transform description # note that ipynb notebooks are handled in a separate function (_track_notebook()) if transform_kind is None: transform_kind = "notebook" if path.suffix in {".Rmd", ".qmd"} else "script" reference = None reference_type = None if settings.sync_git_repo is not None and path.suffix != ".ipynb": reference = get_transform_reference_from_git_repo(path) reference_type = "url" return path, transform_kind, reference, reference_type, key_from_module def get_uid_ext(version: str) -> str: from lamin_utils._base62 import encodebytes # merely zero-padding the nbproject version such that the base62 encoding is # at least 4 characters long doesn't yields sufficiently diverse hashes and # leads to collisions; it'd be nice because the uid_ext would be ordered return encodebytes(hashlib.md5(version.encode()).digest())[:4] # noqa: S324 def get_notebook_path() -> tuple[Path, str]: from nbproject.dev._jupyter_communicate import ( notebook_path as get_notebook_path, ) path = None try: path, env = get_notebook_path(return_env=True) except ValueError as ve: raise ve except Exception as error: raise RuntimeError(msg_path_failed) from error if path is None: raise RuntimeError(msg_path_failed) from None return Path(path), env # from https://stackoverflow.com/questions/61901628 def get_notebook_key_colab() -> str: from socket import gethostbyname, gethostname # type: ignore from requests import get # type: ignore ip = gethostbyname(gethostname()) # 172.28.0.12 try: key = get(f"http://{ip}:9000/api/sessions").json()[0]["name"] # noqa: S113 key = f"colab/{key}" except Exception: logger.warning( "could not get notebook key from Google Colab, using: colab/notebook.ipynb" ) key = "colab/notebook.ipynb" return key def get_cli_call() -> tuple[str, str] | None: """Returns (tool_name, args) when invoked as a script with CLI arguments. Returns None if not run as a script (e.g., in Jupyter, interactive shell) or when no arguments were passed. 
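    For example (illustrative invocation), running `python my_workflow.py --input data.csv`
    would return `("my_workflow.py", "--input data.csv")`.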
""" if len(sys.argv) > 1 and sys.argv[0] and not is_run_from_ipython: return Path(sys.argv[0]).name, " ".join(sys.argv[1:]) return None def pretty_pypackages(dependencies: dict) -> str: deps_list = [] for pkg, ver in dependencies.items(): if ver != "": deps_list.append(pkg + f"=={ver}") else: deps_list.append(pkg) deps_list.sort() return " ".join(deps_list) def last_non_empty_r_block(line: str) -> str: for block in reversed(line.split("\r")): if block: return block return "" class LogStreamHandler: def __init__(self, log_stream: TextIO, file: TextIO, use_buffer: bool): self.log_stream = log_stream self.file = file self._buffer = "" self._use_buffer = use_buffer def write(self, data: str) -> int: data_length = len(data) self.log_stream.write(data) if self.file.closed: return data_length if not self._use_buffer: self.file.write(data) self.file.flush() return data_length self._buffer += data # write only the last part of a line with carriage returns while "\n" in self._buffer: if self.file.closed: return data_length line, self._buffer = self._buffer.split("\n", 1) self.file.write(last_non_empty_r_block(line) + "\n") self.file.flush() return data_length def flush(self): self.log_stream.flush() if not self.file.closed: self.file.flush() # https://laminlabs.slack.com/archives/C07DB677JF6/p1759423901926139 # other tracking frameworks like W&B use our output stream and expect # certain functions like isatty to be available def isatty(self) -> bool: return False # .flush is sometimes (in jupyter etc.) called after every .write # this needs to be called only at the end def flush_buffer(self): if not self.file.closed and self._buffer: self.file.write(last_non_empty_r_block(self._buffer)) self._buffer = "" self.flush() class LogStreamTracker: def __init__(self): self.original_stdout = None self.original_stderr = None self.log_file = None self.is_cleaning_up = False self.original_excepthook: Callable[ [type[BaseException], BaseException, TracebackType | None], Any ] = sys.excepthook self.original_signal_handlers: dict[ signal.Signals, Callable[[int, FrameType | None], Any] | int ] = {} if threading.current_thread() == threading.main_thread(): self.original_signal_handlers[signal.SIGTERM] = signal.getsignal( signal.SIGTERM ) self.original_signal_handlers[signal.SIGINT] = signal.getsignal( signal.SIGINT ) def start(self, run: Run): self.original_stdout = sys.stdout self.original_stderr = sys.stderr self.run = run self.log_file_path = ( ln_setup.settings.cache_dir / f"run_logs_{self.run.uid}.txt" ) self.log_file = open(self.log_file_path, "w", encoding="utf-8") # the instance that's connected is important information self.log_file.write( f"\x1b[92m→\x1b[0m connected lamindb: {ln_setup.settings.instance.slug}\n" ) # use buffering for correct handling of carriage returns sys.stdout = LogStreamHandler( self.original_stdout, self.log_file, use_buffer=True ) # write evrything immediately in stderr sys.stderr = LogStreamHandler( self.original_stderr, self.log_file, use_buffer=False ) # handle signals # signal should be used only in the main thread, otherwise # ValueError: signal only works in main thread of the main interpreter if threading.current_thread() == threading.main_thread(): signal.signal(signal.SIGTERM, self.cleanup) signal.signal(signal.SIGINT, self.cleanup) # handle exceptions sys.excepthook = self.handle_exception # reset handler for lamin logger because sys.stdout has been replaced logger.set_handler() def finish(self): if self.original_stdout: getattr(sys.stdout, "flush_buffer", 
sys.stdout.flush)() sys.stderr.flush() sys.stdout = self.original_stdout sys.stderr = self.original_stderr if not self.log_file.closed: self.log_file.close() # reset handler for lamin logger because sys.stdout has been replaced logger.set_handler() def cleanup(self, signo=None, frame=None): try: from .._finish import save_run_logs if self.original_stdout and not self.is_cleaning_up: self.is_cleaning_up = True if signo is not None: if self.log_file.closed: self.log_file = open(self.log_file_path, "a", encoding="utf-8") getattr(sys.stdout, "flush_buffer", sys.stdout.flush)() sys.stderr.flush() signal_msg = f"\nProcess terminated by signal {signo} ({signal.Signals(signo).name})\n" if frame: signal_msg += ( f"Frame info:\n{''.join(traceback.format_stack(frame))}" ) self.log_file.write(signal_msg) self.log_file.flush() self.run._status_code = 2 # aborted else: self.run._status_code = 1 # errored self.run.finished_at = datetime.now(timezone.utc) sys.stdout = self.original_stdout sys.stderr = self.original_stderr if not self.log_file.closed: self.log_file.close() save_run_logs(self.run, save_run=True) # reset handler for lamin logger because sys.stdout has been replaced logger.set_handler() except: # noqa: E722, S110 pass finally: if signo is not None and signo in self.original_signal_handlers: original_handler = self.original_signal_handlers[signo] if callable(original_handler): original_handler(signo, frame) def handle_exception(self, exc_type, exc_value, exc_traceback): try: if self.original_stdout and not self.is_cleaning_up: if self.log_file.closed: self.log_file = open(self.log_file_path, "a", encoding="utf-8") getattr(sys.stdout, "flush_buffer", sys.stdout.flush)() sys.stderr.flush() error_msg = f"{''.join(traceback.format_exception(exc_type, exc_value, exc_traceback))}" self.log_file.write(error_msg) self.log_file.flush() self.cleanup() except: # noqa: E722, S110 pass finally: self.original_excepthook(exc_type, exc_value, exc_traceback) def serialize_params_to_json(params: dict) -> dict: serialized_params = {} for key, value in params.items(): # None and empty list are missing/empty values, skip them consistent with elsewhere in the code if value is None or (isinstance(value, list) and len(value) == 0): continue dtype, converted_value, _ = infer_convert_dtype_key_value(key, value, mute=True) # converted_value is not JSON if dtype is a SQLRecord or a list of SQLRecords # because we just the above function for features where we'd like to keep SQLRecords as they are # so, need to handle this here if ( dtype == "?" or dtype.startswith("cat") or dtype.startswith("list[cat") ) and dtype not in {"cat ? str", "list[cat ? str]"}: if isinstance(value, SQLRecord): serialized_params[key] = ( f"{value.__class__.__get_name_with_module__()}[{value.uid}]" ) elif dtype.startswith("list[cat"): items = list(value) if items and all(isinstance(item, SQLRecord) for item in items): serialized_params[key] = [ # type: ignore f"{item.__class__.__get_name_with_module__()}[{item.uid}]" for item in items ] else: serialized_params[key] = converted_value if key not in serialized_params: logger.warning( f"skipping param {key} with value {value} and dtype {dtype} not JSON serializable" ) continue if is_sensitive_param_key(key) or is_sensitive_param_value( serialized_params[key] ): serialized_params[key] = REDACTED_SECRET_VALUE return serialized_params class Context: """Run context. Is the book keeper for :func:`~lamindb.track` and :func:`~lamindb.finish`. 
""" def __init__(self, uid: str | None = None, path: Path | None = None): self._uid: str | None = uid self._path: Path | None = path self._description: str | None = None self._version: str | None = None self._transform: Transform | None = None self._run: Run | None = None self._project: Project | None = None self._space: Space | None = None self._branch: Branch | None = None self._logging_message_track: str = "" self._logging_message_imports: str = "" self._stream_tracker: LogStreamTracker = LogStreamTracker() self._is_finish_retry: bool = False self._notebook_runner: str | None = None self._is_step_decorator_run: bool = False @property def transform(self) -> Transform | None: """Managed transform of context.""" return self._transform @property def description(self) -> str | None: """`description` argument for `context.transform`.""" return self._description @description.setter def description(self, value: str | None): self._description = value @property def uid(self) -> str | None: """`uid` argument for `context.transform`.""" return self._uid @uid.setter def uid(self, value: str | None): self._uid = value @property def version(self) -> str | None: """`version` argument for `context.transform`.""" return self._version @version.setter def version(self, value: str | None): self._version = value @property def project(self) -> Project | None: """Project to label entities created during the run.""" return self._project @property def space(self) -> Space | None: """The space in which artifacts, collections, transforms, and runs are saved during the run.""" return self._space @property def branch(self) -> Branch | None: """The branch on which entities are created during the run.""" return self._branch @property def run(self) -> Run | None: """Managed run of context.""" return self._run def _track( self, transform: str | Transform | None = None, *, project: str | Project | None = None, space: str | Space | None = None, branch: str | Branch | None = None, plan: str | Artifact | None = None, features: dict | None = None, params: dict | None = None, new_run: bool | None = None, pypackages: bool | None = None, key: str | None = None, path: str | Path | None = None, source_code: str | None = None, kind: TransformKind | None = None, entrypoint: str | None = None, initiated_by_run: Run | str | None = None, stream_tracking: bool | None = None, ) -> None: """Track a run of a notebook or script. Populates the global run :class:`~lamindb.context` with :class:`~lamindb.Transform` & :class:`~lamindb.Run` objects and tracks the compute environment. Args: transform: A transform (stem) `uid` or object. If `None`, auto-creates a `transform` with its `uid`. project: A project or its `name` or `uid` for labeling entities created during the run. space: A restricted space or its `name` or `uid` in which to store entities created during the run. Default: the `"all"` space. Note that bionty entities ignore this setting and always get written to the `"all"` space. If you want to manually move entities to a different space, set the `.space` field (:doc:`docs:permissions`). branch: A branch (or its `name` or `uid`) on which to store records. plan: A plan, typically an agent plan. Pass an artifact (or its `key` or `uid`). features: A dictionary of features & values to track for the run. params: A dictionary of params & values to track for the run. new_run: If `False`, loads the latest run of transform (default notebook), if `True`, creates new run (default non-notebook). 
pypackages: If `True` or `None`, infers Python packages used in a notebook. key: Transform key. path: Filepath of a notebook or script. source_code: Source code. kind: Transform kind. entrypoint: Optional entrypoint name (e.g. function qualname) for the run. initiated_by_run: Optional parent run (or its `uid`) that triggered this run. If `None`, falls back to the `LAMIN_INITIATED_BY_RUN_UID` environment variable when set. stream_tracking: If set, override whether to capture stdout/stderr to run logs. Used by the flow/step decorator: flows get logs (`True`), steps do not (`False`). Examples: To track the run of a notebook or script: .. literalinclude:: scripts/run_track_and_finish.py :language: python To ensure one version history across file renames:: ln.track("Onv04I53OgtT") To track a project or an agent plan: pass a project/artifact to `ln.track()`, for example:: ln.track(project="My project", plan="./plans/curate-dataset-x.md") Note that you have to create a project or save the agent plan in case it they don't yet exist:: # create a project in Python ln.Project(name="My project").save() # create a project with the CLI lamin create project "My project" # save an agent plan with the CLI lamin save /path/to/.cursor/plans/curate-dataset-x.plan.md lamin save /path/to/.claude/plans/curate-dataset-x.md To sync code with a git repo, see: :ref:`sync-code-with-git`. To track parameters and features, see: :ref:`track-run-parameters`. To browse more examples, see: :doc:`/track`. """ from lamindb.models import Artifact, Branch, Project, Space from .._finish import ( save_context_core, ) # similar logic here: https://github.com/laminlabs/lamindb/pull/2527 if ln_setup.settings.instance.is_read_only_connection: logger.warning("skipping track(), connected in read-only mode") return None if project is None: project = os.environ.get("LAMIN_CURRENT_PROJECT") if project is not None: if isinstance(project, Project): assert project._state.adding is False, ( # noqa: S101 "Project must be saved before passing it to track()" ) project_record = project else: project_record = Project.filter( Q(name=project) | Q(uid=project) ).one_or_none() if project_record is None: raise InvalidArgument( f"Project '{project}' not found, either create it with `ln.Project(name='...').save()` or fix typos." ) self._project = project_record if space is not None: if isinstance(space, Space): assert space._state.adding is False, ( # noqa: S101 "Space must be saved before passing it to track()" ) space_record = space else: space_record = Space.filter(Q(name=space) | Q(uid=space)).one_or_none() if space_record is None: raise InvalidArgument( f"Space '{space}', please check on the hub UI whether you have the correct `uid` or `name`." ) self._space = space_record if branch is not None: if isinstance(branch, Branch): assert branch._state.adding is False, ( # noqa: S101 "Branch must be saved before passing it to track()" ) branch_record = branch else: branch_record = Branch.filter( Q(name=branch) | Q(uid=branch) ).one_or_none() if branch_record is None: raise InvalidArgument( f"Space '{branch}', please check on the hub UI whether you have the correct `uid` or `name`." 
) self._branch = branch_record plan_record: Artifact | None = None if plan is not None: if isinstance(plan, Artifact): assert plan._state.adding is False, ( # noqa: S101 "Plan artifact must be saved before passing it to track()" ) plan_record = plan else: plan_record = Artifact.filter(Q(key=plan) | Q(uid=plan)).one_or_none() if plan_record is None: raise InvalidArgument( f"Plan artifact '{plan}' not found, either create it or use a valid key/uid." ) if initiated_by_run is None: initiated_by_run = os.environ.get("LAMIN_INITIATED_BY_RUN_UID") initiated_by_run_record: Run | None = None if initiated_by_run is not None: if isinstance(initiated_by_run, Run): assert initiated_by_run._state.adding is False, ( # noqa: S101 "initiated_by_run must be saved before passing it to track()" ) initiated_by_run_record = initiated_by_run else: initiated_by_run_record = Run.filter(uid=initiated_by_run).one_or_none() if initiated_by_run_record is None: raise InvalidArgument( f"Run '{initiated_by_run}' not found, please pass a valid run uid." ) self._logging_message_track = "" self._logging_message_imports = "" self._is_step_decorator_run = ( entrypoint is not None and stream_tracking is False ) if transform is not None and isinstance(transform, str): self.uid = transform transform = None uid_was_none = False else: uid_was_none = True self._path = None cli_call = get_cli_call() if transform is None: description = None transform_ref = None transform_ref_type = None if source_code is not None: transform_kind = kind if kind is not None else "function" assert key is not None, ( "`key` cannot be `None` when `source_code` is passed to `track()`." ) assert path is None, ( "`path` cannot be passed when `source_code` is passed to `track()`." ) else: if is_run_from_ipython: self._path, description = self._track_notebook( path_str=path, pypackages=pypackages ) transform_kind = "notebook" else: ( self._path, transform_kind, transform_ref, transform_ref_type, key_from_module, ) = detect_and_process_source_code_file(path=path) if key is None and key_from_module is not None: key = key_from_module if description is None: description = self._description if description is None and cli_call is not None: description = f"CLI: {cli_call[0]}" self._create_or_load_transform( description=description, transform_ref=transform_ref, transform_ref_type=transform_ref_type, transform_kind=transform_kind, key=key, source_code=source_code, ) else: if transform.kind in {"notebook", "script"}: raise ValueError( "Use `ln.track()` without passing transform in a notebook or script" " - metadata is automatically parsed" ) transform_exists = None if transform.id is not None: # transform has an id but unclear whether already saved transform_exists = Transform.filter(id=transform.id).first() if transform_exists is None: transform.save() self._logging_message_track += ( f"created Transform('{transform.uid}', key='{transform.key}')" ) transform_exists = transform else: self._logging_message_track += ( f"loaded Transform('{transform.uid}', key='{transform.key}')" ) self._transform = transform_exists if new_run is None: # for notebooks, default to loading latest runs new_run = ( False if ( self._transform.kind == "notebook" and self._notebook_runner != "nbconvert" ) else True ) # type: ignore run = None if not new_run: # try loading latest run by same user run = ( Run.filter( transform=self._transform, created_by_id=ln_setup.settings.user.id ) .order_by("-created_at") .first() ) if run is not None: # loaded latest run run.started_at = 
datetime.now(timezone.utc) # update run time run._status_code = -2 # re-started if plan_record is not None: run.plan = plan_record run.save() entrypoint_str = ( f", entrypoint='{entrypoint}'" if entrypoint is not None else "" ) self._logging_message_track += f", re-started Run('{run.uid}'{entrypoint_str}) at {format_field_value(run.started_at)}" if run is None: # create new run run = Run(transform=self._transform, plan=plan_record) if entrypoint is not None: run.entrypoint = entrypoint if initiated_by_run_record is not None: run.initiated_by_run = initiated_by_run_record run.started_at = datetime.now(timezone.utc) run._status_code = -1 # started entrypoint_str = ( f", entrypoint='{entrypoint}'" if entrypoint is not None else "" ) self._logging_message_track += f", started new Run('{run.uid}'{entrypoint_str}) at {format_field_value(run.started_at)}" # can only determine at ln.finish() if run was consecutive in # interactive session, otherwise, is consecutive run.is_consecutive = True if is_run_from_ipython else None if params is not None: run.params = serialize_params_to_json(params) self._logging_message_track += "\n→ params: " + ", ".join( f"{key}={value!r}" for key, value in run.params.items() ) if cli_call is not None: _, cli_args = cli_call logger.important(f"script invoked with: {cli_args}") run.cli_args = cli_args run.save() # need to save now if features is not None: run.features.add_values(features) self._logging_message_track += "\n→ features: " + ", ".join( f"{key}={value!r}" for key, value in features.items() ) self._run = run track_python_environment(run) if self.project is not None: # to update a potential project link # is only necessary if transform is loaded rather than newly created # can be optimized by checking whether the transform is loaded, but it typically is self.transform.save() log_to_file = None if log_to_file is None: if stream_tracking is not None: log_to_file = stream_tracking else: # Script runs get stream tracking; decorator-based runs only when # stream_tracking is passed (flow=True from decorator). log_to_file = self.transform.kind == "script" if log_to_file: self._stream_tracker.start(run) logger.important(self._logging_message_track) if self._logging_message_imports: logger.important(self._logging_message_imports) if uid_was_none and self._path is not None: # Flow/step decorators set run.entrypoint. Show this recommendation only # for flows (`stream_tracking=True`) and suppress it for steps. if entrypoint is not None: if stream_tracking: logger.important_hint( f'recommendation: to identify the script across renames, pass the uid: @ln.flow(uid="{self.transform.uid[:-4]}")' ) else: notebook_or_script = ( "notebook" if self._transform.kind == "notebook" else "script" ) r_or_python = "." 
if self._path.suffix in {".py", ".ipynb"} else "$" project_str = ( f', project="{project if isinstance(project, str) else project.name}"' if project is not None else "" ) space_str = ( f', space="{space if isinstance(space, str) else space.name}"' if space is not None else "" ) plan_str = ( f', plan="{plan if isinstance(plan, str) else plan.key}"' if plan is not None else "" ) params_str = ( ", params={...}" if params is not None else "" ) # do not put the values because typically parameterized by user kwargs_str = f"{project_str}{space_str}{plan_str}{params_str}" logger.important_hint( f'recommendation: to identify the {notebook_or_script} across renames, pass the uid: ln{r_or_python}track("{self.transform.uid[:-4]}"{kwargs_str})' ) if ( self.transform.kind == "script" and self._path is not None and not self._is_step_decorator_run ): save_context_core( run=run, transform=self.transform, filepath=self._path, message_prefix="monitor at", ) def _track_notebook( self, *, path_str: str | Path | None, pypackages: bool | None = None, ) -> tuple[Path, str | None]: if path_str is None: path, self._notebook_runner = get_notebook_path() else: path = Path(path_str) if pypackages is None: pypackages = True description = None if path.suffix == ".ipynb" and path.stem.startswith("Untitled"): raise RuntimeError( "Your notebook file name is 'Untitled.ipynb', please rename it before tracking. You might have to re-start your notebook kernel." ) path_str = path.as_posix() if path_str.startswith("/fileId="): logger.warning("tracking on Google Colab is experimental") path_str = get_notebook_key_colab() path = Path(path_str) else: from nbproject.dev import read_notebook from nbproject.dev._meta_live import get_title from nbproject.dev._pypackage import infer_pypackages try: nb = read_notebook(path_str) nbproject_title = get_title(nb) if nbproject_title is not None: description = nbproject_title if pypackages: self._logging_message_imports += ( "notebook imports:" f" {pretty_pypackages(infer_pypackages(nb, pin_versions=True))}" ) except Exception: logger.debug("reading the notebook file failed") pass return path, description def _process_aux_transform( self, aux_transform: Transform, transform_hash: str, ) -> tuple[str, Transform | None, str]: # first part of the if condition: no version bump, second part: version bump message = "" if ( # if a user hasn't yet saved the transform source code AND is the same user ( aux_transform.source_code is None and aux_transform.created_by_id == ln_setup.settings.user.id ) # if the transform source code is unchanged # if aux_transform.kind == "notebook", we anticipate the user makes changes to the notebook source code # in an interactive session, hence we *pro-actively bump* the version number by setting `revises` / 'nbconvert' execution is NOT interactive # in the second part of the if condition even though the source code is unchanged at point of running track() or ( aux_transform.hash == transform_hash and ( aux_transform.kind != "notebook" or self._notebook_runner == "nbconvert" ) ) ): uid = aux_transform.uid return uid, aux_transform, message else: uid = f"{aux_transform.uid[:-4]}{increment_base62(aux_transform.uid[-4:])}" message = ( f"found {aux_transform.kind} {aux_transform.key}, making new version" ) if ( aux_transform.hash == transform_hash and aux_transform.kind == "notebook" ): message += " -- anticipating changes" elif aux_transform.hash != transform_hash: message += ( "" # could log "source code changed", but this seems too much ) elif aux_transform.created_by_id 
!= ln_setup.settings.user.id: message += ( f" -- {aux_transform.created_by.handle} already works on this draft" ) return uid, None, message def _create_or_load_transform( self, *, description: str | None = None, transform_ref: str | None = None, transform_ref_type: str | None = None, transform_kind: TransformKind = None, key: str | None = None, source_code: str | None = None, ): source_code_to_store = source_code if source_code is not None: source_code_to_store, redaction_count = redact_secrets_in_source_code( source_code ) if redaction_count > 0: logger.warning( f"redacted {redaction_count} secret-looking assignment(s) before persisting transform source code" ) transform_hash = hash_string(source_code) else: from .._finish import notebook_to_script if not self._path.suffix == ".ipynb": _, transform_hash, _ = hash_file(self._path) else: # need to convert to stripped py:percent format for hashing source_code_path = ( ln_setup.settings.cache_dir / self._path.name.replace(".ipynb", ".py") ) if ( self._path.exists() ): # notebook kernel might be running on a different machine notebook_to_script(description, self._path, source_code_path) _, transform_hash, _ = hash_file(source_code_path) else: logger.debug( "skipping notebook hash comparison, notebook kernel running on a different machine" ) transform_hash = None # see whether we find a transform with the exact same hash if transform_hash is not None: aux_transform = Transform.filter(hash=transform_hash).first() else: aux_transform = None # determine the transform key (only when path-based; key is required when source_code) if key is None: if ln_setup.settings.dev_dir is not None: try: key = self._path.relative_to(ln_setup.settings.dev_dir).as_posix() except ValueError as e: if "subpath" in str(e): logger.warning( f"Path {self._path} is not within the configured dev directory " f"({ln_setup.settings.dev_dir}), falling back to using filename as transform key " f"('{self._path.name}')." 
) key = self._path.name else: raise else: key = self._path.name # if the user did not pass a uid and there is no matching aux_transform # need to search for the transform based on the key if self.uid is None and aux_transform is None: class SlashCount(Func): template = "LENGTH(%(expressions)s) - LENGTH(REPLACE(%(expressions)s, '/', ''))" output_field = IntegerField() # we need to traverse from greater depth to shorter depth so that we match better matches first transforms = ( Transform.filter(key__endswith=key, is_latest=True) .annotate(slash_count=SlashCount("key")) .order_by("-slash_count") ) uid = f"{base62_12()}0000" target_transform = None if len(transforms) != 0: message = "" found_key = False if self._path is not None: for aux_transform in transforms: # check whether the transform key is in the path # that's not going to be the case for keys that have "/" in them and don't match the folder if aux_transform.key in self._path.as_posix(): key = aux_transform.key uid, target_transform, message = ( self._process_aux_transform( aux_transform, transform_hash ) ) found_key = True break if not found_key: plural_s = "s" if len(transforms) > 1 else "" transforms_str = "\n".join( [ f" {transform.uid} → {transform.key}" for transform in transforms ] ) message = f"ignoring transform{plural_s} with same filename in different folder:\n{transforms_str}" if message != "": logger.important(message) self.uid, transform = uid, target_transform # the user did pass the uid elif self.uid is not None and len(self.uid) == 16: transform = Transform.filter(uid=self.uid).one_or_none() else: if self.uid is not None: # the case with length 16 is covered above if not len(self.uid) == 12: raise InvalidArgument( f'Please pass an auto-generated uid instead of "{self.uid}". Resolve by running: ln.track("{base62_12()}")' ) aux_transform = ( Transform.filter(uid__startswith=self.uid) .order_by("-created_at") .first() ) else: # deal with a hash-based match # the user might have a made a copy of the notebook or script # and actually wants to create a new transform if aux_transform is not None and not aux_transform.key.endswith(key): prompt = f"Found transform with same hash but different key: {aux_transform.key}. Did you rename your {transform_kind} to {key} (1) or intentionally made a copy (2)?" 
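# a note on the prompt handling below: response "1" (rename) keeps the matched
# transform -- its key is updated to the new filename further down; response "2"
# (copy) discards the hash match so that a new transform with a fresh uid is
# created; under LAMIN_TESTING=true the prompt is skipped and "1" is assumed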
response = ( "1" if os.getenv("LAMIN_TESTING") == "true" else input(prompt) ) assert response in {"1", "2"}, ( # noqa: S101 f"Please respond with either 1 or 2, not {response}" ) if response == "2": aux_transform, transform_hash = ( None, None, ) # make a new transform if aux_transform is not None: uid, target_transform, message = self._process_aux_transform( aux_transform, transform_hash ) if message != "": logger.important(message) else: uid = f"{self.uid}0000" if self.uid is not None else None target_transform = None self.uid, transform = uid, target_transform if self.version is not None: # test inconsistent version passed if ( transform is not None and transform.version_tag is not None # type: ignore and self.version != transform.version_tag # type: ignore ): raise ValueError( f"Transform is already tagged with version {transform.version_tag}, but you passed {self.version}\n" # noqa: S608 f"If you want to update the transform version, set it outside ln.track(): transform.version_tag = '{self.version}'; transform.save()" ) # test whether version was already used for another member of the family if self.uid is not None and len(self.uid) == 16: suid, vuid = (self.uid[:-4], self.uid[-4:]) transform = Transform.filter( uid__startswith=suid, version_tag=self.version ).one_or_none() if transform is not None and vuid != transform.uid[-4:]: better_version = bump_version_function(self.version) raise SystemExit( f"✗ version '{self.version}' is already taken by Transform('{transform.uid}'); please set another version, e.g., ln.context.version = '{better_version}'" ) # make a new transform record if transform is None: assert key is not None # noqa: S101 transform = Transform( # type: ignore uid=self.uid, version_tag=self.version, description=description, key=key, reference=transform_ref, reference_type=transform_ref_type, kind=transform_kind, source_code=source_code_to_store, skip_hash_lookup=source_code is not None, ) if source_code is not None: transform.hash = transform_hash transform = transform.save() self._logging_message_track += ( f"created Transform('{transform.uid}', key='{transform.key}')" ) else: uid = transform.uid # transform was already saved via `finish()` transform_was_saved = transform.source_code is not None # check whether the transform.key is consistent if transform.key != key: self._logging_message_track += ( f"renaming transform {transform.key} to {key}" ) transform.key = key transform.save() elif transform.description != description and description is not None: transform.description = description transform.save() self._logging_message_track += ( "updated transform description, " # white space on purpose ) elif ( transform.created_by_id != ln_setup.settings.user.id and not transform_was_saved ): raise UpdateContext( f'{transform.created_by.name} ({transform.created_by.handle}) already works on this draft {transform.kind}.\n\nPlease create a revision via `ln.track("{uid[:-4]}{increment_base62(uid[-4:])}")` or a new transform with a *different* key and `ln.track("{base62_12()}0000")`.' 
) if transform.reference != transform_ref: transform.reference = transform_ref transform.reference_type = transform_ref_type transform.save() self._logging_message_track += ( "updated transform reference, " # white space on purpose ) # check whether transform source code was already saved if transform_was_saved: bump_revision = False if ( transform.kind == "notebook" and self._notebook_runner != "nbconvert" ): # we anticipate the user makes changes to the notebook source code # in an interactive session, hence we pro-actively bump the version number bump_revision = True else: if transform_hash != transform.hash: bump_revision = True else: self._logging_message_track += f"loaded Transform('{transform.uid}', key='{transform.key}')" if bump_revision: change_type = ( "re-running notebook with already-saved source code" if ( transform.kind == "notebook" and self._notebook_runner != "nbconvert" ) else "source code changed" ) raise UpdateContext( f'✗ {change_type}, please update the `uid` argument in `track()` to "{uid[:-4]}{increment_base62(uid[-4:])}"' ) else: self._logging_message_track += ( f"loaded Transform('{transform.uid}', key='{transform.key}')" ) self._transform = transform def _finish(self, ignore_non_consecutive: None | bool = None) -> None: """Finish the run of a notebook or script. - writes a timestamp: `run.finished_at` - saves the source code if it is not yet saved: `transform.source_code` - saves a run report: `run.report` When called in a notebook, will prompt to save the notebook in your editor. Args: ignore_non_consecutive: Whether to ignore if a notebook was non-consecutively executed. Examples: See :doc:`/track`. See Also: `lamin save script.py` or `lamin save notebook.ipynb` → `docs `__ """ from .._finish import save_context_core, save_run_logs if self.run is None: raise TrackNotCalled("Please run `ln.track()` before `ln.finish()`") if self._path is None: if self.run.transform.kind in {"script", "notebook"}: raise ValueError( "Transform type is not allowed to be 'script' or 'notebook' because `context._path` is `None`." ) self.run.finished_at = datetime.now(timezone.utc) self.run.save() # reset context so the next _track() starts clean (e.g. 
from decorator) self._uid = None self._run = None self._transform = None self._version = None self._description = None self._is_step_decorator_run = False return None self.run._status_code = 0 if self.transform.kind == "notebook": return_code = save_context_core( run=self.run, transform=self.run.transform, filepath=self._path, finished_at=True, ignore_non_consecutive=ignore_non_consecutive, is_retry=self._is_finish_retry, notebook_runner=self._notebook_runner, ) if return_code == "retry": self._is_finish_retry = True return None else: self.run.finished_at = datetime.now(timezone.utc) self.run.save() # persist finished_at (save_run_logs only saves when log file exists) if ln_setup.settings.instance.is_on_hub and not self._is_step_decorator_run: instance_slug = ln_setup.settings.instance.slug ui_url = ln_setup.settings.instance.ui_url logger.important( f"go to: {ui_url}/{instance_slug}/transform/{self.transform.uid}" ) save_run_logs(self.run, save_run=True) self._stream_tracker.finish() # reset the context attributes so that somebody who runs `track()` after finish # starts fresh self._uid = None self._run = None self._transform = None self._version = None self._description = None self._is_step_decorator_run = False context: Context = Context() ================================================ FILE: lamindb/core/_functions.py ================================================ import functools import inspect from contextvars import ContextVar from datetime import datetime, timezone from pathlib import Path from typing import Callable, Literal, ParamSpec, TypeVar from lamindb.base import deprecated from ..models import Run from ._context import Context, get_key_from_module from ._context import context as global_context P = ParamSpec("P") R = TypeVar("R") # Create a context variable to store the current tracked run current_tracked_run: ContextVar[Run | None] = ContextVar( "current_tracked_run", default=None ) def get_current_tracked_run() -> Run | None: """Get the run object.""" run = current_tracked_run.get() if run is None: run = global_context.run return run def _create_tracked_decorator( uid: str | None = None, is_flow: bool = True, global_run: Literal["memorize", "clear", "none"] = "none", track_arg_aliases: bool = False, ) -> Callable[[Callable[P, R]], Callable[P, R]]: """Internal helper to create tracked decorators. Args: uid: Persist the uid to identify this transform across renames. is_flow: Triggered through @ln.flow(), otherwise @ln.step(). 
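    A sketch of the intended call pattern via the public decorators defined
    below (function names are illustrative)::

        @step()    # is_flow=False; requires an active run context
        def my_step(subset: bool):
            ...

        @flow()    # is_flow=True; sets the global run context by default
        def my_workflow(subset: bool):
            return my_step(subset)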
""" def decorator_tracked(func: Callable[P, R]) -> Callable[P, R]: # Get the original signature sig = inspect.signature(func) @functools.wraps(func) def wrapper_tracked(*args: P.args, **kwargs: P.kwargs) -> R: if global_context.run is None: if not is_flow: raise RuntimeError( "Please track the global run context before using @ln.step(): ln.track() or @ln.flow()" ) else: if is_flow: raise RuntimeError( "Please use @ln.step() or clear the global run context before using @ln.flow(): no `ln.track()` or `@ln.flow(global_run='clear')`" ) bound_args = sig.bind(*args, **kwargs) bound_args.apply_defaults() params = dict(bound_args.arguments) initiated_by_run = get_current_tracked_run() track_kwargs: dict = {} if track_arg_aliases: for key in ("project", "space", "branch", "plan", "initiated_by_run"): if key in params and params[key] is not None: track_kwargs[key] = params[key] if "initiated_by_run" in track_kwargs: initiated_by_run = track_kwargs["initiated_by_run"] path_raw = inspect.getsourcefile(func) path = None # do not pass path when function is defined in an ipython cell if path_raw is not None and Path(path_raw).exists(): path = Path(path_raw) source_code = inspect.getsource(func) if path is None else None transform_kind: Literal["function", "script"] = ( "function" if path is None else "script" ) caller_module = func.__module__ key = get_key_from_module(caller_module) if ( key is None and path is None and caller_module in {"__main__", "__mp_main__"} ): key = f"{initiated_by_run.transform.key}" context = Context(uid=uid, path=path) context._track( uid, path=path, key=key, source_code=source_code, kind=transform_kind, entrypoint=func.__qualname__, params=params, new_run=True, project=track_kwargs.get("project"), space=track_kwargs.get("space"), branch=track_kwargs.get("branch"), plan=track_kwargs.get("plan"), initiated_by_run=initiated_by_run, stream_tracking=is_flow, ) token = current_tracked_run.set(context.run) if global_run in {"memorize", "clear"}: global_context._run = context.run try: result = func(*args, **kwargs) context._finish() return result except Exception as e: run = context.run run.finished_at = datetime.now(timezone.utc) run._status_code = 1 # errored run.save() raise e finally: if ( global_run == "clear" and global_context.run == current_tracked_run.get() ): global_context._run = None current_tracked_run.reset(token) return wrapper_tracked return decorator_tracked def flow( uid: str | None = None, global_run: Literal["memorize", "clear", "none"] = "clear", track_arg_aliases: bool = True, ) -> Callable[[Callable[P, R]], Callable[P, R]]: """Use `@flow()` to track a function as a workflow. You will be able to see inputs, outputs, and parameters of the function in the data lineage graph. The decorator creates a :class:`~lamindb.Transform` with kind `"script"` that maps onto the file in which the function is defined. The function maps onto an entrypoint of the `transform`. A function execution creates a :class:`~lamindb.Run` object that stores the function name in `run.entrypoint`. If the function is defined in a notebook cell or another ephemeral context, the transform is created with kind `"function"`. By default `@ln.flow()`, like `ln.track()`, creates a global run context that can be accessed with `ln.context.run`. Args: uid: Persist the uid to identify a transform across renames. global_run: If `"clear"`, set the global run context `ln.context.run` and clear after the function completes. 
If `"memorize"`, set the global run context and do not clear after the function completes. Set this to `"none"` if you want to track concurrent executions of a `flow()` in the same Python process. track_arg_aliases: If `True` (default), maps function arguments with names `project`, `space`, `branch`, `plan`, and `initiated_by_run` to matching `ln.track()` arguments while also keeping them in `run.params` for reproducibility. Pass `False` to disable this mapping. Examples: To sync a workflow with a file in a git repo, see: :ref:`sync-code-with-git`. For an extensive guide, see: :ref:`manage-workflows`. Here follow some examples. .. literalinclude:: scripts/my_workflow.py :language: python :caption: my_workflow.py .. literalinclude:: scripts/my_workflow_with_step.py :language: python :caption: my_workflow_with_step.py .. literalinclude:: scripts/my_workflow_with_click.py :language: python :caption: my_workflow_with_click.py """ return _create_tracked_decorator( uid=uid, is_flow=True, global_run=global_run, track_arg_aliases=track_arg_aliases, ) def step(uid: str | None = None) -> Callable[[Callable[P, R]], Callable[P, R]]: """Use `@step()` to track a function as a step. Behaves like :func:`~lamindb.flow()`, but acts as a step in a workflow and does not create a global run context. It errors if no initiating run (either global or local run context) exists. See :func:`~lamindb.flow()` for examples. Args: uid: Persist the uid to identify a transform across renames. """ return _create_tracked_decorator(uid=uid, is_flow=False) @deprecated("step") def tracked(uid: str | None = None) -> Callable[[Callable[P, R]], Callable[P, R]]: return step(uid) ================================================ FILE: lamindb/core/_mapped_collection.py ================================================ from __future__ import annotations from collections import Counter from functools import reduce from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd from lamin_utils import logger from lamindb_setup.core.upath import UPath from .storage._anndata_accessor import ( ArrayType, ArrayTypes, GroupType, GroupTypes, StorageType, _safer_read_index, get_spec, registry, ) if TYPE_CHECKING: from lamindb_setup.types import AnyPathStr class _Connect: def __init__(self, storage): if isinstance(storage, UPath): # force no external compression even for files with .gz extension. REMOVE LATER self.conn, self.store = registry.open("h5py", storage, compression=None) self.to_close = True else: self.conn, self.store = None, storage self.to_close = False def __enter__(self): return self.store def __exit__(self, exc_type, exc_val, exc_tb): self.close() def close(self): if not self.to_close: return if hasattr(self.store, "close"): self.store.close() if hasattr(self.conn, "close"): self.conn.close() _decode = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1) class MappedCollection: """Map-style collection for use in data loaders. This class virtually concatenates `AnnData` arrays as a `pytorch map-style dataset `__. If your `AnnData` collection is in the cloud, move them into a local cache first for faster access. `__getitem__` of the `MappedCollection` object takes a single integer index and returns a dictionary with the observation data sample for this index from the `AnnData` objects in `path_list`. The dictionary has keys for `layers_keys` (`.X` is in `"X"`), `obs_keys`, `obsm_keys` (under `f"obsm_{key}"`) and also `"_store_idx"` for the index of the `AnnData` object containing this observation sample. .. 
note:: For a guide, see :doc:`docs:scrna-mappedcollection`. For more convenient use within :class:`~lamindb.core.MappedCollection`, see :meth:`~lamindb.Collection.mapped`. This currently only works for collections of `AnnData` objects. The implementation was influenced by the `SCimilarity `__ data loader. Args: path_list: A list of paths to `AnnData` objects stored in `.h5ad` or `.zarr` formats. layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list retrieves ``.X``. ``"raw.X"`` retrieves ``.X`` from ``.raw`` slot. Keys not present in an object are omitted from the output for that object. obsm_keys: Keys from the ``.obsm`` slots. Keys not present in an object are omitted from the output for that object. obs_keys: Keys from the ``.obs`` slots. Keys not present in an object are omitted from the output for that object. obs_filter: Select only observations with these values for the given obs columns. Should be a dictionary with obs column names as keys and filtering values (a string or a list of strings) as values. join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed, does not join. The join is applied to ``layers_keys`` except for ``"raw.X"``. encode_labels: Encode labels into integers. Can be a list with elements from ``obs_keys``. unknown_label: Encode this label to -1. Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True`` or from ``encode_labels`` if it is a list. cache_categories: Enable caching categories of ``obs_keys`` for faster access. parallel: Enable sampling with multiple processes. dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm`` """ def __init__( self, path_list: list[AnyPathStr], layers_keys: str | list[str] | None = None, obs_keys: str | list[str] | None = None, obsm_keys: str | list[str] | None = None, obs_filter: dict[str, str | list[str]] | None = None, join: Literal["inner", "outer"] | None = "inner", encode_labels: bool | list[str] = True, unknown_label: str | dict[str, str] | None = None, cache_categories: bool = True, parallel: bool = False, dtype: str | None = None, ): if join not in {None, "inner", "outer"}: # pragma: nocover raise ValueError( f"join must be one of None, 'inner, or 'outer' but was {type(join)}" ) self.filtered = obs_filter is not None if self.filtered and not isinstance(obs_filter, dict): logger.warning( "Passing a tuple to `obs_filter` is deprecated, use a dictionary" ) obs_filter = {obs_filter[0]: obs_filter[1]} if layers_keys is None: self.layers_keys = ["X"] else: self.layers_keys = ( [layers_keys] if isinstance(layers_keys, str) else layers_keys ) obsm_keys = [obsm_keys] if isinstance(obsm_keys, str) else obsm_keys self.obsm_keys = obsm_keys obs_keys = [obs_keys] if isinstance(obs_keys, str) else obs_keys self.obs_keys = obs_keys if isinstance(encode_labels, list): if len(encode_labels) == 0: encode_labels = False elif obs_keys is None or not all( enc_label in obs_keys for enc_label in encode_labels ): raise ValueError( "All elements of `encode_labels` should be in `obs_keys`." ) else: if encode_labels: encode_labels = obs_keys if obs_keys is not None else False self.encode_labels = encode_labels if encode_labels and isinstance(unknown_label, dict): if not all(unkey in encode_labels for unkey in unknown_label): # type: ignore raise ValueError( "All keys of `unknown_label` should be in `encode_labels` and `obs_keys`." 
) self.unknown_label = unknown_label self.storages = [] # type: ignore self.conns = [] # type: ignore self.parallel = parallel self.path_list = path_list self._make_connections(path_list, parallel) self._cache_has_raw: list[bool] = [] self._cache_obsm_keys: list[set[str]] = [] self._cache_obs_keys: list[set[str]] = [] self._cache_layers_keys: list[set[str]] = [] self._cache_keys() self._cache_cats: dict = {} if self.obs_keys is not None: if cache_categories: self._cache_categories(self.obs_keys) self.encoders: dict = {} if self.encode_labels: self._make_encoders(self.encode_labels) # type: ignore self.n_obs_list = [] self.indices_list = [] for i, storage in enumerate(self.storages): with _Connect(storage) as store: X = store["X"] store_path = self.path_list[i] self._check_csc_raise_error(X, "X", store_path) if isinstance(X, ArrayTypes): # type: ignore n_obs_storage = X.shape[0] else: n_obs_storage = X.attrs["shape"][0] if self.filtered: indices_storage_mask = None for obs_filter_key, obs_filter_values in obs_filter.items(): if isinstance(obs_filter_values, tuple): obs_filter_values = list(obs_filter_values) elif not isinstance(obs_filter_values, list): obs_filter_values = [obs_filter_values] if obs_filter_key in store["obs"]: obs_labels = self._get_labels(store, obs_filter_key) obs_filter_mask = np.isin(obs_labels, obs_filter_values) else: obs_filter_mask = np.full(n_obs_storage, False) if pd.isna(obs_filter_values).any(): obs_filter_mask |= pd.isna(obs_labels) if indices_storage_mask is None: indices_storage_mask = obs_filter_mask else: indices_storage_mask &= obs_filter_mask indices_storage = np.where(indices_storage_mask)[0] n_obs_storage = len(indices_storage) else: indices_storage = np.arange(n_obs_storage) self.n_obs_list.append(n_obs_storage) self.indices_list.append(indices_storage) for layer_key in self.layers_keys: if layer_key == "X": continue lazy_data = self._get_lazy_data(store, layer_key, i) if lazy_data is None: continue self._check_csc_raise_error( lazy_data, "raw.X" if layer_key == "raw.X" else f"layers/{layer_key}", store_path, ) if self.obsm_keys is not None: for obsm_key in self.obsm_keys: if obsm_key in self._cache_obsm_keys[i]: self._check_csc_raise_error( store["obsm"][obsm_key], f"obsm/{obsm_key}", store_path, ) self.n_obs = sum(self.n_obs_list) self.indices = np.hstack(self.indices_list) self.storage_idx = np.repeat(np.arange(len(self.storages)), self.n_obs_list) self.join_vars: Literal["inner", "outer"] | None = join self.var_indices: list | None = None self.var_joint: pd.Index | None = None self.n_vars_list: list | None = None self.var_list: list | None = None self.n_vars: int | None = None if self.join_vars is not None: self._make_join_vars() self.n_vars = len(self.var_joint) self._dtype = dtype self._closed = False def _make_connections(self, path_list: list, parallel: bool): for path in path_list: path = UPath(path) if path.exists() and path.is_file(): # type: ignore if parallel: conn, storage = None, path else: # force no external compression even for files with .gz extension. 
REMOVE LATER conn, storage = registry.open("h5py", path, compression=None) else: conn, storage = registry.open("zarr", path) self.conns.append(conn) self.storages.append(storage) def _cache_keys(self): for storage in self.storages: with _Connect(storage) as store: store_keys = registry.keys(store) self._cache_has_raw.append("raw" in store_keys) for group in ("obsm", "obs", "layers"): cache = getattr(self, f"_cache_{group}_keys") cache.append( set(store_keys[group]) if group in store_keys else set() ) def _cache_categories(self, obs_keys: list): self._cache_cats = {} for label in obs_keys: self._cache_cats[label] = [] for i, storage in enumerate(self.storages): if label not in self._cache_obs_keys[i]: self._cache_cats[label].append(None) continue with _Connect(storage) as store: cats = self._get_categories(store, label) if cats is not None: cats = ( _decode(cats) if isinstance(cats[0], bytes) else cats[...] ) self._cache_cats[label].append(cats) def _make_encoders(self, encode_labels: list): for label in encode_labels: cats = self.get_merged_categories(label) encoder = {} if isinstance(self.unknown_label, dict): unknown_label = self.unknown_label.get(label, None) else: unknown_label = self.unknown_label if unknown_label is not None and unknown_label in cats: cats.remove(unknown_label) encoder[unknown_label] = -1 encoder.update({cat: i for i, cat in enumerate(cats)}) self.encoders[label] = encoder def _read_vars(self): self.var_list = [] self.n_vars_list = [] for storage in self.storages: with _Connect(storage) as store: vars = _safer_read_index(store["var"]) self.var_list.append(vars) self.n_vars_list.append(len(vars)) def _make_join_vars(self): if self.var_list is None: self._read_vars() vars_eq = all(self.var_list[0].equals(vrs) for vrs in self.var_list[1:]) if vars_eq: self.join_vars = None self.var_joint = self.var_list[0] return if self.join_vars == "inner": self.var_joint = reduce(pd.Index.intersection, self.var_list) if len(self.var_joint) == 0: raise ValueError( "The provided AnnData objects don't have shared variables.\n" "Use join='outer'." ) self.var_indices = [ vrs.get_indexer(self.var_joint) for vrs in self.var_list ] elif self.join_vars == "outer": self.var_joint = reduce(pd.Index.union, self.var_list) self.var_indices = [ self.var_joint.get_indexer(vrs) for vrs in self.var_list ] def check_vars_sorted(self, ascending: bool = True) -> bool: """Returns `True` if all variables are sorted in all objects.""" if self.var_list is None: self._read_vars() if ascending: vrs_sort_status = (vrs.is_monotonic_increasing for vrs in self.var_list) else: vrs_sort_status = (vrs.is_monotonic_decreasing for vrs in self.var_list) return all(vrs_sort_status) def check_vars_non_aligned(self, vars: pd.Index | list) -> list[int]: """Returns indices of objects with non-aligned variables. Args: vars: Check alignment against these variables. """ if self.var_list is None: self._read_vars() vars = pd.Index(vars) return [i for i, vrs in enumerate(self.var_list) if not vrs.equals(vars)] def _check_csc_raise_error( self, elem: GroupType | ArrayType, key: str, path: AnyPathStr ): if isinstance(elem, ArrayTypes): # type: ignore return if get_spec(elem).encoding_type == "csc_matrix": if not self.parallel: self.close() raise ValueError( f"{key} in {path} is a csc matrix, `MappedCollection` doesn't support this format yet." 
) def __len__(self): return self.n_obs @property def shape(self) -> tuple[int, int]: """Shape of the (virtually aligned) dataset.""" return (self.n_obs, self.n_vars) @property def original_shapes(self) -> list[tuple[int, int]]: """Shapes of the underlying AnnData objects (with `obs_filter` applied).""" if self.n_vars_list is None: n_vars_list = [None] * len(self.n_obs_list) else: n_vars_list = self.n_vars_list return list(zip(self.n_obs_list, n_vars_list)) def __getitem__(self, idx: int): obs_idx = self.indices[idx] storage_idx = self.storage_idx[idx] if self.var_indices is not None: var_idxs_join = self.var_indices[storage_idx] else: var_idxs_join = None out = {"_store_idx": storage_idx} with _Connect(self.storages[storage_idx]) as store: for layers_key in self.layers_keys: lazy_data = self._get_lazy_data(store, layers_key, storage_idx) if lazy_data is None: continue # do not apply join to raw.X, return as is join_vars = None if layers_key == "raw.X" else self.join_vars out[layers_key] = self._get_data_idx( lazy_data, obs_idx, join_vars, var_idxs_join, self.n_vars ) if self.obsm_keys is not None: for obsm_key in self.obsm_keys: if obsm_key not in self._cache_obsm_keys[storage_idx]: continue lazy_data = store["obsm"][obsm_key] out[f"obsm_{obsm_key}"] = self._get_data_idx(lazy_data, obs_idx) if self.obs_keys is not None: for label in self.obs_keys: if label not in self._cache_obs_keys[storage_idx]: continue if label in self._cache_cats: cats = self._cache_cats[label][storage_idx] if cats is None: cats = [] else: cats = None label_idx = self._get_obs_idx(store, obs_idx, label, cats) if label in self.encoders and label_idx is not np.nan: label_idx = self.encoders[label][label_idx] out[label] = label_idx return out def _get_lazy_data(self, store: StorageType, layers_key: str, storage_idx: int): if layers_key == "X": lazy_data = store["X"] # type: ignore elif layers_key == "raw.X" and self._cache_has_raw[storage_idx]: lazy_data = store["raw"]["X"] # type: ignore elif layers_key in self._cache_layers_keys[storage_idx]: lazy_data = store["layers"][layers_key] # type: ignore else: lazy_data = None return lazy_data def _get_data_idx( self, lazy_data: ArrayType | GroupType, idx: int, join_vars: Literal["inner", "outer"] | None = None, var_idxs_join: list | None = None, n_vars_out: int | None = None, ): """Get the index for the data.""" if isinstance(lazy_data, ArrayTypes): # type: ignore lazy_data_idx = lazy_data[idx] # type: ignore if join_vars is None: result = lazy_data_idx if self._dtype is not None: result = result.astype(self._dtype, copy=False) elif join_vars == "outer": dtype = lazy_data_idx.dtype if self._dtype is None else self._dtype result = np.zeros(n_vars_out, dtype=dtype) result[var_idxs_join] = lazy_data_idx else: # inner join result = lazy_data_idx[var_idxs_join] if self._dtype is not None: result = result.astype(self._dtype, copy=False) return result else: # assume csr_matrix here data = lazy_data["data"] # type: ignore indices = lazy_data["indices"] # type: ignore indptr = lazy_data["indptr"] # type: ignore s = slice(*(indptr[idx : idx + 2])) data_s = data[s] dtype = data_s.dtype if self._dtype is None else self._dtype if join_vars == "outer": lazy_data_idx = np.zeros(n_vars_out, dtype=dtype) lazy_data_idx[var_idxs_join[indices[s]]] = data_s else: lazy_data_idx = np.zeros(lazy_data.attrs["shape"][1], dtype=dtype) # type: ignore lazy_data_idx[indices[s]] = data_s if join_vars == "inner": lazy_data_idx = lazy_data_idx[var_idxs_join] return lazy_data_idx def _get_obs_idx( self, 
storage: StorageType, idx: int, label_key: str, categories: list | None = None, ): """Get the index for the label by key.""" obs = storage["obs"] # type: ignore # how backwards compatible do we want to be here actually? if isinstance(obs, ArrayTypes): # type: ignore label = obs[idx][obs.dtype.names.index(label_key)] else: labels = obs[label_key] if isinstance(labels, ArrayTypes): # type: ignore label = labels[idx] else: label = labels["codes"][idx] if label == -1: return np.nan if categories is not None: cats = categories else: cats = self._get_categories(storage, label_key) if cats is not None and len(cats) > 0: label = cats[label] if isinstance(label, bytes): label = label.decode("utf-8") return label def get_label_weights( self, obs_keys: str | list[str], scaler: float | None = None, return_categories: bool = False, ): """Get all weights for the given label keys. This counts the number of labels for each label and returns weights for each obs label accoding to the formula `1 / num of this label in the data`. If `scaler` is provided, then `scaler / (scaler + num of this label in the data)`. Args: obs_keys: A key in the ``.obs`` slots or a list of keys. If a list is provided, the labels from the obs keys will be concatenated with ``"__"`` delimeter scaler: Use this number to scale the provided weights. return_categories: If `False`, returns weights for each observation, can be directly passed to a sampler. If `True`, returns a dictionary with unique categories for labels (concatenated if `obs_keys` is a list) and their weights. """ if isinstance(obs_keys, str): obs_keys = [obs_keys] labels_list = [] for label_key in obs_keys: labels_to_str = self.get_merged_labels(label_key).astype(str).astype("O") labels_list.append(labels_to_str) if len(labels_list) > 1: labels = ["__".join(labels_obs) for labels_obs in zip(*labels_list)] else: labels = labels_list[0] counter = Counter(labels) if return_categories: return { k: 1.0 / v if scaler is None else scaler / (v + scaler) for k, v in counter.items() } counts = np.array([counter[label] for label in labels]) if scaler is None: weights = 1.0 / counts else: weights = scaler / (counts + scaler) return weights def get_merged_labels(self, label_key: str): """Get merged labels for `label_key` from all `.obs`.""" labels_merge = [] for i, storage in enumerate(self.storages): with _Connect(storage) as store: if label_key not in self._cache_obs_keys[i]: continue labels = self._get_labels(store, label_key, storage_idx=i) if self.filtered: labels = labels[self.indices_list[i]] labels_merge.append(labels) return np.hstack(labels_merge) def get_merged_categories(self, label_key: str): """Get merged categories for `label_key` from all `.obs`.""" cats_merge = set() for i, storage in enumerate(self.storages): with _Connect(storage) as store: if label_key not in self._cache_obs_keys[i]: continue if label_key in self._cache_cats: cats = self._cache_cats[label_key][i] else: cats = self._get_categories(store, label_key) if cats is not None: cats = _decode(cats) if isinstance(cats[0], bytes) else cats cats_merge.update(cats) else: codes = self._get_codes(store, label_key) codes = _decode(codes) if isinstance(codes[0], bytes) else codes cats_merge.update(codes) return sorted(cats_merge) def _get_categories(self, storage: StorageType, label_key: str): """Get categories.""" obs = storage["obs"] # type: ignore if isinstance(obs, ArrayTypes): # type: ignore cat_key_uns = f"{label_key}_categories" if cat_key_uns in storage["uns"]: # type: ignore return 
storage["uns"][cat_key_uns] # type: ignore else: return None else: if "__categories" in obs: cats = obs["__categories"] if label_key in cats: return cats[label_key] else: return None if label_key not in obs: return None labels = obs[label_key] if isinstance(labels, GroupTypes): # type: ignore if "categories" in labels: return labels["categories"] else: return None else: if "categories" in labels.attrs: return labels.attrs["categories"] else: return None return None def _get_codes(self, storage: StorageType, label_key: str): """Get codes.""" obs = storage["obs"] # type: ignore if isinstance(obs, ArrayTypes): # type: ignore label = obs[label_key] else: label = obs[label_key] if isinstance(label, ArrayTypes): # type: ignore return label[...] else: return label["codes"][...] def _get_labels( self, storage: StorageType, label_key: str, storage_idx: int | None = None ): """Get labels.""" codes = self._get_codes(storage, label_key) labels = _decode(codes) if isinstance(codes[0], bytes) else codes if storage_idx is not None and label_key in self._cache_cats: cats = self._cache_cats[label_key][storage_idx] else: cats = self._get_categories(storage, label_key) if cats is not None: cats = _decode(cats) if isinstance(cats[0], bytes) else cats # NaN is coded as -1 nans = labels == -1 labels = cats[labels] # detect and replace nans if nans.any(): labels[nans] = np.nan return labels def close(self): """Close connections to array streaming backend. No effect if `parallel=True`. """ for storage in self.storages: if hasattr(storage, "close"): storage.close() for conn in self.conns: if hasattr(conn, "close"): conn.close() self._closed = True @property def closed(self) -> bool: """Check if connections to array streaming backend are closed. Does not matter if `parallel=True`. """ return self._closed def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() @classmethod def torch_worker_init_fn(cls, worker_id): """`worker_init_fn` for `torch.utils.data.DataLoader`. Improves performance for `num_workers > 1`. 
""" from torch.utils.data import get_worker_info mapped = get_worker_info().dataset mapped.parallel = False mapped.storages = [] mapped.conns = [] mapped._make_connections(mapped.path_list, parallel=False) ================================================ FILE: lamindb/core/_settings.py ================================================ from __future__ import annotations import os import sys from typing import TYPE_CHECKING import lamindb_setup as ln_setup from lamin_utils import colors, logger from lamindb_setup import settings as setup_settings from lamindb_setup._set_managed_storage import set_managed_storage from lamindb_setup.core._settings_instance import sanitize_git_repo_url from lamindb_setup.core._settings_storage import ( StorageSettings, convert_root_path_to_str, ) from .subsettings._annotation_settings import AnnotationSettings, annotation_settings from .subsettings._creation_settings import CreationSettings, creation_settings if TYPE_CHECKING: from collections.abc import Mapping from pathlib import Path from lamindb_setup.types import AnyPathStr from upath import UPath VERBOSITY_TO_INT = { "error": 0, # 40 "warning": 1, # 30 "success": 2, # 25 "info": 3, # 20 "hint": 4, # 15 "debug": 5, # 10 } VERBOSITY_TO_STR: dict[int, str] = dict( [reversed(i) for i in VERBOSITY_TO_INT.items()] # type: ignore ) def raise_if_storage_managed_by_other_instance(storage) -> None: storage_instance_uid = storage.instance_uid if storage_instance_uid != setup_settings.instance.uid: raise ValueError( f"Storage '{storage.root}' exists in another instance ({storage_instance_uid}), cannot write to it from here." ) class Settings: """Settings. Please use the global `ln.settings` object instead of instantiating this class yourself. """ def __init__(self): self._verbosity_int: int = logger._verbosity self._sync_git_repo: str | None = None def __repr__(self) -> str: # pragma: no cover if "sphinx" in sys.modules: return object.__repr__(self) cls_name = colors.green(self.__class__.__name__) verbosity_color = colors.yellow if self.verbosity == "warning" else colors.green verbosity_str = verbosity_color(self.verbosity) storage_root = self._storage_settings.root_as_str storage_str = colors.italic(storage_root) instance_str = colors.italic(self.instance_uid) track_color = colors.green if self.track_run_inputs else colors.yellow track_str = track_color(str(self.track_run_inputs)) lines = [ f"{cls_name}", f" instance: {instance_str}", f" storage: {storage_str}", f" verbosity: {verbosity_str}", f" track_run_inputs: {track_str}", ] if self.sync_git_repo: repo_name = ( self.sync_git_repo.split("/")[-1] if "/" in self.sync_git_repo else self.sync_git_repo ) lines.append(f" sync_git_repo: {colors.italic(repo_name)}") return "\n".join(lines) @property def creation(self) -> CreationSettings: """SQLRecord creation settings. For example, `ln.settings.creation.search_names = False` will disable searching for records with similar names during creation. """ return creation_settings @property def annotation(self) -> AnnotationSettings: """Artifact annotation settings. For example, `ln.settings.creation.search_names = False` will disable searching for records with similar names during creation. """ return annotation_settings # note: this setting should probably be deprecated soon # warnings could then be filtered with a regular warning mechanism track_run_inputs: bool = True """Track run inputs (default `True`). 
If this setting is true, an artifact is recorded as run input upon `.load()`, `.cache()` & `.open()` provided :func:`~lamindb.track` was called in the current compute (Python, R) session. If :func:`~lamindb.track` was not called, you receive a warning message upon `.load()`, `.cache()` & `.open()`. If you switch this setting to `False`, you won't see the warning message anymore and no run inputs will be recorded. FAQ: :doc:`/faq/track-run-inputs` """ __using_key: str | None = None _using_storage: str | None = None @property def _using_key(self) -> str | None: """Key for Django database settings.""" return self.__using_key @_using_key.setter def _using_key(self, value: str | None): ln_setup.settings._using_key = value self.__using_key = value @property def _storage_settings(self) -> ln_setup.core.StorageSettings: if self._using_storage is None: storage_settings = ln_setup.settings.storage else: storage_settings = ln_setup.core.StorageSettings(root=self._using_storage) return storage_settings @property def sync_git_repo(self) -> str | None: """Sync transforms with scripts in git repository. If set, scripts will be synced with the specified git repository. Example:: ln.settings.sync_git_repo = https://github.com/laminlabs/schmidt22 You can also pass the git repo URL via the environment variable `LAMINDB_SYNC_GIT_REPO`:: export LAMINDB_SYNC_GIT_REPO=https://github.com/laminlabs/schmidt22 You'll then see:: ln.settings.sync_git_repo #> 'https://github.com/laminlabs/schmidt22' """ if self._sync_git_repo is not None: return self._sync_git_repo elif os.environ.get("LAMINDB_SYNC_GIT_REPO") is not None: return sanitize_git_repo_url(os.environ["LAMINDB_SYNC_GIT_REPO"]) else: return setup_settings.instance.git_repo @sync_git_repo.setter def sync_git_repo(self, value) -> None: self._sync_git_repo = sanitize_git_repo_url(value) if not self._sync_git_repo.startswith("https://"): # pragma: nocover raise ValueError("git repository URL must start with 'https://'.") @property def storage(self) -> StorageSettings: """Current default storage location for writes. Examples: Retrieve the storage settings:: ln.settings.storage #> StorageSettings(root='s3://my-bucket') Retrieve the storage root:: ln.settings.storage.root #> UPath('s3://my-bucket') Switch the current default storage location:: ln.settings.storage = "s3://some-bucket" Pass additional `fsspec` `kwargs` via:: kwargs = dict( profile="some_profile", # fsspec arg cache_regions=True # fsspec arg for s3 ) ln.settings.storage = "s3://some-bucket", kwargs """ return self._storage_settings @storage.setter def storage(self, path_kwargs: AnyPathStr | tuple[AnyPathStr, Mapping]): from ..models import Storage if isinstance(path_kwargs, tuple): path, kwargs = path_kwargs if isinstance(kwargs, str): kwargs = {"host": kwargs} else: path, kwargs = path_kwargs, {} root_as_str = convert_root_path_to_str(path) exists = Storage.filter(root=root_as_str).one_or_none() if exists is None: response = input( f"Storage location {root_as_str} does not yet exist in the current instance. Do you want to continue with creating it? 
(y/n) " ) # logger.warning(f"deprecated call because storage location does **not yet** exist; please create through ln.Storage(root={path}).save()") if response != "y": return None set_managed_storage(path, **kwargs) else: raise_if_storage_managed_by_other_instance(exists) ssettings = StorageSettings( root=exists.root, region=exists.region, uid=exists.uid, instance_id=ln_setup.settings.instance._id, ) ln_setup.settings.instance._storage = ssettings kwargs.pop("host", None) # host is not needed for existing storage settings.storage._set_fs_kwargs(**kwargs) @property def instance_uid(self) -> str: """The `uid` of the current instance.""" return ln_setup.settings.instance.uid @property def cache_dir(self) -> UPath: """Cache root, a local directory to cache cloud files.""" return ln_setup.settings.cache_dir @property def local_storage(self) -> StorageSettings: """An additional local default storage (a path to its root). Is only available if :attr:`~lamindb.setup.core.InstanceSettings.keep_artifacts_local` is enabled. Guide: :doc:`faq/keep-artifacts-local` """ return ln_setup.settings.instance.local_storage @local_storage.setter def local_storage(self, local_root: Path | str): import lamindb as ln # note duplication with storage setter! ssettings = StorageSettings(root=local_root) exists = ln.Storage.filter(root=ssettings.root_as_str).one_or_none() if exists is None: response = input( f"Storage location {ssettings.root_as_str} does not yet exist. Do you want to continue with creating it? (y/n) " ) # logger.warning(f"deprecated call because storage location does **not yet** exist; going forward, please create through ln.Storage(root={path}).save() going forward") if response != "y": return None else: raise_if_storage_managed_by_other_instance(exists) ln_setup.settings.instance.local_storage = local_root @property def verbosity(self) -> str: """Logger verbosity (default `'warning'`). - `'error'`: only show error messages - `'warning'`: also show warning messages - `'success'`: also show success and save messages - `'info'`: also show info messages - `'hint'`: also show hint messages - `'debug'`: also show detailed debug messages """ return VERBOSITY_TO_STR[self._verbosity_int] @verbosity.setter def verbosity(self, verbosity: str | int): if isinstance(verbosity, str): verbosity_int = VERBOSITY_TO_INT[verbosity] else: verbosity_int = verbosity self._verbosity_int = verbosity_int logger.set_verbosity(verbosity_int) settings = Settings() ================================================ FILE: lamindb/core/_sync_git.py ================================================ from __future__ import annotations import subprocess from pathlib import Path from lamin_utils import logger from lamindb_setup import settings as setup_settings from lamindb_setup.core.hashing import hash_code from ..core._settings import sanitize_git_repo_url, settings from ..errors import BlobHashNotFound def get_git_repo_from_remote(url: str | None = None, depth: int | None = 10) -> Path: """Clone the git repository if not already cloned. If `depth` is provided, a shallow clone is performed and no tags are fetched. 
""" repo_url = url or settings.sync_git_repo repo_dir = setup_settings.cache_dir / repo_url.split("/")[-1] if repo_dir.exists(): logger.debug(f"git repo {repo_dir} already exists locally") return repo_dir logger.important( f"running outside of synched git repo, cloning {repo_url} into {repo_dir}" ) args = ["git", "clone", f"{repo_url}.git"] if depth is not None: # if depth is provided, will not fetch tags args += ["--depth", f"{depth}"] result = subprocess.run( args, capture_output=True, cwd=setup_settings.cache_dir, ) if result.returncode != 0 or not repo_dir.exists(): raise RuntimeError(result.stderr.decode()) return repo_dir def check_local_git_repo() -> bool: result = subprocess.run( ["git", "config", "--get", "remote.origin.url"], capture_output=True, ) result_str = result.stdout.decode().strip() if result_str == "": # running-not-in-a-git-repo return False else: remote_url = sanitize_git_repo_url(result_str) if remote_url == settings.sync_git_repo: # running-in-correct-git-repo return True else: logger.warning( f"running in git repo: {remote_url}, expected: {settings.sync_git_repo}" ) return False def get_git_commit_hash(blob_hash: str, repo_dir: Path | None = None) -> str | None: # Fetch all remote branches so that we can also search them fetch_command = ["git", "fetch", "origin", "+refs/heads/*:refs/remotes/origin/*"] subprocess.run(fetch_command, cwd=repo_dir, check=True) # Find the commit containing the blob hash in all branches command = [ "git", "log", "--all", f"--find-object={blob_hash}", "--pretty=format:%H", ] result = subprocess.run( command, capture_output=True, cwd=repo_dir, ) # We just care to find one commit # Hence, we split by new line ("\n") and use the first one commit_hash = result.stdout.decode().split("\n")[0] if not commit_hash or result.returncode == 1: return None default_branch = ( subprocess.run( ["git", "rev-parse", "--abbrev-ref", "origin/HEAD"], capture_output=True, cwd=repo_dir, text=True, ) .stdout.strip() .split("/")[-1] ) # Find all branches containing the commit commit_containing_branches = subprocess.run( ["git", "branch", "--all", "--contains", commit_hash], capture_output=True, cwd=repo_dir, text=True, ).stdout.split("\n") # Clean up branch names and filter out the default branch commit_containing_branches = [ branch.strip().replace("remotes/", "") for branch in commit_containing_branches if branch.strip() ] non_default_branches = [ branch for branch in commit_containing_branches if default_branch not in branch ] if non_default_branches: logger.warning( f"code blob hash {blob_hash} was found in non-default branch(es): {', '.join(non_default_branches)}" ) assert ( # noqa: S101 len(commit_hash) == 40 ), f"commit hash |{commit_hash}| is not 40 characters long" return commit_hash def get_filepath_within_git_repo( commit_hash: str, blob_hash: str, repo_dir: Path | None ) -> str: # repo_dir might not point to the root of the # the git repository because git log --find-object works # from anywhere in the repo, hence, let's get the root repo_root = ( subprocess.run( ["git", "rev-parse", "--show-toplevel"], capture_output=True, cwd=repo_dir, ) .stdout.decode() .strip() ) # Run the git commands separately to circumvent spawning a shell git_command = ["git", "ls-tree", "-r", commit_hash] git_process = subprocess.Popen( git_command, stdout=subprocess.PIPE, cwd=repo_root, ) grep_command = ["grep", "-E", blob_hash] result = subprocess.run( grep_command, stdin=git_process.stdout, capture_output=True, cwd=repo_root, ) # Close the stdout to allow git_process to 
receive a SIGPIPE if grep_command exits git_process.stdout.close() git_process.wait() command = " ".join(git_command) + " | " + " ".join(grep_command) if result.returncode != 0 and result.stderr.decode() != "": raise RuntimeError(f"{command}\n{result.stderr.decode()}") if len(result.stdout.decode()) == 0: raise RuntimeError( f"Could not find path in git repo {settings.sync_git_repo} running:\n{command}" f"\nin local clone: {repo_root}" ) filepath = result.stdout.decode().split()[-1] return filepath def get_transform_reference_from_git_repo(path: Path) -> str: blob_hash = hash_code(path).hexdigest() commit_hash = None if check_local_git_repo(): repo_dir = None else: repo_dir = get_git_repo_from_remote() commit_hash = get_git_commit_hash(blob_hash, repo_dir=repo_dir) if commit_hash is None: if repo_dir is None: repo_dir = Path.cwd() raise BlobHashNotFound( f"❌ Did not find blob hash {blob_hash} in git repo: {settings.sync_git_repo}\n" f"Did you commit & push the script to the remote repo? -> {path}" ) gitpath = get_filepath_within_git_repo(commit_hash, blob_hash, repo_dir) reference = f"{settings.sync_git_repo}/blob/{commit_hash}/{gitpath}" return reference def get_and_validate_git_metadata( url: str, path: str, version: str | None = None, branch: str | None = None, ) -> tuple[str, str]: """Get metadata from a git repository. Args: url: Git repository URL (e.g., "https://github.com/user/repo") path: Path to the main script within the repository version: Optional version/tag to checkout branch: Optional branch name (defaults to repository's default branch) Returns: Dictionary containing: - commit_hash: The current commit hash - url: The repository URL - main_script: Path to the main script - revision: The version/tag (if provided) - branch: The branch name Raises: RuntimeError: If git operations fail FileNotFoundError: If the specified path does not exist in the repository """ url = sanitize_git_repo_url(url) repo_dir = get_git_repo_from_remote(url, depth=None) # Determine the branch to use if branch is None: # Get the default branch if not specified result_str = subprocess.run( ["git", "rev-parse", "--abbrev-ref", "origin/HEAD"], capture_output=True, cwd=repo_dir, text=True, ) if result_str.returncode == 0: branch = result_str.stdout.strip().split("/")[-1] else: branch = "main" # fallback to main # Fetch the latest changes subprocess.run( ["git", "fetch", "origin"], capture_output=True, cwd=repo_dir, check=True, ) # Checkout the specified version or branch if version is not None: # Version takes precedence - checkout the tag/version result = subprocess.run( ["git", "checkout", version], capture_output=True, cwd=repo_dir, ) if result.returncode != 0: raise ValueError( f"Failed to checkout version {version}: {result.stderr.decode()}" ) logger.info(f"checked out version {version}") else: # Checkout the branch result = subprocess.run( ["git", "checkout", f"origin/{branch}"], capture_output=True, cwd=repo_dir, ) if result.returncode != 0: raise ValueError( f"Failed to checkout branch {branch}: {result.stderr.decode()}" ) logger.info(f"checked out branch {branch}") # Get the current commit hash result_str = subprocess.run( ["git", "rev-parse", "HEAD"], capture_output=True, cwd=repo_dir, text=True, ) if result_str.returncode != 0: raise RuntimeError(f"Failed to get commit hash: {result_str.stderr}") commit_hash = result_str.stdout.strip() assert ( # noqa: S101 len(commit_hash) == 40 ), f"commit hash |{commit_hash}| is not 40 characters long" # Verify that the path exists as a file in the repository 
file_path = repo_dir / path if not file_path.exists(): raise FileNotFoundError(f"Path '{path}' does not exist in repository {url}") if not file_path.is_file(): raise FileNotFoundError( f"Path '{path}' exists but is not a file in repository {url}" ) return url, commit_hash ================================================ FILE: lamindb/core/_track_environment.py ================================================ from __future__ import annotations import subprocess import sys from typing import TYPE_CHECKING import lamindb_setup as ln_setup from lamin_utils import logger if TYPE_CHECKING: from lamindb.models import Run def track_python_environment(run: Run) -> None: env_dir = ln_setup.settings.cache_dir / "environments" / f"run_{run.uid}" filepath = env_dir / "run_env_pip.txt" if not env_dir.exists(): filepath.parent.mkdir(parents=True) # create a requirements.txt # we don't create a conda environment.yml mostly for its slowness try: with open(filepath, "w") as f: result = subprocess.run( [sys.executable, "-m", "pip", "freeze"], stdout=f, ) except OSError as e: result = None logger.warning(f"could not run pip freeze with error {e}") if result is not None and result.returncode == 0: logger.info(f"tracked pip freeze > {str(filepath)}") ================================================ FILE: lamindb/core/exceptions.py ================================================ from ..errors import * # noqa: F403 backward compat ================================================ FILE: lamindb/core/loaders.py ================================================ """Loaders in :class:`lamindb.Artifact.load`. .. autodata:: SUPPORTED_SUFFIXES .. autofunction:: load_fcs .. autofunction:: load_tsv .. autofunction:: load_h5ad .. autofunction:: load_h5mu .. autofunction:: load_html .. autofunction:: load_json .. autofunction:: load_image .. 
autofunction:: load_svg """ from __future__ import annotations import builtins import re from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, cast from lamin_utils import logger from lamindb_setup import settings as setup_settings from lamindb_setup.core.upath import ( create_path, extract_suffix_from_path, infer_filesystem, ) if TYPE_CHECKING: from anndata import AnnData from lamindb_setup.types import AnyPathStr from mudata import MuData from pandas import DataFrame from lamindb.core.storage.types import ScverseDataStructures is_run_from_ipython = getattr(builtins, "__IPYTHON__", False) # tested in lamin-usecases def load_fcs(*args, **kwargs) -> AnnData: """Load an `.fcs` file to `AnnData`.""" try: import readfcs except ImportError: # pragma: no cover raise ImportError("Please install readfcs: pip install readfcs") from None return readfcs.read(*args, **kwargs) # for types below note that local UPaths are subclasses of Path # Path(UPath(...)) properly coerces local UPaths and throws an error for cloud UPaths def load_csv(path: Path | str, **kwargs) -> DataFrame: """Load `.csv` file to `DataFrame`.""" import pandas as pd path_sanitized = Path(path) return pd.read_csv(path_sanitized, **kwargs) def load_parquet(path: Path | str, **kwargs) -> DataFrame: """Load `.parquet` file to `DataFrame`.""" import pandas as pd path_sanitized = Path(path) return pd.read_parquet(path_sanitized, **kwargs) def load_tsv(path: Path | str, **kwargs) -> DataFrame: """Load `.tsv` file to `DataFrame`.""" import pandas as pd path_sanitized = Path(path) return pd.read_csv(path_sanitized, sep="\t", **kwargs) def load_h5ad(filepath: AnyPathStr, **kwargs) -> AnnData: """Load an `.h5ad` file to `AnnData`.""" from anndata import read_h5ad fs, filepath_str = infer_filesystem(filepath) compression = kwargs.pop("compression", "infer") with fs.open(filepath_str, mode="rb", compression=compression) as file: adata = read_h5ad(file, backed=False, **kwargs) return adata def load_h5mu(filepath: Path | str, **kwargs) -> MuData: """Load an `.h5mu` file to `MuData`.""" import mudata as md path_sanitized = Path(filepath) return md.read_h5mu(path_sanitized, **kwargs) def load_zarr(storepath, **kwargs): # type: ignore try: from ..core.storage._zarr import load_zarr as _load_zarr except ImportError: raise ImportError("Please install zarr: pip install 'lamindb[zarr]'") from None return _load_zarr(storepath, **kwargs) def load_html(path: Path | str) -> None | Path | str: """Display `.html` in ipython, otherwise return path.""" if is_run_from_ipython: path_sanitized = Path(path) with path_sanitized.open(encoding="utf-8") as f: html_content = f.read() # Extract the body content using regular expressions body_content = re.findall( r"(?:.*?)", html_content, re.DOTALL ) # Remove any empty body tags if body_content: body_content = body_content[0] body_content = body_content.strip() # type: ignore from IPython.display import HTML, display display(HTML(data=body_content)) return None else: return path def load_json(path: Path | str) -> dict[str, Any] | list[Any]: """Load `.json` to `dict`.""" import json path_sanitized = Path(path) with path_sanitized.open(encoding="utf-8") as f: data = json.load(f) return data def load_yaml(path: Path | str) -> dict[str, Any] | list[Any]: """Load `.yaml` to `dict`.""" import yaml # type: ignore path_sanitized = Path(path) with path_sanitized.open(encoding="utf-8") as f: data = yaml.safe_load(f) return data def load_image(path: Path | str) -> None | Path | str: """Display `.jpg`, `.gif` or 
`.png` in ipython, otherwise return path.""" if is_run_from_ipython: from IPython.display import Image, display path_sanitized = Path(path) display(Image(filename=path_sanitized.as_posix())) return None else: return path def load_svg(path: Path | str) -> None | Path | str: """Display `.svg` in ipython, otherwise return path.""" if is_run_from_ipython: from IPython.display import SVG, display path_sanitized = Path(path) display(SVG(filename=path_sanitized.as_posix())) return None else: return path def load_txt(path: Path | str) -> str: """Load `.txt` file to `str`.""" path_sanitized = Path(path) return path_sanitized.read_text(encoding="utf-8") def load_rds(path: Path | str) -> Path | str: """Just warn when trying to load `.rds`.""" logger.warning("Please use `laminr` to load `.rds` files") return path FILE_LOADERS = { ".csv": load_csv, ".csv.gz": load_csv, ".csv.tar.gz": load_csv, ".tsv": load_tsv, ".tsv.gz": load_tsv, ".tsv.tar.gz": load_tsv, ".h5ad": load_h5ad, ".h5ad.gz": load_h5ad, ".h5ad.tar.gz": load_h5ad, ".parquet": load_parquet, ".fcs": load_fcs, ".zarr": load_zarr, ".anndata.zarr": load_zarr, ".html": load_html, ".json": load_json, ".vitessce.json": load_json, ".yaml": load_yaml, ".h5mu": load_h5mu, ".gif": load_image, ".jpg": load_image, ".png": load_image, ".svg": load_svg, ".rds": load_rds, ".txt": load_txt, ".fasta": load_txt, } SUPPORTED_SUFFIXES = [sfx for sfx in FILE_LOADERS.keys() if sfx != ".rds"] """Suffixes with defined artifact loaders.""" def load_to_memory( filepath: AnyPathStr, **kwargs ) -> DataFrame | ScverseDataStructures | dict[str, Any] | list[Any] | AnyPathStr | None: """Load a file into memory. Returns the filepath if no in-memory form is found. May return None in interactive sessions for images. """ filepath = create_path(filepath) suffix = extract_suffix_from_path(filepath) loader = FILE_LOADERS.get(suffix, None) if loader is None: raise NotImplementedError( f"There is no loader for {suffix} files. Use .cache() to get the path." ) filepath = setup_settings.paths.cloud_to_local(filepath, print_progress=True) return cast(Callable[..., Any], loader)(filepath, **kwargs) ================================================ FILE: lamindb/core/storage/__init__.py ================================================ """Storage API. Valid suffixes. .. autodata:: VALID_SUFFIXES Array accessors. .. autoclass:: AnnDataAccessor .. autoclass:: SpatialDataAccessor .. 
autoclass:: BackedAccessor """ from typing import TYPE_CHECKING, Any from lamindb_setup.core.upath import LocalPathClasses, UPath, infer_filesystem from ._valid_suffixes import VALID_SUFFIXES from .paths import delete_storage if TYPE_CHECKING: from ._anndata_accessor import AnnDataAccessor from ._backed_access import BackedAccessor from ._spatialdata_accessor import SpatialDataAccessor from ._tiledbsoma import save_tiledbsoma_experiment from .objects import infer_suffix, write_to_disk __all__ = [ "AnnDataAccessor", "BackedAccessor", "LocalPathClasses", "SpatialDataAccessor", "UPath", "VALID_SUFFIXES", "delete_storage", "infer_filesystem", "infer_suffix", "save_tiledbsoma_experiment", "write_to_disk", ] _LAZY_EXPORTS = frozenset( { "AnnDataAccessor", "BackedAccessor", "SpatialDataAccessor", "infer_suffix", "save_tiledbsoma_experiment", "write_to_disk", } ) def __getattr__(name: str) -> Any: if name not in _LAZY_EXPORTS: raise AttributeError(f"module {__name__!r} has no attribute {name!r}") attr: Any if name == "AnnDataAccessor": from ._anndata_accessor import AnnDataAccessor as attr elif name == "BackedAccessor": from ._backed_access import BackedAccessor as attr elif name == "SpatialDataAccessor": from ._spatialdata_accessor import SpatialDataAccessor as attr elif name == "save_tiledbsoma_experiment": from ._tiledbsoma import save_tiledbsoma_experiment as attr else: from .objects import infer_suffix, write_to_disk attr = infer_suffix if name == "infer_suffix" else write_to_disk globals()[name] = attr return attr ================================================ FILE: lamindb/core/storage/_anndata_accessor.py ================================================ from __future__ import annotations import inspect from functools import cached_property from importlib.metadata import version as get_version from itertools import chain from typing import TYPE_CHECKING, Callable, Literal, Union import h5py import numpy as np import pandas as pd from anndata import AnnData from anndata._core.index import _normalize_indices from anndata._core.views import _resolve_idx from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5 from anndata._io.specs.registry import ( get_spec, read_elem, read_elem_partial, write_elem, ) from anndata.compat import _read_attr from fsspec.implementations.local import LocalFileSystem from fsspec.utils import infer_compression from lamin_utils import logger from lamindb_setup.core.upath import S3FSMap, infer_filesystem from packaging import version from upath import UPath if TYPE_CHECKING: from collections.abc import Mapping from fsspec.core import OpenFile from lamindb_setup.types import AnyPathStr from lamindb import Artifact anndata_version_parse = version.parse(get_version("anndata")) if anndata_version_parse < version.parse("0.9.0"): from anndata._core.index import Index else: from anndata.compat import Index if anndata_version_parse < version.parse("0.10.0"): if anndata_version_parse < version.parse("0.9.1"): logger.warning( "Full backed capabilities are not available for this version of anndata," " please install anndata>=0.9.1." 
) from anndata._core.sparse_dataset import SparseDataset # try csr for groups with no encoding_type class CSRDataset(SparseDataset): @property def format_str(self) -> str: return "csr" def sparse_dataset(group): return SparseDataset(group) else: if anndata_version_parse >= version.parse("0.11.0"): from anndata._core.sparse_dataset import ( # type: ignore _CSRDataset as CSRDataset, ) else: from anndata._core.sparse_dataset import CSRDataset # type: ignore from anndata._core.sparse_dataset import ( BaseCompressedSparseDataset as SparseDataset, ) from anndata._core.sparse_dataset import sparse_dataset # type: ignore def _check_group_format(*args): pass CSRDataset._check_group_format = _check_group_format # zarr and CSRDataset have problems with full selection def _subset_sparse(sparse_ds: CSRDataset | SparseDataset, indices): has_arrays = isinstance(indices[0], np.ndarray) or isinstance( indices[1], np.ndarray ) if not has_arrays and indices == (slice(None), slice(None)): return sparse_ds.to_memory() else: return sparse_ds[indices] def get_module_name(obj): return inspect.getmodule(obj).__name__.partition(".")[0] def _records_to_df(obj): if isinstance(obj, pd.DataFrame): return obj if hasattr(obj, "dtype") and obj.dtype.names is not None: formats = [] for name, (dt, _) in obj.dtype.fields.items(): if dt.char == "S": new_dt = str(dt).replace("S", "U") else: new_dt = dt formats.append((name, new_dt)) df = pd.DataFrame(obj.astype(formats, copy=False)) for index_name in ("index", "_index"): if index_name in df.columns: return df.set_index(index_name) return df else: return obj class AccessRegistry: def __init__(self): self._registry = {} self._openers = {} def register_open(self, module: str): def wrapper(func: Callable): self._openers[module] = func return func return wrapper def open(self, module: str, *args, **kwargs): if module in self._openers: return self._openers[module](*args, **kwargs) else: raise ValueError(f"Module {module} not found, please install it.") def register(self, module: str): def wrapper(func: Callable): func_name = func.__name__ if func_name not in self._registry: self._registry[func_name] = {} self._registry[func_name][module] = func return func return wrapper def __getattr__(self, func_name: str): def wrapper(*args, **kwargs): func_registry = self._registry[func_name] for arg in chain(args, kwargs.values()): arg_module = get_module_name(arg) if arg_module in func_registry: return func_registry[arg_module](*args, **kwargs) raise ValueError(f"{func_name} is not registered for this module.") return wrapper # storage specific functions should be registered and called through the registry registry = AccessRegistry() @registry.register_open("h5py") def open(filepath: AnyPathStr, mode: str = "r", compression: str | None = "infer"): fs, file_path_str = infer_filesystem(filepath) # we don't open compressed files directly because we need fsspec to uncompress on .open compression = ( infer_compression(file_path_str) if compression == "infer" else compression ) if isinstance(fs, LocalFileSystem) and compression is None: assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!" # noqa: S101 return None, h5py.File(file_path_str, mode=mode) if mode == "r": conn_mode = "rb" elif mode == "w": conn_mode = "wb" elif mode == "a": conn_mode = "ab" else: raise ValueError(f"Unknown mode {mode}! 
Should be 'r', 'w' or 'a'.") conn = fs.open(file_path_str, mode=conn_mode, compression=compression) try: storage = h5py.File(conn, mode=mode) except Exception as e: conn.close() raise e return conn, storage @registry.register("h5py") def read_dataframe(elem: h5py.Dataset | h5py.Group): if isinstance(elem, h5py.Dataset): return read_dataframe_legacy_h5(elem) else: return read_elem(elem) @registry.register("h5py") def safer_read_partial(elem, indices): is_dataset = isinstance(elem, h5py.Dataset) indices_inverse: list | None = None encoding_type = get_spec(elem).encoding_type # h5py selection for datasets requires sorted indices if is_dataset or encoding_type == "dataframe": indices_increasing = [] indices_inverse = [] for indices_dim in indices: # should be integer or bool # ignore bool or increasing unique integers if ( isinstance(indices_dim, np.ndarray) and indices_dim.dtype != "bool" and not np.all(np.diff(indices_dim) > 0) ): idx_unique, idx_inverse = np.unique(indices_dim, return_inverse=True) indices_increasing.append(idx_unique) indices_inverse.append(idx_inverse) else: indices_increasing.append(indices_dim) indices_inverse.append(None) indices = tuple(indices_increasing) if all(idx is None for idx in indices_inverse): indices_inverse = None result = None if encoding_type == "": if is_dataset: dims = len(elem.shape) if dims == 2: result = elem[indices] elif dims == 1: if indices[0] == slice(None): result = elem[indices[1]] elif indices[1] == slice(None): result = elem[indices[0]] elif isinstance(elem, h5py.Group): try: ds = CSRDataset(elem) result = _subset_sparse(ds, indices) except Exception as e: logger.debug( f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}" ) if result is None: raise ValueError( "Can not get a subset of the element of type" f" {type(elem).__name__} with an empty spec." ) else: result = read_elem_partial(elem, indices=indices) if indices_inverse is None: return result else: if indices_inverse[0] is None: if len(result.shape) == 2: return result[:, indices_inverse[1]] else: return result[indices_inverse[1]] elif indices_inverse[1] is None: if isinstance(result, pd.DataFrame): return result.iloc[indices_inverse[0]] else: return result[indices_inverse[0]] else: return result[tuple(indices_inverse)] @registry.register("h5py") def keys(storage: h5py.File): attrs_keys: dict[str, list] = {} for attr in storage.keys(): if attr == "X": continue attr_obj = storage[attr] if attr in ("obs", "var") and isinstance(attr_obj, h5py.Dataset): keys = list(attr_obj.dtype.fields.keys()) else: keys = list(attr_obj.keys()) if len(keys) > 0: attrs_keys[attr] = keys return attrs_keys ArrayTypes = [h5py.Dataset] GroupTypes = [h5py.Group] StorageTypes = [h5py.File] ZARR_INSTALLED = False try: import zarr ZARR_INSTALLED = True except ImportError: pass if ZARR_INSTALLED: from anndata._io.zarr import read_dataframe_legacy as read_dataframe_legacy_zarr from ._zarr import IS_ZARR_V3, get_zarr_store ArrayTypes.append(zarr.Array) GroupTypes.append(zarr.Group) StorageTypes.append(zarr.Group) @registry.register_open("zarr") def open(filepath: AnyPathStr, mode: Literal["r", "r+", "a", "w", "w-"] = "r"): assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!" 
# noqa: S101 store = get_zarr_store(filepath) kwargs = {} if IS_ZARR_V3 and mode != "r": # otherwise unable to write kwargs["use_consolidated"] = False storage = zarr.open(store, mode=mode, **kwargs) # zarr v2 re-initializes the mapper # we need to put back the correct one # S3FSMap is returned from get_zarr_store only for zarr v2 if isinstance(store, S3FSMap): assert not IS_ZARR_V3 # noqa: S101 storage.store.map = store conn = None return conn, storage @registry.register("zarr") def read_dataframe(elem: Union[zarr.Array, zarr.Group]): # noqa if isinstance(elem, zarr.Array): return read_dataframe_legacy_zarr(elem) else: return read_elem(elem) @registry.register("zarr") def safer_read_partial(elem, indices): encoding_type = get_spec(elem).encoding_type if encoding_type == "": if isinstance(elem, zarr.Array): dims = len(elem.shape) if dims == 2: return elem.oindex[indices] elif dims == 1: if indices[0] == slice(None): return elem.oindex[indices[1]] elif indices[1] == slice(None): return elem.oindex[indices[0]] elif isinstance(elem, zarr.Group): try: ds = CSRDataset(elem) return _subset_sparse(ds, indices) except Exception as e: logger.debug( f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}" ) raise ValueError( "Can not get a subset of the element of type" f" {type(elem).__name__} with an empty spec." ) else: if encoding_type in ("csr_matrix", "csc_matrix"): ds = sparse_dataset(elem) return _subset_sparse(ds, indices) else: indices = tuple( idim.tolist() if isinstance(idim, np.ndarray) and idim.dtype == "bool" else idim for idim in indices ) return read_elem_partial(elem, indices=indices) # this is needed because accessing zarr.Group.keys() directly is very slow @registry.register("zarr") def keys(storage: zarr.Group): if IS_ZARR_V3: paths = storage._sync_iter(storage.store.list()) else: paths = storage.store.keys() attrs_keys: dict[str, list] = {} obs_var_arrays = [] prefix = storage.path if prefix == "": paths_iter = (path for path in paths) else: prefix += "/" paths_iter = ( path.removeprefix(prefix) for path in paths if path.startswith(prefix) ) for path in paths_iter: if path in (".zattrs", ".zgroup"): continue parts = path.split("/") if len(parts) < 2: continue attr = parts[0] key = parts[1] if attr == "X": continue if attr in ("obs", "var"): if attr in obs_var_arrays: continue if key == ".zarray": attrs_keys.pop(attr, None) obs_var_arrays.append(attr) if attr not in attrs_keys: attrs_keys[attr] = [] if key in (".zattrs", ".zgroup", ".zarray"): continue attr_keys = attrs_keys[attr] if key not in attr_keys: attr_keys.append(key) for attr in obs_var_arrays: attrs_keys[attr] = list(storage[attr].dtype.fields.keys()) return {attr: keys for attr, keys in attrs_keys.items() if len(keys) > 0} ArrayTypes = tuple(ArrayTypes) # type: ignore GroupTypes = tuple(GroupTypes) # type: ignore StorageTypes = tuple(StorageTypes) # type: ignore ArrayType = Union[ArrayTypes] # type: ignore GroupType = Union[GroupTypes] # type: ignore StorageType = Union[StorageTypes] # type: ignore def _to_memory(elem): if isinstance(elem, ArrayTypes): return elem[()] elif isinstance(elem, SparseDataset): return elem.to_memory() else: return elem def _try_backed_full(elem): # think what to do for compatibility with old var and obs if isinstance(elem, ArrayTypes): return elem if isinstance(elem, GroupTypes): encoding_type = get_spec(elem).encoding_type if encoding_type in ("csr_matrix", "csc_matrix"): return sparse_dataset(elem) if "h5sparse_format" in elem.attrs: return 
sparse_dataset(elem) if encoding_type == "" and "indptr" in elem: return CSRDataset(elem) return read_elem(elem) def _to_index(elem: np.ndarray): if elem.dtype in (np.float64, np.int64): elem = elem.astype(str) return pd.Index(elem) def _safer_read_index(elem): if isinstance(elem, GroupTypes): return _to_index(read_elem(elem[_read_attr(elem.attrs, "_index")])) elif isinstance(elem, ArrayTypes): indices = None for index_name in ("index", "_index"): if index_name in elem.dtype.names: indices = elem[index_name] break if indices is not None and len(indices) > 0: if isinstance(indices[0], bytes): indices = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)(indices) return _to_index(indices) else: raise ValueError("Indices not found.") else: raise ValueError(f"Unknown elem type {type(elem)} when reading indices.") class _MapAccessor: def __init__(self, elem, name, indices=None): self.elem = elem self.indices = indices self.name = name def __getitem__(self, key): if self.indices is None: return _try_backed_full(self.elem[key]) else: return registry.safer_read_partial(self.elem[key], indices=self.indices) def keys(self): return list(self.elem.keys()) def __repr__(self): """Description of the _MapAccessor object.""" descr = f"Accessor for the AnnData attribute {self.name}" descr += f"\n with keys: {self.keys()}" return descr def _safer_read_df(elem, indices=None): if indices is not None: obj = registry.safer_read_partial(elem, indices=indices) df = _records_to_df(obj) else: df = registry.read_dataframe(elem) if df.index.dtype in (np.float64, np.int64): df.index = df.index.astype(str) return df class _AnnDataAttrsMixin: storage: StorageType _attrs_keys: Mapping[str, list] @cached_property def obs(self) -> pd.DataFrame | None: if "obs" not in self._attrs_keys: return None indices = getattr(self, "indices", None) return _safer_read_df( self.storage["obs"], # type: ignore indices=(indices[0], slice(None)) if indices is not None else None, ) @cached_property def var(self) -> pd.DataFrame | None: if "var" not in self._attrs_keys: return None indices = getattr(self, "indices", None) return _safer_read_df( self.storage["var"], # type: ignore indices=(indices[1], slice(None)) if indices is not None else None, ) @cached_property def uns(self): if "uns" not in self._attrs_keys: return None return read_elem(self.storage["uns"]) @cached_property def X(self): indices = getattr(self, "indices", None) if indices is not None: return registry.safer_read_partial(self.storage["X"], indices=indices) else: return _try_backed_full(self.storage["X"]) @cached_property def obsm(self): if "obsm" not in self._attrs_keys: return None indices = getattr(self, "indices", None) if indices is not None: indices = (indices[0], slice(None)) return _MapAccessor(self.storage["obsm"], "obsm", indices) @cached_property def varm(self): if "varm" not in self._attrs_keys: return None indices = getattr(self, "indices", None) if indices is not None: indices = (indices[1], slice(None)) return _MapAccessor(self.storage["varm"], "varm", indices) @cached_property def obsp(self): if "obsp" not in self._attrs_keys: return None indices = getattr(self, "indices", None) if indices is not None: indices = (indices[0], indices[0]) return _MapAccessor(self.storage["obsp"], "obsp", indices) @cached_property def varp(self): if "varp" not in self._attrs_keys: return None indices = getattr(self, "indices", None) if indices is not None: indices = (indices[1], indices[1]) return _MapAccessor(self.storage["varp"], "varp", indices) @cached_property def layers(self): 
if "layers" not in self._attrs_keys: return None indices = getattr(self, "indices", None) return _MapAccessor(self.storage["layers"], "layers", indices) @property def obs_names(self): return self._obs_names @property def var_names(self): return self._var_names @cached_property def shape(self): return len(self._obs_names), len(self._var_names) def to_dict(self): prepare_adata = {} prepare_adata["X"] = _to_memory(self.X) if "uns" in self._attrs_keys: prepare_adata["uns"] = self.uns for attr in ("obs", "var"): if attr in self._attrs_keys: prepare_adata[attr] = getattr(self, attr) for attr in ("obsm", "varm", "obsp", "varp", "layers"): if attr in self._attrs_keys: prepare_adata[attr] = {} get_attr = getattr(self, attr) for key in self._attrs_keys[attr]: prepare_adata[attr][key] = _to_memory(get_attr[key]) if "raw" in self._attrs_keys: prepare_adata["raw"] = self.raw.to_dict() return prepare_adata def to_memory(self): adata = AnnData(**self.to_dict()) return adata class AnnDataAccessorSubset(_AnnDataAttrsMixin): def __init__(self, storage, indices, attrs_keys, obs_names, var_names, ref_shape): self.storage = storage self.indices = indices self._attrs_keys = attrs_keys self._obs_names, self._var_names = obs_names, var_names self._ref_shape = ref_shape def __getitem__(self, index: Index): """Access a subset of the underlying AnnData object.""" oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names) new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx] if self.indices is not None: oidx = _resolve_idx(self.indices[0], oidx, self._ref_shape[0]) vidx = _resolve_idx(self.indices[1], vidx, self._ref_shape[1]) return type(self)( self.storage, (oidx, vidx), self._attrs_keys, new_obs_names, new_var_names, self._ref_shape, ) def __repr__(self): """Description of the object.""" n_obs, n_vars = self.shape descr = f"{type(self).__name__} object with n_obs × n_vars = {n_obs} × {n_vars}" for attr, keys in self._attrs_keys.items(): descr += f"\n {attr}: {keys}" return descr @cached_property def raw(self): if "raw" not in self._attrs_keys: return None prepare_indices = None if self.indices is not None: oidx = self.indices[0] if isinstance(oidx, np.ndarray) or oidx != slice(None): prepare_indices = oidx, slice(None) return AnnDataRawAccessor( self.storage["raw"], prepare_indices, None, self._obs_names, None, self._ref_shape[0], ) class AnnDataRawAccessor(AnnDataAccessorSubset): def __init__( self, storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape ): var_raw = storage_raw["var"] if var_names is None: var_names = _safer_read_index(var_raw) if isinstance(ref_shape, int): ref_shape = ref_shape, len(var_names) elif isinstance(ref_shape, tuple) and len(ref_shape) < 2: ref_shape = ref_shape[0], len(var_names) if attrs_keys is None: attrs_keys = {} if isinstance(var_raw, ArrayTypes): attrs_keys["var"] = list(var_raw.dtype.fields.keys()) else: # for some reason list(var_raw.keys()) is very slow for zarr # maybe also directly get keys from the underlying mapper attrs_keys["var"] = list(var_raw) if "varm" in storage_raw: varm_keys_raw = list(storage_raw["varm"]) if len(varm_keys_raw) > 0: attrs_keys["varm"] = varm_keys_raw super().__init__( storage_raw, indices, attrs_keys, obs_names, var_names, ref_shape ) @property def raw(self): raise AttributeError class AnnDataAccessor(_AnnDataAttrsMixin): """Cloud-backed AnnData.""" def __init__( self, connection: OpenFile | None, storage: StorageType, filename: str, artifact: Artifact | None = None, ): self._conn = connection 
self.storage = storage self._attrs_keys = registry.keys(self.storage) self._name = filename self._obs_names = _safer_read_index(self.storage["obs"]) # type: ignore self._var_names = _safer_read_index(self.storage["var"]) # type: ignore self._artifact = artifact # save artifact to update in write mode self._updated = False # track updates in r+ mode for zarr self._entered = False # check that the context manager is used self._closed = False def close(self): """Closes the connection.""" storage = self.storage connection = self._conn if self._updated and (artifact := self._artifact) is not None: from lamindb.models.artifact import Artifact from lamindb.models.sqlrecord import init_self_from_db # now self._updated can only be True for zarr assert ZARR_INSTALLED # noqa: S101 store = storage.store keys = storage._sync_iter(store.list()) if IS_ZARR_V3 else store.keys() # this checks that there consolidated metadata was written before # need to update it # zmetadata is in spatialdata sometimes for some reason if ".zmetadata" in keys or "zmetadata" in keys: zarr.consolidate_metadata(store) new_version = Artifact( artifact.path, revises=artifact, _is_internal_call=True ).save() # note: sets _state.db = "default" init_self_from_db(artifact, new_version) if hasattr(storage, "close"): storage.close() if hasattr(connection, "close"): connection.close() self._closed = True @property def closed(self): return self._closed def __enter__(self): self._entered = True return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() def __getitem__(self, index: Index) -> AnnDataAccessorSubset: """Access a subset of the underlying AnnData object.""" oidx, vidx = _normalize_indices(index, self._obs_names, self._var_names) new_obs_names, new_var_names = self._obs_names[oidx], self._var_names[vidx] return AnnDataAccessorSubset( self.storage, (oidx, vidx), self._attrs_keys, new_obs_names, new_var_names, self.shape, ) def __repr__(self): """Description of the AnnDataAccessor object.""" n_obs, n_vars = self.shape descr = f"AnnDataAccessor object with n_obs × n_vars = {n_obs} × {n_vars}" descr += f"\n constructed for the AnnData object {self._name}" for attr, keys in self._attrs_keys.items(): descr += f"\n {attr}: {keys}" return descr @cached_property def raw(self): if "raw" not in self._attrs_keys: return None return AnnDataRawAccessor( self.storage["raw"], None, None, self._obs_names, None, self.shape[0] ) def add_column( self, where: Literal["obs", "var"], col_name: str, col: np.ndarray | pd.Categorical, ): """Add a new column to .obs or .var of the underlying AnnData object.""" df_store = self.storage[where] # type: ignore if getattr(df_store, "read_only", True): raise ValueError( "You can use .add_column(...) only with zarr in a writable mode." ) write_elem(df_store, col_name, col) df_store.attrs["column-order"] = df_store.attrs["column-order"] + [col_name] # remind only once if this wasn't updated before and not in the context manager if not self._updated and not self._entered and self._artifact is not None: logger.important( "Do not forget to call .close() after you finish " f"working with this accessor for {self._name} " "to automatically update the corresponding artifact." 
) self._updated = True # reset the cached property # todo: maybe just append the column if the df was already loaded self.__dict__.pop(where, None) # update the cached columns self._attrs_keys[where].append(col_name) # get the number of observations in an anndata object or file fast and safely def _anndata_n_observations(object: AnyPathStr | AnnData) -> int | None: if isinstance(object, AnnData): return object.n_obs try: objectpath = UPath(object) conn_module = None if ".h5ad" in objectpath.suffixes: conn_module = "h5py" elif objectpath.suffix == ".zarr": conn_module = "zarr" conn, storage = registry.open(conn_module, objectpath, mode="r") except Exception as e: logger.warning(f"Could not open {object} to read n_observations: {e}") return None n_observations: int | None = None try: obs = storage["obs"] if isinstance(obs, GroupTypes): # type: ignore if "_index" in obs.attrs: elem_key = _read_attr(obs.attrs, "_index") else: elem_key = next(iter(obs)) elem = obs[elem_key] if isinstance(elem, ArrayTypes): # type: ignore n_observations = elem.shape[0] else: # assume standard obs group n_observations = elem["codes"].shape[0] else: n_observations = obs.shape[0] except Exception as e: logger.warning(f"Could not read n_observations from anndata {object}: {e}") finally: if hasattr(storage, "close"): storage.close() if hasattr(conn, "close"): conn.close() return n_observations ================================================ FILE: lamindb/core/storage/_backed_access.py ================================================ from __future__ import annotations from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Callable, Literal PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather", ".ipc") POLARS_SUFFIXES = (".parquet", ".csv", ".ndjson", ".ipc") if TYPE_CHECKING: from collections.abc import Iterator from fsspec.core import OpenFile from polars import LazyFrame as PolarsLazyFrame from pyarrow.dataset import Dataset as PyArrowDataset from tiledbsoma import Collection as SOMACollection from tiledbsoma import Experiment as SOMAExperiment from tiledbsoma import Measurement as SOMAMeasurement from upath import UPath from lamindb.models.artifact import Artifact from ._anndata_accessor import AnnDataAccessor, StorageType from ._spatialdata_accessor import SpatialDataAccessor # this dynamically creates a subclass of a context manager class # and reassigns it to an instance of the superclass # so that the instance calls finalize on close or exit def _track_writes_factory(obj: Any, finalize: Callable): closed: bool = False tracked_class = obj.__class__ type_dict = {"__doc__": tracked_class.__doc__} if hasattr(tracked_class, "__slots__"): type_dict["__slots__"] = () if hasattr(tracked_class, "__exit__"): def __exit__(self, exc_type, exc_val, exc_tb): nonlocal closed tracked_class.__exit__(self, exc_type, exc_val, exc_tb) if not closed: finalize() closed = True type_dict["__exit__"] = __exit__ if hasattr(tracked_class, "close"): def close(self, *args, **kwargs): nonlocal closed tracked_class.close(self, *args, **kwargs) if not closed: finalize() closed = True type_dict["close"] = close Track = type(tracked_class.__name__ + "Track", (tracked_class,), type_dict) obj.__class__ = Track return obj @dataclass class BackedAccessor: """h5py.File or zarr.Group accessor.""" connection: OpenFile """The connection.""" storage: StorageType """The storage access.""" def backed_access( artifact_or_filepath: Artifact | UPath, mode: str = "r", engine: Literal["pyarrow", "polars"] = "pyarrow", 
using_key: str | None = None, **kwargs, ) -> ( AnnDataAccessor | SpatialDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment | SOMAMeasurement | PyArrowDataset | Iterator[PolarsLazyFrame] ): from lamindb.models import Artifact from .paths import filepath_from_artifact if isinstance(artifact_or_filepath, Artifact): artifact = artifact_or_filepath objectpath, _ = filepath_from_artifact(artifact, using_key=using_key) else: artifact = None objectpath = artifact_or_filepath name = objectpath.name suffix = objectpath.suffix non_gz_suffix = _non_gz_suffix(objectpath.suffixes) if name == "soma" or suffix == ".tiledbsoma": if mode not in {"r", "w"}: raise ValueError("`mode` should be either 'r' or 'w' for tiledbsoma.") from ._tiledbsoma import _open_tiledbsoma return _open_tiledbsoma(objectpath, mode=mode, **kwargs) # type: ignore elif non_gz_suffix in {".h5", ".hdf5", ".h5ad"}: from ._anndata_accessor import registry conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs) elif suffix == ".zarr": from ._anndata_accessor import registry if mode not in {"r", "r+"}: raise ValueError("`mode` should be either 'r' or 'r+' for zarr.") conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs) if "spatialdata_attrs" in storage.attrs: from ._spatialdata_accessor import SpatialDataAccessor return SpatialDataAccessor(storage, name, artifact) elif len(df_suffixes := _flat_suffixes(objectpath)) == 1 and ( df_suffix := df_suffixes.pop() ) in set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES): return _open_dataframe(objectpath, df_suffix, engine, **kwargs) else: raise ValueError( "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix " f"be compatible with pyarrow.dataset.dataset or polars.scan_* functions, " f"instead of being {suffix} object." ) import h5py from anndata._io.specs.registry import get_spec from ._anndata_accessor import AnnDataAccessor is_anndata = ( non_gz_suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata" ) if is_anndata: if mode != "r" and isinstance(storage, h5py.Group): raise ValueError("Can only access `hdf5` `AnnData` with mode='r'.") return AnnDataAccessor(conn, storage, name, artifact) else: return BackedAccessor(conn, storage) def _non_gz_suffix(suffixes: list[str]) -> str: len_suffixes = len(suffixes) if len_suffixes == 0: return "" if len_suffixes > 1 and ".gz" in suffixes: if (suffix := suffixes[-2]) != ".tar": return suffix elif len_suffixes > 2: return suffixes[-3] return suffixes[-1] def _flat_suffixes(paths: UPath | list[UPath]) -> set[str]: # it is assumed here that the paths exist # we don't check here that the filesystem is the same # but this is a requirement for pyarrow.dataset.dataset path_list = [] paths_list = paths if isinstance(paths, list) else [paths] for path in paths_list: # assume http is always a file if path.protocol not in {"http", "https"} and path.is_dir(): path_list += [p for p in path.rglob("*") if p.suffix != ""] else: path_list.append(path) return {path.suffix for path in path_list} def _open_dataframe( paths: UPath | list[UPath], suffix: str | None = None, engine: Literal["pyarrow", "polars"] = "pyarrow", **kwargs, ) -> PyArrowDataset | Iterator[PolarsLazyFrame]: from ._polars_lazy_df import POLARS_SUFFIXES, _open_polars_lazy_df from ._pyarrow_dataset import PYARROW_SUFFIXES, _open_pyarrow_dataset if engine not in {"pyarrow", "polars"}: raise ValueError( f"Unknown engine: {engine}. It should be 'pyarrow' or 'polars'." 
) df_suffix: str if suffix is None: df_suffixes = _flat_suffixes(paths) if len(df_suffixes) > 1: raise ValueError( f"The artifacts in the collection have different file formats: {', '.join(df_suffixes)}.\n" "It is not possible to open such stores with pyarrow or polars." ) df_suffix = df_suffixes.pop() else: df_suffix = suffix if engine == "pyarrow" and df_suffix not in PYARROW_SUFFIXES: raise ValueError( f"{df_suffix} files are not supported by pyarrow, " f"they should have one of these formats: {', '.join(PYARROW_SUFFIXES)}." ) elif engine == "polars" and df_suffix not in POLARS_SUFFIXES: raise ValueError( f"{df_suffix} files are not supported by polars, " f"they should have one of these formats: {', '.join(POLARS_SUFFIXES)}." ) polars_without_fsspec = engine == "polars" and not kwargs.get("use_fsspec", False) paths_list = paths if isinstance(paths, list) else [paths] if (engine == "pyarrow" or polars_without_fsspec) and len(paths_list) > 1: # this checks that the filesystem is the same for all paths # this is a requirement of pyarrow.dataset.dataset fs = paths_list[0].fs for path in paths_list[1:]: # this assumes that the filesystems are cached by fsspec if path.fs is not fs: engine_msg = ( "polars engine without passing `use_fsspec=True`" if engine == "polars" else "pyarrow engine" ) raise ValueError( "The collection has artifacts with different filesystems, " f"this is not supported for {engine_msg}." ) return ( _open_pyarrow_dataset(paths, **kwargs) if engine == "pyarrow" else _open_polars_lazy_df(paths, **kwargs) ) ================================================ FILE: lamindb/core/storage/_polars_lazy_df.py ================================================ from __future__ import annotations from contextlib import contextmanager from typing import TYPE_CHECKING from lamindb_setup.core.upath import _ensure_sync_with_fs, get_storage_region if TYPE_CHECKING: from collections.abc import Iterator from polars import LazyFrame as PolarsLazyFrame from upath import UPath POLARS_SUFFIXES = (".parquet", ".csv", ".ndjson", ".ipc") def _polars_options(storepath: UPath) -> dict: polars_options: dict = {} storage_options: dict[str, str | bool] = {} fs = storepath.fs fs.connect() endpoint_url = fs.endpoint_url if endpoint_url is not None: storage_options["aws_virtual_hosted_style_request"] = False storage_options["aws_endpoint_url"] = endpoint_url if endpoint_url.startswith("http://"): storage_options["aws_allow_http"] = True else: storage_options["aws_region"] = get_storage_region(storepath) if fs.anon: storage_options["aws_skip_signature"] = True else: aws_key = fs.key aws_secret = fs.secret aws_token = fs.token if aws_key is not None and aws_secret is not None: storage_options["aws_access_key_id"] = aws_key storage_options["aws_secret_access_key"] = aws_secret if aws_token is not None: storage_options["aws_session_token"] = aws_token else: from aiobotocore.credentials import AioRefreshableCredentials if isinstance( refreshable_credentials := fs.session._credentials, AioRefreshableCredentials, ): refresh_sync = _ensure_sync_with_fs( refreshable_credentials._refresh, fs ) def credential_provider_fn(): # refresh and access the credentials refresh_sync() expiry_time = refreshable_credentials._expiry_time return { "aws_access_key_id": refreshable_credentials._access_key, "aws_secret_access_key": refreshable_credentials._secret_key, "aws_session_token": refreshable_credentials._token, }, int(expiry_time.timestamp()) if expiry_time is not None else None polars_options["credential_provider"] = 
credential_provider_fn polars_options["storage_options"] = storage_options return polars_options @contextmanager def _open_polars_lazy_df( paths: UPath | list[UPath], use_fsspec: bool = False, **kwargs ) -> Iterator[PolarsLazyFrame]: try: import polars as pl except ImportError as ie: raise ImportError("Please install polars: pip install polars") from ie scans = { ".parquet": pl.scan_parquet, ".csv": pl.scan_csv, ".ndjson": pl.scan_ndjson, ".ipc": pl.scan_ipc, } path_list = [] paths_list = paths if isinstance(paths, list) else [paths] for path in paths_list: # assume http is always a file if path.protocol not in {"http", "https"} and path.is_dir(): path_list += [p for p in path.rglob("*") if p.suffix != ""] else: path_list.append(path) # assume the filesystem is the same for all # it is checked in _open_dataframe path0 = path_list[0] if ( not use_fsspec and path0.protocol == "s3" and "storage_options" not in kwargs and "credential_provider" not in kwargs ): kwargs.update(_polars_options(path0)) open_files = [] try: for path in path_list: open_files.append(path.open(mode="rb") if use_fsspec else path.as_posix()) yield scans[path_list[0].suffix](open_files, **kwargs) finally: if use_fsspec: for open_file in open_files: open_file.close() ================================================ FILE: lamindb/core/storage/_pyarrow_dataset.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING import pyarrow.dataset from lamindb_setup.core.upath import LocalPathClasses if TYPE_CHECKING: from pyarrow.dataset import Dataset as PyArrowDataset from upath import UPath PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather", ".ipc") def _open_pyarrow_dataset(paths: UPath | list[UPath], **kwargs) -> PyArrowDataset: if isinstance(paths, list): # a single path can be a directory, but a list of paths # has to be a flat list of files paths_str = [] path0 = paths[0] if isinstance(path0, LocalPathClasses): path_to_str = lambda p: p.as_posix() filesystem = None else: path_to_str = lambda p: p.path filesystem = path0.fs for path in paths: if ( getattr(path, "protocol", None) not in {"http", "https"} and path.is_dir() ): paths_str += [path_to_str(p) for p in path.rglob("*") if p.suffix != ""] else: paths_str.append(path_to_str(path)) elif isinstance(paths, LocalPathClasses): paths_str, filesystem = paths.as_posix(), None else: paths_str, filesystem = paths.path, paths.fs return pyarrow.dataset.dataset(paths_str, filesystem=filesystem, **kwargs) ================================================ FILE: lamindb/core/storage/_spatialdata_accessor.py ================================================ from __future__ import annotations from functools import cached_property from typing import TYPE_CHECKING from ._anndata_accessor import AnnDataAccessor if TYPE_CHECKING: from zarr import Group from lamindb import Artifact class _TablesAccessor: def __init__(self, tables: Group, artifact: Artifact | None = None): self._tables = tables self._artifact = artifact def __getitem__(self, key: str) -> AnnDataAccessor: return AnnDataAccessor( connection=None, storage=self._tables[key], filename=key, artifact=self._artifact, ) def keys(self) -> list[str]: return list(self._tables.keys()) def __repr__(self) -> str: """Description of the _TablesAccessor object.""" descr = ( f"Accessor for the SpatialData attribute tables\n with keys: {self.keys()}" ) return descr class SpatialDataAccessor: """Cloud-backed SpatialData. For now only allows to access `tables`. 
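    Example (the artifact key and table name are illustrative)::

        import lamindb as ln

        # opening a SpatialData zarr artifact yields a SpatialDataAccessor
        access = ln.Artifact.get(key="visium.spatialdata.zarr").open()
        access.tables.keys()            # available table names
        adata = access.tables["table"]  # AnnDataAccessor for one table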
""" def __init__(self, storage: Group, name: str, artifact: Artifact | None = None): self.storage = storage self._name = name self._artifact = artifact @cached_property def tables(self) -> _TablesAccessor: """tables of the underlying SpatialData object.""" return _TablesAccessor(self.storage["tables"], self._artifact) def __repr__(self): """Description of the SpatialDataAccessor object.""" descr = ( "SpatialDataAccessor object" f"\n constructed for the SpatialData object {self._name}" f"\n with tables: {self.tables.keys()}" ) return descr ================================================ FILE: lamindb/core/storage/_tiledbsoma.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Literal from urllib.parse import urlparse import pandas as pd import pyarrow as pa from anndata import AnnData, read_h5ad from lamin_utils import logger from lamindb_setup import settings as setup_settings from lamindb_setup.core.upath import ( LocalPathClasses, _ensure_sync_with_fs, create_path, get_storage_region, ) from packaging import version if TYPE_CHECKING: from lamindb_setup.types import AnyPathStr from tiledbsoma import Collection as SOMACollection from tiledbsoma import Experiment as SOMAExperiment from tiledbsoma import Measurement as SOMAMeasurement from tiledbsoma import SOMATileDBContext from upath import UPath from lamindb.models.artifact import Artifact from lamindb.models.run import Run def _load_h5ad_zarr(objpath: UPath): from lamindb.core.loaders import load_h5ad, load_zarr if objpath.is_dir(): adata = load_zarr(objpath, expected_type="anndata") else: # read only local in backed for now # in principle possible to read remote in backed also if isinstance(objpath, LocalPathClasses): adata = read_h5ad(objpath.as_posix(), backed="r") else: adata = load_h5ad(objpath) return adata class SOMAS3ContextFactory: """Prepares and caches soma.SOMATileDBContext for a given storepath. For S3 storage with federated credentials, credentials are read and refreshed only when the store is opened—i.e. when :meth:`get_context` is called as part of opening the TileDB-SOMA store. They are not updated while a store handle is held open. If credentials expire during a long-lived session, close the store and open it again to refresh. 
""" def __init__(self, storepath: UPath): from tiledbsoma import SOMATileDBContext self._refreshable_credentials = None fs = storepath.fs fs.connect() self._fs = fs tiledb_config = {} endpoint_url = fs.endpoint_url if endpoint_url is not None: tiledb_config["vfs.s3.region"] = "" tiledb_config["vfs.s3.use_virtual_addressing"] = "false" parsed = urlparse(endpoint_url) tiledb_config["vfs.s3.scheme"] = parsed.scheme tiledb_config["vfs.s3.endpoint_override"] = ( parsed._replace(scheme="").geturl().lstrip("/") ) else: tiledb_config["vfs.s3.region"] = get_storage_region(storepath) if fs.anon: tiledb_config["vfs.s3.no_sign_request"] = "true" tiledb_config["vfs.s3.aws_access_key_id"] = "" tiledb_config["vfs.s3.aws_secret_access_key"] = "" tiledb_config["vfs.s3.aws_session_token"] = "" else: aws_key = fs.key aws_secret = fs.secret aws_token = fs.token if aws_key is not None and aws_secret is not None: tiledb_config["vfs.s3.aws_access_key_id"] = aws_key tiledb_config["vfs.s3.aws_secret_access_key"] = aws_secret if aws_token is not None: tiledb_config["vfs.s3.aws_session_token"] = aws_token else: from aiobotocore.credentials import AioRefreshableCredentials if isinstance( refreshable_credentials := fs.session._credentials, AioRefreshableCredentials, ): self._refreshable_credentials = refreshable_credentials tiledb_config.update(self._extract_refreshable_credentials()) self._context = SOMATileDBContext(tiledb_config=tiledb_config) def _extract_refreshable_credentials(self) -> dict: tiledb_config: dict[str, str] = {} refreshable_credentials = self._refreshable_credentials if refreshable_credentials is None: return tiledb_config # refresh and retrieve the credentials _ensure_sync_with_fs(refreshable_credentials._refresh, self._fs)() tiledb_config["vfs.s3.aws_access_key_id"] = refreshable_credentials._access_key tiledb_config["vfs.s3.aws_secret_access_key"] = ( refreshable_credentials._secret_key ) if (aws_token := refreshable_credentials._token) is not None: tiledb_config["vfs.s3.aws_session_token"] = aws_token return tiledb_config def get_context(self) -> SOMATileDBContext: # update the credentials if needed and return the updated context refreshed_credentials = self._extract_refreshable_credentials() if refreshed_credentials: self._context = self._context.replace(tiledb_config=refreshed_credentials) return self._context def _open_tiledbsoma( storepath: UPath, mode: Literal["r", "w"] = "r" ) -> SOMACollection | SOMAExperiment | SOMAMeasurement: """Open a TileDB-SOMA store for the given path. For S3 paths with federated credentials, credentials are refreshed at open time only (see :class:`SOMAS3ContextFactory`). 
""" try: import tiledbsoma as soma except ImportError as e: raise ImportError("Please install tiledbsoma: pip install tiledbsoma") from e storepath_str = storepath.as_posix() if storepath.protocol == "s3": ctx = SOMAS3ContextFactory(storepath).get_context() # this is a strange bug # for some reason iterdir futher gives incorrect results # if cache is not invalidated # instead of obs and ms it gives ms and ms in the list of names storepath.fs.invalidate_cache() else: ctx = None soma_objects = [obj.name for obj in storepath.iterdir()] if "obs" in soma_objects and "ms" in soma_objects: SOMAType = soma.Experiment elif "var" in soma_objects: SOMAType = soma.Measurement else: SOMAType = soma.Collection return SOMAType.open(storepath_str, mode=mode, context=ctx) def save_tiledbsoma_experiment( # Artifact args adatas: list[AnnData | AnyPathStr], key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, # tiledbsoma.io.from_anndata args measurement_name: str = "RNA", obs_id_name: str = "obs_id", var_id_name: str = "var_id", append_obsm_varm: bool = False, # additional keyword args for tiledbsoma.io.from_anndata **kwargs, ) -> Artifact: """Write `AnnData` to `tiledbsoma.Experiment`. Reads `AnnData` objects, writes them to `tiledbsoma.Experiment`, creates & saves an :class:`~lamindb.Artifact`. Populates a column `lamin_run_uid` column in `obs` with the current `run.uid`. Is based on `tiledbsoma.io.from_anndata `__. Args: adatas: `AnnData` objects to write, in-memory or on-disk. key: An optional key to reference the artifact. description: A description. run: The run that creates the artifact. revises: `lamindb.Artifact` with `tiledbsoma.Experiment` to append to. measurement_name: The name of the measurement to store data in `tiledbsoma.Experiment`. obs_id_name: Which `AnnData` `obs` column to use for append mode. var_id_name: Which `AnnData` `var` column to use for append mode. append_obsm_varm: Whether to append `obsm` and `varm` in append mode . **kwargs: Keyword arguments passed to `tiledbsoma.io.from_anndata`. Note: For S3 storage with federated credentials, credentials are updated only when the store is opened for each write step, not while a store handle is held open. Retry if credentials expire during a long write operation. 
""" try: import tiledbsoma as soma import tiledbsoma.io as soma_io except ImportError as e: raise ImportError("Please install tiledbsoma: pip install tiledbsoma") from e from lamindb.core.storage.paths import auto_storage_key_from_artifact_uid from lamindb.models import Artifact from lamindb.models._is_versioned import create_uid from lamindb.models.artifact import get_run run = get_run(run) appending = revises is not None if appending: storepath = revises.path else: uid, _ = create_uid(n_full_id=20) storage_key = auto_storage_key_from_artifact_uid( uid, ".tiledbsoma", overwrite_versions=True ) storepath = setup_settings.storage.root / storage_key if storepath.protocol == "s3": # type: ignore ctx_factory = SOMAS3ContextFactory(storepath) else: ctx_factory = None storepath_str = storepath.as_posix() add_run_uid = True run_uid_dtype = "category" if appending: ctx = None if ctx_factory is None else ctx_factory.get_context() with soma.Experiment.open(storepath_str, mode="r", context=ctx) as store: obs_schema = store["obs"].schema add_run_uid = "lamin_run_uid" in obs_schema.names # this is needed to enable backwards compatibility with tiledbsoma stores # created before PR 2300 if add_run_uid: column_type = obs_schema.types[obs_schema.names.index("lamin_run_uid")] if not isinstance(column_type, pa.DictionaryType): run_uid_dtype = None if add_run_uid and run is None: raise ValueError("Pass `run`") adata_objects = [] for adata in adatas: if isinstance(adata, AnnData): if add_run_uid and adata.is_view: raise ValueError( "Can not write an `AnnData` view, please do `adata.copy()` before passing." ) else: adata = _load_h5ad_zarr(create_path(adata)) if add_run_uid: adata.obs["lamin_run_uid"] = pd.Series( run.uid, index=adata.obs.index, dtype=run_uid_dtype ) adata_objects.append(adata) registration_mapping = kwargs.get("registration_mapping", None) if registration_mapping is None and (appending or len(adata_objects) > 1): ctx = None if ctx_factory is None else ctx_factory.get_context() registration_mapping = soma_io.register_anndatas( experiment_uri=storepath_str if appending else None, adatas=adata_objects, measurement_name=measurement_name, obs_field_name=obs_id_name, var_field_name=var_id_name, append_obsm_varm=append_obsm_varm, context=ctx, ) prepare_experiment = False resize_experiment = False if registration_mapping is not None: soma_version_parsed = version.parse(soma.__version__) if soma_version_parsed < version.parse("1.15.0rc4"): n_observations = len(registration_mapping.obs_axis.data) else: n_observations = registration_mapping.get_obs_shape() prepare_experiment = soma_version_parsed >= version.parse("1.16.2") resize_experiment = not prepare_experiment else: # happens only if not appending and only one adata passed assert len(adata_objects) == 1 # noqa: S101 n_observations = adata_objects[0].n_obs logger.important(f"writing the tiledbsoma store to {storepath_str}") experiment_exists: bool | None = None for adata_obj in adata_objects: # do not recheck if True if not experiment_exists and (resize_experiment or prepare_experiment): ctx = None if ctx_factory is None else ctx_factory.get_context() experiment_exists = soma.Experiment.exists(storepath_str, context=ctx) if experiment_exists: # both can only happen if registration_mapping is not None if resize_experiment: ctx = None if ctx_factory is None else ctx_factory.get_context() soma_io.resize_experiment( storepath_str, nobs=n_observations, nvars=registration_mapping.get_var_shapes(), context=ctx, ) resize_experiment = False elif 
prepare_experiment: ctx = None if ctx_factory is None else ctx_factory.get_context() registration_mapping.prepare_experiment(storepath_str, context=ctx) prepare_experiment = False registration_mapping_write = ( registration_mapping.subset_for_anndata(adata_obj) if hasattr(registration_mapping, "subset_for_anndata") else registration_mapping ) ctx = None if ctx_factory is None else ctx_factory.get_context() soma_io.from_anndata( storepath_str, adata_obj, measurement_name, context=ctx, obs_id_name=obs_id_name, var_id_name=var_id_name, registration_mapping=registration_mapping_write, **kwargs, ) artifact = Artifact( # type: ignore storepath, key=key, description=description, run=run, revises=revises, _is_internal_call=True, ) artifact.n_observations = n_observations artifact.otype = "tiledbsoma" return artifact.save() # this is less defensive than _anndata_n_observations # this doesn't really catches errors # assumes that the tiledbsoma object is well-formed def _soma_store_n_observations(obj) -> int: if obj.soma_type in {"SOMADataFrame", "SOMASparseNDArray", "SOMADenseNDArray"}: return obj.non_empty_domain()[0][1] + 1 elif obj.soma_type == "SOMAExperiment": return _soma_store_n_observations(obj["obs"]) elif obj.soma_type == "SOMAMeasurement": keys = obj.keys() for slot in ("X", "obsm", "obsp"): if slot in keys: return _soma_store_n_observations(next(iter(obj[slot].values()))) elif obj.soma_type == "SOMACollection": n_obs = 0 for value in obj.values(): n_obs += _soma_store_n_observations(value) return n_obs raise ValueError( "Could not infer the number of observations from the tiledbsoma object." ) def _soma_n_observations(objectpath: UPath) -> int: with _open_tiledbsoma(objectpath, mode="r") as store: return _soma_store_n_observations(store) ================================================ FILE: lamindb/core/storage/_valid_suffixes.py ================================================ from __future__ import annotations from lamindb_setup.core.upath import VALID_COMPOSITE_SUFFIXES, VALID_SIMPLE_SUFFIXES # add new composite suffixes like so VALID_COMPOSITE_SUFFIXES.update( { ".vitessce.json", ".ome.zarr", } ) # can do the same for simple valid suffixes class VALID_SUFFIXES: """Valid suffixes.""" SIMPLE: set[str] = VALID_SIMPLE_SUFFIXES """Simple suffixes.""" COMPOSITE: set[str] = VALID_COMPOSITE_SUFFIXES """Composite suffixes.""" ================================================ FILE: lamindb/core/storage/_zarr.py ================================================ from __future__ import annotations from importlib.metadata import version as get_version from typing import TYPE_CHECKING, Literal import zarr from lamin_utils import logger from lamindb_setup.core.upath import LocalPathClasses, S3FSMap, UPath, create_mapper from packaging import version from lamindb.core._compat import with_package if version.parse(get_version("anndata")) < version.parse("0.11.0"): from anndata._io import read_zarr as read_anndata_zarr else: from anndata.io import read_zarr as read_anndata_zarr if version.parse(zarr.__version__) >= version.parse("3.0.0a0"): IS_ZARR_V3 = True from zarr.abc.store import Store else: IS_ZARR_V3 = False from zarr.storage import Store # noqa if TYPE_CHECKING: from fsspec import FSMap from lamindb_setup.types import AnyPathStr from lamindb.core.storage.types import ScverseDataStructures def get_zarr_store( path: AnyPathStr, *, check: bool = False, create: bool = False ) -> str | S3FSMap | FSMap | Store: """Creates the correct object that can be used to open a zarr file depending on local or 
remote location.""" storepath, storepath_str = UPath(path), str(path) if isinstance(storepath, LocalPathClasses): store = storepath_str elif IS_ZARR_V3: # todo: also check how to treat non-asynchronous filesystems # zarr has something for this, using fsspec async wrapper # check FsspecStore code store = zarr.storage.FsspecStore.from_upath(UPath(storepath, asynchronous=True)) else: store = create_mapper(storepath.fs, storepath_str, check=check, create=create) return store def _identify_zarr_type_from_storage( storage: zarr.Group, ) -> Literal["anndata", "mudata", "spatialdata", "unknown"]: """Internal helper to identify zarr type from an open storage object.""" try: if storage.attrs.get("encoding-type", "") == "anndata": return "anndata" elif storage.attrs.get("encoding-type", "") == "MuData": return "mudata" elif "spatialdata_attrs" in storage.attrs: return "spatialdata" except Exception as error: logger.warning(f"an exception occurred {error}") return "unknown" def identify_zarr_type( storepath: AnyPathStr, *, check: bool = True ) -> Literal["anndata", "mudata", "spatialdata", "unknown"]: """Identify whether a zarr store is AnnData, SpatialData, or unknown type.""" suffixes = UPath(storepath).suffixes if ".anndata" in suffixes: return "anndata" elif ".mudata" in suffixes: return "mudata" elif ".spatialdata" in suffixes: return "spatialdata" store = get_zarr_store(storepath, check=check) try: storage = zarr.open(store, mode="r") return _identify_zarr_type_from_storage(storage) except Exception as error: logger.warning( f"an exception occured while trying to open the zarr store\n {error}" ) return "unknown" def load_zarr( storepath: AnyPathStr, expected_type: Literal["anndata", "mudata", "spatialdata"] = None, ) -> ScverseDataStructures: """Loads a zarr store and returns the corresponding scverse data structure. Args: storepath: Path to the zarr store expected_type: If provided, ensures the zarr store is of this type ("anndata", "mudata", "spatialdata") and raises ValueError if it's not """ store = get_zarr_store(storepath, check=True) # Open the storage once try: storage = zarr.open(store, mode="r") except Exception as error: raise ValueError(f"Could not open zarr store: {error}") from None actual_type = _identify_zarr_type_from_storage(storage) if expected_type is not None and actual_type != expected_type: raise ValueError( f"Expected zarr store of type '{expected_type}', but found '{actual_type}'" ) match actual_type: case "anndata": scverse_obj = read_anndata_zarr(store) case "mudata": scverse_obj = with_package("mudata", lambda mod: mod.read_zarr(store)) case "spatialdata": scverse_obj = with_package("spatialdata", lambda mod: mod.read_zarr(store)) case "unknown" | _: raise ValueError( "Unable to determine zarr store format and therefore cannot load Artifact." 
) return scverse_obj ================================================ FILE: lamindb/core/storage/objects.py ================================================ from __future__ import annotations from pathlib import Path from typing import TYPE_CHECKING, Any, TypeAlias from lamindb.core._compat import ( with_package_obj, ) if TYPE_CHECKING: from pandas import DataFrame from .types import ScverseDataStructures SupportedDataTypes: TypeAlias = DataFrame | ScverseDataStructures else: SupportedDataTypes: TypeAlias = Any def infer_suffix( dmem: SupportedDataTypes, format: str | dict[str, Any] | None = None ) -> str: """Infer LaminDB storage file suffix from a data object.""" has_anndata, anndata_suffix = with_package_obj( dmem, "AnnData", "anndata", lambda obj: _infer_anndata_suffix(format), ) if has_anndata: return anndata_suffix has_dataframe, dataframe_suffix = with_package_obj( dmem, "DataFrame", "pandas", lambda obj: _infer_dataframe_suffix(format), ) if has_dataframe: return dataframe_suffix if with_package_obj( dmem, "MuData", "mudata", lambda obj: True, # Just checking type, not calling any method )[0]: return ".h5mu" has_spatialdata, spatialdata_suffix = with_package_obj( dmem, "SpatialData", "spatialdata", lambda obj: _infer_spatialdata_suffix(format), ) if has_spatialdata: return spatialdata_suffix else: raise NotImplementedError def _infer_anndata_suffix(format: str | dict[str, Any] | None) -> str: assert not isinstance(format, dict) # noqa: S101 if format is not None: # should be `.h5ad`, `.zarr`, or `.anndata.zarr` if format not in {"h5ad", "zarr", "anndata.zarr"}: raise ValueError( "Error when specifying AnnData storage format, it should be" f" 'h5ad', 'zarr', or 'anndata.zarr', not '{format}'. Check 'format'" " or the suffix of 'key'." ) return "." + format return ".h5ad" def _infer_dataframe_suffix(format: str | dict[str, Any] | None) -> str: if isinstance(format, str): if format == ".csv": return ".csv" elif isinstance(format, dict): if format.get("suffix") == ".csv": return ".csv" return ".parquet" def _infer_spatialdata_suffix(format: str | dict[str, Any] | None) -> str: if format is None: return ".zarr" if isinstance(format, str) and format in {"spatialdata.zarr", "zarr"}: return format raise ValueError( "Error when specifying SpatialData storage format, it should be" f" 'zarr' or 'spatialdata.zarr', not '{format}'. Check 'format'" " or the suffix of 'key'."
) # for types below note that local UPaths are subclasses of Path # Path(UPath(...)) properly coerces local UPaths and throws an error for cloud UPaths def write_to_disk(dmem: SupportedDataTypes, filepath: Path | str, **kwargs) -> None: """Writes the passed in memory data to disk to a specified path.""" if with_package_obj( dmem, "AnnData", "anndata", lambda obj: _write_anndata(obj, filepath, **kwargs), )[0]: return if with_package_obj( dmem, "DataFrame", "pandas", lambda obj: _write_dataframe(obj, filepath, **kwargs), )[0]: return if with_package_obj(dmem, "MuData", "mudata", lambda obj: obj.write(filepath))[0]: return if with_package_obj( dmem, "SpatialData", "spatialdata", lambda obj: obj.write(filepath, overwrite=True), )[0]: return raise NotImplementedError def _write_anndata(dmem: Any, filepath: Path | str, **kwargs) -> None: suffix = Path(filepath).suffix if suffix == ".h5ad": dmem.write_h5ad(filepath, **kwargs) return elif suffix == ".zarr": dmem.write_zarr(filepath, **kwargs) return else: raise NotImplementedError def _write_dataframe(dmem: Any, filepath: Path | str, **kwargs) -> None: suffix = Path(filepath).suffix if suffix == ".csv": dmem.to_csv(filepath, **kwargs) return dmem.to_parquet(filepath, **kwargs) ================================================ FILE: lamindb/core/storage/paths.py ================================================ from __future__ import annotations import shutil from typing import TYPE_CHECKING import fsspec from lamindb_setup.core import StorageSettings from lamindb_setup.core.upath import ( LocalPathClasses, UPath, ) from lamindb.core._settings import settings if TYPE_CHECKING: from lamindb_setup.types import AnyPath, AnyPathStr from lamindb.models.artifact import Artifact AUTO_KEY_PREFIX = ".lamindb/" # add type annotations back asap when re-organizing the module def auto_storage_key_from_artifact(artifact: Artifact): if (real_key := artifact._real_key) is not None: return real_key key = artifact.key if key is None or artifact._key_is_virtual: return auto_storage_key_from_artifact_uid( artifact.uid, artifact.suffix, artifact.overwrite_versions ) return artifact.key def auto_storage_key_from_artifact_uid( uid: str, suffix: str, overwrite_versions: bool ) -> str: assert isinstance(suffix, str) # noqa: S101 Suffix cannot be None. 
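    # illustrative example (uid is hypothetical): uid="aB3dE5fG7hI9kL1mN3oP", suffix=".parquet",
    # overwrite_versions=True -> ".lamindb/aB3dE5fG7hI9kL1m.parquet" (uid truncated to 16 chars so all versions share one key)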
if overwrite_versions: uid_storage = uid[:16] # 16 chars, leave 4 chars for versioning else: uid_storage = uid storage_key = f"{AUTO_KEY_PREFIX}{uid_storage}{suffix}" return storage_key def check_path_is_child_of_root(path: AnyPathStr, root: AnyPathStr) -> bool: if fsspec.utils.get_protocol(str(path)) != fsspec.utils.get_protocol(str(root)): return False path_upath = UPath(path) root_upath = UPath(root) if path_upath.protocol == "s3": endpoint_path = path_upath.storage_options.get("endpoint_url", "") endpoint_root = root_upath.storage_options.get("endpoint_url", "") if endpoint_path != endpoint_root: return False # we don't resolve http links because they can resolve into a different domain # for example into a temporary url if path_upath.protocol not in {"http", "https"}: path_upath = path_upath.resolve() root_upath = root_upath.resolve() # str is needed to eliminate UPath storage_options # which affect equality checks return UPath(str(root_upath)) in UPath(str(path_upath)).parents # returns filepath and root of the storage def attempt_accessing_path( artifact: Artifact, storage_key: str, using_key: str | None = None, access_token: str | None = None, ) -> tuple[UPath, StorageSettings]: # check whether the file is in the default db and whether storage # matches default storage from lamindb.models import Storage if ( artifact._state.db in ("default", None) and artifact.storage_id == settings._storage_settings._id ): if access_token is None: storage_settings = settings._storage_settings else: storage_settings = StorageSettings( settings.storage.root, access_token=access_token ) else: if artifact._state.db not in ("default", None) and using_key is None: storage = Storage.connect(artifact._state.db).get(id=artifact.storage_id) else: storage = Storage.objects.using(using_key).get(id=artifact.storage_id) # find a better way than passing None to instance_settings in the future! 
storage_settings = StorageSettings(storage.root, access_token=access_token) path = storage_settings.key_to_filepath(storage_key) return path, storage_settings def filepath_from_artifact( artifact: Artifact, using_key: str | None = None ) -> tuple[UPath, StorageSettings | None]: if (local_filepath := getattr(artifact, "_local_filepath", None)) is not None: return local_filepath.resolve(), None storage_key = auto_storage_key_from_artifact(artifact) path, storage_settings = attempt_accessing_path( artifact, storage_key, using_key=using_key ) return path, storage_settings # virtual key is taken into consideration # only if the version is latest def _cache_key_from_artifact_storage( artifact: Artifact, storage_settings: StorageSettings | None ): cache_key = None if ( artifact._key_is_virtual and artifact.key is not None and storage_settings is not None and artifact.is_latest ): root = storage_settings.root cache_key = (root / artifact.key).path # .path does not strip protocol for http # have to do it manually if root.protocol in {"http", "https"}: cache_key = cache_key.split("://", 1)[-1] return cache_key # return filepath and cache_key if needed def filepath_cache_key_from_artifact( artifact: Artifact, using_key: str | None = None ) -> tuple[UPath, str | None]: filepath, storage_settings = filepath_from_artifact(artifact, using_key) if isinstance(filepath, LocalPathClasses): return filepath, None cache_key = _cache_key_from_artifact_storage(artifact, storage_settings) return filepath, cache_key def store_file_or_folder( local_path: AnyPathStr, storage_path: UPath, print_progress: bool = True, **kwargs ) -> None: """Store file or folder (localpath) at storagepath.""" local_path = UPath(local_path) if not isinstance(storage_path, LocalPathClasses): # this uploads files and directories if local_path.is_dir(): create_folder = False try: # if storage_path already exists we need to delete it # if local_path is a directory # to replace storage_path correctly if storage_path.stat().as_info()["type"] == "directory": storage_path.rmdir() else: storage_path.unlink() except (FileNotFoundError, PermissionError): pass else: create_folder = None storage_path.upload_from( local_path, create_folder=create_folder, print_progress=print_progress, **kwargs, ) else: # storage path is local if local_path.resolve().as_posix() == storage_path.resolve().as_posix(): return None storage_path.parent.mkdir(parents=True, exist_ok=True) if local_path.is_file(): shutil.copyfile(local_path, storage_path) else: if storage_path.exists(): shutil.rmtree(storage_path) shutil.copytree(local_path, storage_path) def delete_storage_using_key( artifact: Artifact, storage_key: str, raise_file_not_found_error: bool = True, using_key: str | None = None, ) -> None | str: filepath, _ = attempt_accessing_path(artifact, storage_key, using_key=using_key) return delete_storage( filepath, raise_file_not_found_error=raise_file_not_found_error ) def delete_storage( storagepath: AnyPath, raise_file_not_found_error: bool = True ) -> None | str: """Delete arbitrary artifact.""" if storagepath.is_file(): storagepath.unlink() elif storagepath.is_dir(): if isinstance(storagepath, LocalPathClasses): shutil.rmtree(storagepath) else: storagepath.rmdir() elif raise_file_not_found_error: raise FileNotFoundError(f"{storagepath} is not an existing path!") else: return "did-not-delete" return None ================================================ FILE: lamindb/core/storage/types.py ================================================ """Storage-related type 
definitions.""" from __future__ import annotations from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from anndata import AnnData from mudata import MuData from spatialdata import SpatialData ScverseDataStructures = AnnData | MuData | SpatialData else: # AnnData | MuData | SpatialData; Any required for union with DataFrame in objects.py ScverseDataStructures = Any ================================================ FILE: lamindb/core/subsettings/__init__.py ================================================ """Sub settings. .. autoclass:: CreationSettings .. autoclass:: AnnotationSettings """ from ._annotation_settings import AnnotationSettings from ._creation_settings import CreationSettings ================================================ FILE: lamindb/core/subsettings/_annotation_settings.py ================================================ class AnnotationSettings: n_max_records: int = 1000 """Maximal number of records to annotate with during automated annotation. If the number of records to annotate exceeds this limit, print a warning and do not annotate. The number is calculated per feature for labels, and per schema for features. """ annotation_settings = AnnotationSettings() ================================================ FILE: lamindb/core/subsettings/_creation_settings.py ================================================ class CreationSettings: search_names: bool = True """Switch off to speed up creating records (default `True`). If `True`, search for alternative names and avoids duplicates. FAQ: :doc:`/faq/idempotency` """ artifact_skip_size_hash: bool = False """To speed up registering high numbers of files (default `False`). This bypasses queries for size and hash to AWS & GCP. It speeds up file creation by about a factor 100. """ artifact_silence_missing_run_warning: bool = False """Silence warning about missing run & transform during artifact creation (default `False`).""" _artifact_use_virtual_keys: bool = True """Treat `key` parameter in :class:`~lamindb.Artifact` as virtual. If `True`, the `key` is **not** used to construct file paths, but file paths are based on the `uid` of artifact. """ creation_settings = CreationSettings() ================================================ FILE: lamindb/curators/__init__.py ================================================ """Curators. High-level curators ------------------- .. autoclass:: DataFrameCurator .. autoclass:: AnnDataCurator .. autoclass:: MuDataCurator .. autoclass:: SpatialDataCurator .. autoclass:: TiledbsomaExperimentCurator Low-level module ---------------- .. autosummary:: :toctree: . core """ from typing import TYPE_CHECKING if TYPE_CHECKING: from .core import ( AnnDataCurator, DataFrameCurator, MuDataCurator, SpatialDataCurator, TiledbsomaExperimentCurator, ) __all__ = [ "AnnDataCurator", "DataFrameCurator", "MuDataCurator", "SpatialDataCurator", "TiledbsomaExperimentCurator", ] _CURATOR_NAMES = frozenset(__all__) def __getattr__(name: str): """Lazy-import curators from core to avoid loading pandas/pandera at import.""" if name in _CURATOR_NAMES: from . import core attr = getattr(core, name) globals()[name] = attr return attr raise AttributeError(f"module {__name__!r} has no attribute {name!r}") ================================================ FILE: lamindb/curators/core.py ================================================ """Curator utilities. .. autoclass:: Curator .. autoclass:: SlotsCurator .. autoclass:: ComponentCurator .. autoclass:: CatVector .. autoclass:: CatLookup .. 
autoclass:: DataFrameCatManager """ from __future__ import annotations import copy import re from typing import TYPE_CHECKING, Any, Callable import lamindb_setup as ln_setup import numpy as np import pandas as pd import pandera.pandas as pandera from django.db.models import Q from lamin_utils import colors, logger from lamindb_setup.core._docs import doc_args from lamindb_setup.core.upath import LocalPathClasses from lamindb.base.dtypes import check_dtype from lamindb.base.types import FieldAttr # noqa from lamindb.models import ( Artifact, Feature, Run, Schema, SQLRecord, ) from lamindb.models._from_values import _format_values, _from_values from lamindb.models.artifact import ( data_is_scversedatastructure, data_is_soma_experiment, ) from lamindb.models.feature import ( parse_cat_dtype, parse_dtype, parse_filter_string, resolve_relation_filters, ) from lamindb.models.query_set import BasicQuerySet, SQLRecordList from lamindb.models.sqlrecord import HasType from ..errors import InvalidArgument, ValidationError from ..models._from_values import get_organism_record_from_field from ..models.feature import get_record_type_from_uid if TYPE_CHECKING: from collections.abc import Iterable from typing import Any from anndata import AnnData from mudata import MuData from spatialdata import SpatialData from tiledbsoma._experiment import Experiment as SOMAExperiment from lamindb.core.storage.types import ScverseDataStructures def strip_ansi_codes(text): # This pattern matches ANSI escape sequences ansi_pattern = re.compile(r"\x1b\[[0-9;]*m") return ansi_pattern.sub("", text) class CatLookup: """Lookup categories from the reference instance. Args: categoricals: A dictionary of categorical fields to lookup. slots: A dictionary of slot fields to lookup. public: Whether to lookup from the public instance. Defaults to False. Example:: curator = ln.curators.DataFrameCurator(...) 
curator.cat.lookup()["cell_type"].alveolar_type_1_fibroblast_cell """ def __init__( self, categoricals: list[Feature] | dict[str, FieldAttr], slots: dict[str, FieldAttr] = None, public: bool = False, sources: dict[str, SQLRecord] | None = None, ) -> None: slots = slots or {} if isinstance(categoricals, list): categoricals = { feature.name: parse_dtype(feature._dtype_str)[0]["field"] for feature in categoricals } self._categoricals = {**categoricals, **slots} self._public = public self._sources = sources def __getattr__(self, name): if name in self._categoricals: registry = self._categoricals[name].field.model if self._public and hasattr(registry, "public"): return registry.public(source=self._sources.get(name)).lookup() else: return registry.lookup() raise AttributeError( f'"{self.__class__.__name__}" object has no attribute "{name}"' ) def __getitem__(self, name): if name in self._categoricals: registry = self._categoricals[name].field.model if self._public and hasattr(registry, "public"): return registry.public(source=self._sources.get(name)).lookup() else: return registry.lookup() raise AttributeError( f'"{self.__class__.__name__}" object has no attribute "{name}"' ) def __repr__(self) -> str: if len(self._categoricals) > 0: getattr_keys = "\n ".join( [f".{key}" for key in self._categoricals if key.isidentifier()] ) getitem_keys = "\n ".join( [str([key]) for key in self._categoricals if not key.isidentifier()] ) ref = "public" if self._public else "registries" return ( f"Lookup objects from the {colors.italic(ref)}:\n " f"{colors.green(getattr_keys)}\n " f"{colors.green(getitem_keys)}\n" 'Example:\n → categories = curator.lookup()["cell_type"]\n' " → categories.alveolar_type_1_fibroblast_cell\n\n" "To look up public ontologies, use .lookup(public=True)" ) else: # pragma: no cover return colors.warning("No fields are found!") CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries.""" SLOTS_DOCSTRING = """Access sub curators by slot.""" SLOTS_DETAILS_DOCSTRING = """Uses **slots** to specify which component contains which schema. Slots are keys that identify where features are stored within composite data structures.""" VALIDATE_DOCSTRING = """Validate dataset against Schema. Raises: lamindb.errors.ValidationError: If validation fails. """ SAVE_ARTIFACT_DOCSTRING = """Save an annotated artifact. Args: key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family. description: A description. revises: Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version. run: The run that creates the artifact. Returns: A saved artifact record. """ LAMINDB_COLUMN_PREFIX_REGEX = r"^__lamindb_.*$" class Curator: """Curator base class. A `Curator` object makes it easy to validate, standardize & annotate datasets. See: - :class:`~lamindb.curators.DataFrameCurator` - :class:`~lamindb.curators.AnnDataCurator` - :class:`~lamindb.curators.MuDataCurator` - :class:`~lamindb.curators.SpatialDataCurator` - :class:`~lamindb.curators.TiledbsomaExperimentCurator` """ def __init__( self, dataset: Any, schema: Schema, *, features: dict[str, Any] | None = None, require_saved_schema: bool = True, ) -> None: if not isinstance(schema, Schema): raise InvalidArgument("schema argument must be a Schema record.") if require_saved_schema and schema.pk is None: raise ValueError( "Schema must be saved before curation. Please save it using '.save()'." 
) self._artifact: Artifact | None = None self._dataset: Any = None # self._dataset is set below, it is opened or loaded if dataset is an Artifact if isinstance(dataset, Artifact): self._artifact = dataset if self._artifact.otype in { "DataFrame", "AnnData", "MuData", "SpatialData", }: if ( not isinstance(self._artifact.path, LocalPathClasses) and self._artifact.otype == "AnnData" ): try: self._dataset = self._artifact.open(mode="r") logger.important( "opened remote artifact for streaming during validation" ) except Exception as e: logger.warning( f"unable to open remote AnnData Artifact: {e}, falling back to loading into memory" ) if self._dataset is None: logger.important("loading artifact into memory for validation") self._dataset = self._artifact.load(is_run_input=False) else: raise InvalidArgument( f"Cannot load or open artifact of this type: {self._artifact}" ) else: self._dataset = dataset self._schema: Schema = schema self._external_features: dict[str, Any] = features self._is_validated: bool = False @doc_args(VALIDATE_DOCSTRING) def validate(self) -> bool | str: """{}""" # noqa: D415 pass # pragma: no cover @doc_args(SAVE_ARTIFACT_DOCSTRING) def save_artifact( self, *, key: str | None = None, description: str | None = None, revises: Artifact | None = None, run: Run | None = None, ) -> Artifact: """{}""" # noqa: D415 # Note that this docstring has to be consistent with the Artifact() # constructor signature pass # pragma: no cover def __repr__(self) -> str: from lamin_utils import colors if self._schema is not None: # Schema might have different attributes if hasattr(self._schema, "name") and self._schema.name: schema_str = colors.italic(self._schema.name) elif hasattr(self._schema, "uid"): schema_str = colors.italic(f"uid={self._schema.uid}") elif hasattr(self._schema, "id"): schema_str = colors.italic(f"id={self._schema.id}") else: schema_str = colors.italic("unnamed") # Add schema type info if available if hasattr(self._schema, "otype") and self._schema.otype: schema_str += f" ({self._schema.otype})" else: schema_str = colors.warning("None") status_str = "" if self._is_validated: status_str = f", {colors.green('validated')}" else: status_str = f", {colors.yellow('unvalidated')}" cls_name = colors.green(self.__class__.__name__) # Get additional info based on curator type extra_info = "" if hasattr(self, "_slots") and self._slots: # For SlotsCurator and its subclasses slots_count = len(self._slots) if slots_count > 0: slot_names = list(self._slots.keys()) if len(slot_names) <= 3: extra_info = f", slots: {slot_names}" else: extra_info = f", slots: [{', '.join(slot_names[:3])}... +{len(slot_names) - 3} more]" elif ( cls_name == "DataFrameCurator" and hasattr(self, "cat") and hasattr(self.cat, "_categoricals") ): # For DataFrameCurator cat_count = len(getattr(self.cat, "_categoricals", [])) if cat_count > 0: extra_info = f", categorical_features={cat_count}" artifact_info = "" if self._artifact is not None: artifact_info = f", artifact: {colors.italic(self._artifact.uid)}" return ( f"{cls_name}{artifact_info}(Schema: {schema_str}{extra_info}{status_str})" ) @doc_args(SLOTS_DETAILS_DOCSTRING) class SlotsCurator(Curator): """Curator for a dataset with slots. {} Args: dataset: The dataset to validate & annotate. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. 
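    Example: a hedged sketch of composing slot schemas and accessing sub-curators; assumes
    ``obs_schema`` and ``var_schema`` are saved :class:`~lamindb.Schema` records, ``adata`` is an
    ``AnnData`` object, and that ``Schema`` accepts a ``slots`` argument as in the scripts
    referenced by :class:`~lamindb.curators.AnnDataCurator`::

        import lamindb as ln

        anndata_schema = ln.Schema(
            name="small anndata schema",
            otype="AnnData",
            slots={"obs": obs_schema, "var.T": var_schema},
        ).save()
        curator = ln.curators.AnnDataCurator(adata, anndata_schema)
        curator.slots["obs"].validate()  # validate a single slot
        curator.validate()               # validate all slots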
""" def __init__( self, dataset: Artifact | ScverseDataStructures | SOMAExperiment, schema: Schema, *, features: dict[str, Any] | None = None, require_saved_schema: bool = True, ) -> None: super().__init__( dataset=dataset, schema=schema, features=features, require_saved_schema=require_saved_schema, ) self._slots: dict[str, ComponentCurator] = {} # used for multimodal data structures (not AnnData) # in form of {table/modality_key: var_field} self._var_fields: dict[str, FieldAttr] = {} # in form of {table/modality_key: categoricals} self._cat_vectors: dict[str, dict[str, CatVector]] = {} @property @doc_args(SLOTS_DOCSTRING) def slots(self) -> dict[str, ComponentCurator]: """{}""" # noqa: D415 return self._slots @doc_args(VALIDATE_DOCSTRING) def validate(self) -> None: """{}""" # noqa: D415 if "__external__" in self._schema.slots: validation_schema = self._schema.slots["__external__"] if not self._external_features: if self._artifact is not None and not self._artifact._state.adding: logger.important( "no new external features provided, using existing external features of artifact for validation" ) self._external_features = self._artifact.features.get_values( external_only=True ) else: raise ValidationError( "External features slot is defined in schema but no external features were provided." ) ExperimentalDictCurator( self._external_features, validation_schema ).validate() for slot, curator in self._slots.items(): logger.debug(f"validating slot {slot} ...") curator.validate() # set _is_validated to True as no slot raised an error self._is_validated = True @doc_args(SAVE_ARTIFACT_DOCSTRING) def save_artifact( self, *, key: str | None = None, description: str | None = None, revises: Artifact | None = None, run: Run | None = None, ) -> Artifact: """{}""" # noqa: D415 if not self._is_validated: self.validate() if self._artifact is None: type_mapping = [ ( lambda dataset: isinstance(dataset, pd.DataFrame), Artifact.from_dataframe, ), ( lambda dataset: data_is_scversedatastructure(dataset, "AnnData"), Artifact.from_anndata, ), ( lambda dataset: data_is_scversedatastructure(dataset, "MuData"), Artifact.from_mudata, ), ( lambda dataset: data_is_scversedatastructure( dataset, "SpatialData" ), Artifact.from_spatialdata, ), (data_is_soma_experiment, Artifact.from_tiledbsoma), ] for type_check, af_constructor in type_mapping: if type_check(self._dataset): self._artifact = af_constructor( # type: ignore self._dataset, key=key, description=description, revises=revises, run=run, ) break cat_vectors = {} for curator in self._slots.values(): for key, cat_vector in curator.cat._cat_vectors.items(): cat_vectors[key] = cat_vector self._artifact.schema = self._schema if self._external_features: self._artifact._external_features = self._external_features self._artifact.save() return annotate_artifact( # type: ignore self._artifact, curator=self, cat_vectors=cat_vectors, ) def convert_dict_to_dataframe_for_validation(d: dict, schema: Schema) -> pd.DataFrame: """Convert a dictionary to a DataFrame for validation against a schema.""" df = pd.DataFrame([d]) for feature in schema.members: # we cannot cast a `list[cat[...]]]` to categorical because lists are not hashable if feature.dtype_as_str.startswith("cat"): if feature.name in df.columns: value = df.loc[0, feature.name] if isinstance(value, (list, SQLRecordList, set, BasicQuerySet)): df.attrs[feature.name] = "list_of_categories" else: if isinstance(value, SQLRecord) and value._state.adding: raise ValidationError( f"{value.__class__.__name__} {getattr(value, 
getattr(value, 'name_field', 'name'), value.uid)} is not saved." ) df[feature.name] = pd.Categorical(df[feature.name]) return df # For more context, read https://laminlabs.slack.com/archives/C07DB677JF6/p1753994077716099 and # https://www.notion.so/laminlabs/Add-a-DictCurator-2422aeaa55e180b9a513f91d13970836 class ComponentCurator(Curator): """Curator for `DataFrame`. Provides all key functionality to validate Pandas DataFrames. This class is not user facing unlike :class:`~lamindb.curators.DataFrameCurator` which extends this class with functionality to validate the `attrs` slot. Args: dataset: The DataFrame-like object to validate & annotate. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. slot: Indicate the slot in a composite curator for a composite data structure. """ def __init__( self, dataset: pd.DataFrame | Artifact, schema: Schema, slot: str | None = None, require_saved_schema: bool = True, ) -> None: super().__init__( dataset=dataset, schema=schema, require_saved_schema=require_saved_schema ) categoricals = [] features = [] feature_ids: set[int] = set() if schema.flexible: features += Feature.filter(name__in=self._dataset.keys()).to_list() feature_ids = {feature.id for feature in features} if schema.n_members and schema.n_members > 0: if schema._index_feature_uid is not None: schema_features = [ feature for feature in schema.members.to_list() if feature.uid != schema._index_feature_uid # type: ignore ] else: schema_features = schema.members.to_list() # type: ignore if feature_ids: features.extend( feature for feature in schema_features if feature.id not in feature_ids # type: ignore ) else: features.extend(schema_features) else: assert schema.itype is not None # noqa: S101 pandera_columns = {} self._pandera_schema = None if features or schema._index_feature_uid is not None: # populate features if schema.minimal_set: optional_feature_uids = set(schema.optionals.get_uids()) for feature in features: if schema.minimal_set: required = feature.uid not in optional_feature_uids else: required = False # series.dtype is "object" if the column has lists types, e.g. 
[["a", "b"], ["a"], ["b"]] dtype_str = feature._dtype_str if ( dtype_str.startswith("list[cat") or self._dataset.attrs.get(feature.name) == "list_of_categories" ): pandera_columns[feature.name] = pandera.Column( dtype=None, checks=pandera.Check( check_dtype("list", feature.nullable), element_wise=False, error=f"Column '{feature.name}' failed dtype check for '{dtype_str}' against (list, nullable={feature.nullable})", ), nullable=feature.nullable, coerce=feature.coerce, required=required, ) elif dtype_str in { "int", "float", "bool", "num", "path", "url", } or dtype_str.startswith("list"): if isinstance(self._dataset, pd.DataFrame): dtype = ( self._dataset[feature.name].dtype if feature.name in self._dataset.keys() else None ) else: dtype = None pandera_columns[feature.name] = pandera.Column( dtype=None, checks=pandera.Check( check_dtype(dtype_str, feature.nullable), element_wise=False, error=f"Column '{feature.name}' failed dtype check for '{dtype_str}': got {dtype}", ), nullable=feature.nullable, coerce=feature.coerce, required=required, ) elif dtype_str == "dict": pandera_columns[feature.name] = pandera.Column( dtype=object, nullable=feature.nullable, coerce=feature.coerce, required=required, checks=pandera.Check( lambda s: s.dropna() .apply(lambda x: isinstance(x, dict)) .all(), error="Non-null values must be dicts", ), ) else: pandera_dtype = ( dtype_str if not dtype_str.startswith("cat") else "category" ) pandera_columns[feature.name] = pandera.Column( pandera_dtype, nullable=feature.nullable, coerce=feature.coerce, required=required, ) if dtype_str.startswith("cat") or dtype_str.startswith("list[cat["): # validate categoricals if the column is required or if the column is present # but exclude the index feature from column categoricals if (required or feature.name in self._dataset.keys()) and ( schema._index_feature_uid is None or feature.uid != schema._index_feature_uid ): categoricals.append(feature) # in almost no case, an index should have a pandas.CategoricalDtype in a DataFrame # so, we're typing it as `str` here if schema.index is not None: index = pandera.Index( schema.index._dtype_str if not schema.index._dtype_str.startswith("cat") else str ) else: index = None if schema.maximal_set: # allow any columns starting with "__lamindb" even if maximal_set is True pandera_columns[LAMINDB_COLUMN_PREFIX_REGEX] = pandera.Column( regex=True, required=False, nullable=True ) self._pandera_schema = pandera.DataFrameSchema( pandera_columns, coerce=schema.coerce, strict=schema.maximal_set, ordered=schema.ordered_set, index=index, ) if ( schema.itype == "Composite" ): # backward compat, should be migrated to Feature.name columns_field = Feature.name else: columns_field = parse_cat_dtype(schema.itype, is_itype=True)["field"] # in the DataFrameCatManager, we use the # actual columns of the dataset, not the pandera columns # the pandera columns might have additional optional columns self._cat_manager = DataFrameCatManager( self._dataset, columns_field=columns_field, categoricals=categoricals, index=schema.index, slot=slot, maximal_set=schema.maximal_set, schema=schema, ) @property @doc_args(CAT_MANAGER_DOCSTRING) def cat(self) -> DataFrameCatManager: """{}""" # noqa: D415 return self._cat_manager def standardize(self) -> None: """Standardize the dataset. - Adds missing columns for features - Fills missing values for features with default values """ if self._artifact is not None: raise RuntimeError( "Cannot mutate the dataset when an artifact is passed! 
Please load the dataset into memory using `dataset.load()` and pass it to a curator." ) for feature in self._schema.members: if feature.name not in self._dataset.columns: if feature.default_value is not None or feature.nullable: fill_value = ( feature.default_value if feature.default_value is not None else pd.NA ) dtype_str = feature._dtype_str if dtype_str.startswith("cat"): self._dataset[feature.name] = pd.Categorical( [fill_value] * len(self._dataset) ) else: self._dataset[feature.name] = fill_value logger.important( f"added column {feature.name} with fill value {fill_value}" ) else: raise ValidationError( f"Missing column {feature.name} cannot be added because is not nullable and has no default value" ) else: if feature.default_value is not None: if isinstance( self._dataset[feature.name].dtype, pd.CategoricalDtype ): if ( feature.default_value not in self._dataset[feature.name].cat.categories ): self._dataset[feature.name] = self._dataset[ feature.name ].cat.add_categories(feature.default_value) self._dataset[feature.name] = self._dataset[feature.name].fillna( feature.default_value ) def _cat_manager_validate(self) -> None: self.cat.validate() if self.cat._is_validated: self._is_validated = True else: self._is_validated = False raise ValidationError(self.cat._validate_category_error_messages) @doc_args(VALIDATE_DOCSTRING) def validate(self) -> None: """{}""" # noqa: D415 if self._pandera_schema is not None: try: # first validate through pandera self._pandera_schema.validate(self._dataset, lazy=True) # then validate lamindb categoricals self._cat_manager_validate() except (pandera.errors.SchemaError, pandera.errors.SchemaErrors) as err: self._is_validated = False has_dtype_error = "WRONG_DATATYPE" in str(err) error_msg = str(err) if has_dtype_error: error_msg += " ▶ Hint: Consider setting `feature.coerce = True` to attempt coercing values during validation to the required dtype." raise ValidationError(error_msg) from err else: self._cat_manager_validate() class DataFrameCurator(SlotsCurator): # the example in the docstring is tested in test_curators_quickstart_example """Curator for `DataFrame`. Args: dataset: The DataFrame-like object to validate & annotate. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. slot: Indicate the slot in a composite curator for a composite data structure. require_saved_schema: Whether the schema must be saved before curation. Examples: For a simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_dataframe`. Here is an example that enforces a minimal set of columns in the dataframe. .. literalinclude:: scripts/curate_dataframe_minimal_errors.py :language: python Under-the-hood, this used the following schema. .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py :language: python Valid features & labels were defined as: .. literalinclude:: scripts/define_mini_immuno_features_labels.py :language: python It is also possible to curate the `attrs` slot. .. 
literalinclude:: scripts/curate_dataframe_attrs.py :language: python """ def __init__( self, dataset: pd.DataFrame | Artifact, schema: Schema, *, slot: str | None = None, features: dict[str, Any] | None = None, require_saved_schema: bool = True, ) -> None: # loads or opens dataset, dataset may be an artifact super().__init__( dataset=dataset, schema=schema, features=features, require_saved_schema=require_saved_schema, ) # uses open dataset at self._dataset self._atomic_curator = ComponentCurator( dataset=self._dataset, schema=schema, slot=slot, require_saved_schema=require_saved_schema, ) # Handle (nested) attrs if slot is None and schema.slots: for slot_name, slot_schema in schema.slots.items(): if slot_name.startswith("attrs"): path_parts = slot_name.split(":") attrs_dict = getattr(self._dataset, "attrs", None) if attrs_dict is not None: if len(path_parts) == 1: data = attrs_dict else: deeper_keys = path_parts[1:] data = _resolve_schema_slot_path( attrs_dict, deeper_keys, slot_name, "attrs" ) df = convert_dict_to_dataframe_for_validation(data, slot_schema) self._slots[slot_name] = ComponentCurator( df, slot_schema, slot=slot_name, require_saved_schema=require_saved_schema, ) elif slot_name != "__external__": raise ValueError( f"Slot '{slot_name}' is not supported for DataFrameCurator. Must be 'attrs'." ) @property def cat(self) -> DataFrameCatManager: """Manage categoricals by updating registries.""" return self._atomic_curator.cat def standardize(self) -> None: """Standardize the dataset. - Adds missing columns for features - Fills missing values for features with default values """ self._atomic_curator.standardize() for slot_curator in self._slots.values(): slot_curator.standardize() @doc_args(VALIDATE_DOCSTRING) def validate(self) -> None: """{}.""" self._atomic_curator.validate() self._is_validated = self._atomic_curator._is_validated super().validate() @doc_args(SAVE_ARTIFACT_DOCSTRING) def save_artifact( self, *, key=None, description=None, revises=None, run=None ) -> Artifact: """{}.""" if not self._is_validated: self.validate() self._slots["columns"] = self._atomic_curator try: return super().save_artifact( key=key, description=description, revises=revises, run=run ) finally: del self._slots["columns"] class ExperimentalDictCurator(DataFrameCurator): """Curator for `dict` based on `DataFrameCurator`.""" def __init__( self, dataset: dict | Artifact, schema: Schema, slot: str | None = None, require_saved_schema: bool = False, ) -> None: if not isinstance(dataset, dict) and not isinstance(dataset, Artifact): raise InvalidArgument("The dataset must be a dict or dict-like artifact.") if isinstance(dataset, Artifact): assert dataset.otype == "dict", "Artifact must be of otype 'dict'." # noqa: S101 d = dataset.load(is_run_input=False) else: d = dataset df = convert_dict_to_dataframe_for_validation(d, schema) # type: ignore super().__init__( df, schema, slot=slot, require_saved_schema=require_saved_schema ) def _resolve_schema_slot_path( target_dict: dict[str, Any], slot_keys: Iterable[str], slot: str, base_path: str ) -> Any: """Resolve a schema slot path by traversing nested dictionary keys. 
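    For example, ``slot_keys=["assay", "metadata"]`` resolves ``target_dict["assay"]["metadata"]``;
    a missing key raises ``InvalidArgument`` and lists the keys available at that level.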
Args: target_dict: Root dictionary to traverse slot_keys: Sequence of keys defining the paths to traverse slot_name: Schema slot identifier for error context base_path: Base path string for error context Returns: The value at the resolved path """ current = target_dict for key in slot_keys: base_path += f"['{key}']" try: current = current[key] except ( KeyError, TypeError, ): # if not a dict, raises TypeError; if a dict and key not found, raises KeyError available = ( list(current.keys()) if isinstance(current, dict) else "none (not a dict)" ) raise InvalidArgument( f"Schema slot '{slot}' requires keys {base_path} but key '{key}' " f"not found. Available keys at this level: {available}." ) from None return current def _handle_dict_slots( dataset: ScverseDataStructures, slot: str ) -> tuple[pd.DataFrame | None, str | None, str | None]: """Handle dict-based slot paths (uns/attrs standalone or of modalities) for all ScverseCurators. Supports two patterns: - Direct dict access: "uns", "attrs", "uns:key1:key2", "attrs:key" - Modality dict access: "modality:uns" Args: dataset: The scverse datastructure object slot: The slot path string to parse like 'uns:path:to'. Returns: tuple: (dataframe, modality_key, remaining_slot_path) - dataframe: Single-row DataFrame containing the resolved data - modality_key: Modality identifier if slot targets modality dict, else None - remaining_slot_path: The dict attribute and nested keys as string """ path_parts = slot.split(":") # Handle direct dict slots: "uns", "attrs", "uns:key1:key2:..." if len(path_parts) >= 1 and path_parts[0] in ["uns", "attrs"]: dict_attr = getattr(dataset, path_parts[0], None) if dict_attr is not None: if len(path_parts) == 1: return pd.DataFrame([dict_attr]), None, path_parts[0] deeper_keys = path_parts[1:] data = _resolve_schema_slot_path( dict_attr, deeper_keys, slot, path_parts[0] ) return pd.DataFrame([data]), None, ":".join(path_parts[1:]) # Handle modality dict slots: "modality:uns", "modality:uns:key1:key2" elif len(path_parts) >= 2 and path_parts[1] in ["uns", "attrs"]: modality, dict_name = path_parts[0], path_parts[1] try: modality_dataset = dataset[modality] dict_attr = getattr(modality_dataset, dict_name, None) if dict_attr is not None: if len(path_parts) == 2: return pd.DataFrame([dict_attr]), modality, dict_name deeper_keys = path_parts[2:] data = _resolve_schema_slot_path( dict_attr, deeper_keys, slot, f"{modality}.{dict_name}" ) return pd.DataFrame([data]), modality, ":".join(path_parts[1:]) except (KeyError, AttributeError): pass else: raise InvalidArgument( f"Invalid dict slot pattern '{slot}'. Expected formats: " f"'uns', 'attrs', 'uns:key', 'attrs:key', 'modality:uns'" ) return None, None, None @doc_args(SLOTS_DETAILS_DOCSTRING) class AnnDataCurator(SlotsCurator): """Curator for `AnnData`. {} Args: dataset: The AnnData-like object to validate & annotate. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. Examples: Curate Ensembl gene IDs and valid features in obs: .. literalinclude:: scripts/curate_anndata_flexible.py :language: python :caption: curate_anndata_flexible.py Curate `uns` dictionary: .. literalinclude:: scripts/curate_anndata_uns.py :language: python :caption: curate_anndata_uns.py See Also: :meth:`~lamindb.Artifact.from_anndata`. 
""" def __init__( self, dataset: AnnData | Artifact, schema: Schema, ) -> None: super().__init__(dataset=dataset, schema=schema) if not data_is_scversedatastructure(self._dataset, "AnnData"): raise InvalidArgument("dataset must be AnnData-like.") if schema.otype != "AnnData": raise InvalidArgument("Schema otype must be 'AnnData'.") for slot, slot_schema in schema.slots.items(): if slot not in {"var", "var.T", "obs"} and not slot.startswith("uns"): raise ValueError( f"AnnDataCurator currently only supports the slots 'var', 'var.T', 'obs', and 'uns', not {slot}" ) if slot.startswith("uns"): df, _, _ = _handle_dict_slots(self._dataset, slot) elif slot in {"obs", "var", "var.T"}: df = ( getattr(self._dataset, slot.strip(".T")).T if slot == "var.T" or ( slot == "var" and schema.slots["var"].itype not in {None, "Feature"} ) else getattr(self._dataset, slot) ) self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot) # Handle var index naming for backward compat if slot == "var" and schema.slots["var"].itype not in {None, "Feature"}: logger.warning( "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}" ) self._slots["var"].cat._cat_vectors["var_index"] = self._slots[ "var" ].cat._cat_vectors.pop("columns") self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index" def _assign_var_fields_categoricals_multimodal( modality: str | None, slot_type: str, slot: str, slot_schema: Schema, var_fields: dict[str, FieldAttr], cat_vectors: dict[str, dict[str, CatVector]], slots: dict[str, ComponentCurator], ) -> None: """Assigns var_fields and categoricals for multimodal data curators.""" if modality is not None: var_fields[modality] = None cat_vectors[modality] = {} if slot_type == "var": var_field = parse_cat_dtype(slot_schema.itype, is_itype=True)["field"] if modality is None: # This should rarely/never be used since tables should have different var fields var_fields[slot] = var_field # pragma: no cover else: # Note that this is NOT nested since the nested key is always "var" var_fields[modality] = var_field else: obs_fields = slots[slot].cat._cat_vectors if modality is None: cat_vectors[slot] = obs_fields else: # Note that this is NOT nested since the nested key is always "obs" cat_vectors[modality] = obs_fields @doc_args(SLOTS_DETAILS_DOCSTRING) class MuDataCurator(SlotsCurator): """Curator for `MuData`. {} Args: dataset: The MuData-like object to validate & annotate. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. Example: .. literalinclude:: scripts/curate_mudata.py :language: python :caption: curate_mudata.py See Also: :meth:`~lamindb.Artifact.from_mudata`. 
""" def __init__( self, dataset: MuData | Artifact, schema: Schema, ) -> None: super().__init__(dataset=dataset, schema=schema) if not data_is_scversedatastructure(self._dataset, "MuData"): raise InvalidArgument("dataset must be MuData-like.") if schema.otype != "MuData": raise InvalidArgument("Schema otype must be 'MuData'.") for slot, slot_schema in schema.slots.items(): # Handle slots: "mdata.uns", "modality:uns" if "uns" in slot: df, modality, modality_slot = _handle_dict_slots(self._dataset, slot) else: # Handle slots: "modality:obs", "modality:var" parts = slot.split(":") if len(parts) == 2: modality, modality_slot = parts try: schema_dataset = self._dataset[modality] df = getattr(schema_dataset, modality_slot.rstrip(".T")) except KeyError: raise InvalidArgument( f"Modality '{modality}' not found in MuData" ) from None except AttributeError: raise InvalidArgument( f"Attribute '{modality_slot}' not found on modality '{modality}'" ) from None else: # Handle slots: "mdata:obs", "mdata:var" (uns is a dictionary and gets handled above) modality, modality_slot = None, slot schema_dataset = self._dataset df = getattr(schema_dataset, modality_slot.rstrip(".T")) # Transpose var if necessary if modality_slot == "var" and schema.slots[slot].itype not in { None, "Feature", }: logger.warning( "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}" ) df = df.T elif modality_slot == "var.T": df = df.T self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot) _assign_var_fields_categoricals_multimodal( modality=modality, slot_type=modality_slot, slot=slot, slot_schema=slot_schema, var_fields=self._var_fields, cat_vectors=self._cat_vectors, slots=self._slots, ) self._columns_field = self._var_fields @doc_args(SLOTS_DETAILS_DOCSTRING) class SpatialDataCurator(SlotsCurator): """Curator for `SpatialData`. {} Args: dataset: The SpatialData-like object to validate & annotate. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. Example: .. literalinclude:: scripts/curate_spatialdata.py :language: python :caption: curate_spatialdata.py See Also: :meth:`~lamindb.Artifact.from_spatialdata`. 
""" def __init__( self, dataset: SpatialData | Artifact, schema: Schema, ) -> None: super().__init__(dataset=dataset, schema=schema) if not data_is_scversedatastructure(self._dataset, "SpatialData"): raise InvalidArgument("dataset must be SpatialData-like.") if schema.otype != "SpatialData": raise InvalidArgument("Schema otype must be 'SpatialData'.") for slot, slot_schema in schema.slots.items(): # Handle slots: "sdata:attrs" if slot.startswith("attrs"): df, table_key, table_slot = _handle_dict_slots(self._dataset, slot) else: parts = slot.split(":") # Handle slots: "tables:table_key:obs", "tables:table_key:var" if len(parts) == 3 and parts[0] == "tables": table_key, table_slot = parts[1], parts[2] try: slot_object = self._dataset.tables[table_key] df = getattr(slot_object, table_slot.rstrip(".T")) except KeyError: raise InvalidArgument( f"Table '{table_key}' not found in sdata.tables" ) from None except AttributeError: raise InvalidArgument( f"Attribute '{table_slot}' not found on table '{table_key}'" ) from None else: # Handle legacy single keys for backward compatibility if len(parts) == 1 and parts[0] != "attrs": logger.warning( f"please prefix slot {slot} with 'attrs:' going forward" ) try: df = pd.DataFrame([self._dataset.attrs[slot]]) table_key = None table_slot = slot except KeyError: raise InvalidArgument( f"Slot '{slot}' not found in sdata.attrs" ) from None else: raise InvalidArgument(f"Unrecognized slot format: {slot}") # Handle var transposition logic if table_slot == "var" and schema.slots[slot].itype not in { None, "Feature", }: logger.warning( "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}" ) df = df.T elif table_slot == "var.T": df = df.T self._slots[slot] = ComponentCurator(df, slot_schema, slot) _assign_var_fields_categoricals_multimodal( modality=table_key, slot_type=table_slot, slot=slot, slot_schema=slot_schema, var_fields=self._var_fields, cat_vectors=self._cat_vectors, slots=self._slots, ) self._columns_field = self._var_fields @doc_args(SLOTS_DETAILS_DOCSTRING) class TiledbsomaExperimentCurator(SlotsCurator): """Curator for `tiledbsoma.Experiment`. {} Args: dataset: The `tiledbsoma.Experiment` object. schema: A :class:`~lamindb.Schema` object that defines the validation constraints. Example: .. literalinclude:: scripts/curate_soma_experiment.py :language: python :caption: curate_soma_experiment.py See Also: :meth:`~lamindb.Artifact.from_tiledbsoma`. 
""" def __init__( self, dataset: SOMAExperiment | Artifact, schema: Schema, ) -> None: super().__init__(dataset=dataset, schema=schema) if not data_is_soma_experiment(self._dataset): raise InvalidArgument("dataset must be SOMAExperiment-like.") if schema.otype != "tiledbsoma": raise InvalidArgument("Schema otype must be 'tiledbsoma'.") for slot, slot_schema in schema.slots.items(): if slot.startswith("ms:"): _, modality_slot = slot.split(":") schema_dataset = ( self._dataset.ms[modality_slot.removesuffix(".T")] .var.read() .concat() .to_pandas() .drop("soma_joinid", axis=1, errors="ignore") ) self._slots[slot] = ComponentCurator( (schema_dataset.T if modality_slot == "var.T" else schema_dataset), slot_schema, ) else: # global Experiment obs slot modality_slot = slot schema_dataset = ( self._dataset.obs.read() .concat() .to_pandas() .drop(["soma_joinid", "obs_id"], axis=1, errors="ignore") ) self._slots[slot] = ComponentCurator( schema_dataset, slot_schema, ) _assign_var_fields_categoricals_multimodal( modality=slot, # not passing `measurement` here because it's a constant. The slot has the actual modality slot_type=modality_slot, slot=slot, slot_schema=slot_schema, var_fields=self._var_fields, cat_vectors=self._cat_vectors, slots=self._slots, ) self._columns_field = self._var_fields class CatVector: """Vector with categorical values.""" def __init__( self, values_getter: Callable | Iterable[str], # A callable or iterable that returns the values to validate. field: FieldAttr, # The field to validate against. key: str, # The name of the vector to validate. Only used for logging. values_setter: Callable | None = None, # A callable that sets the values. source: SQLRecord | None = None, # The ontology source to validate against. feature: Feature | None = None, cat_manager: DataFrameCatManager | None = None, filter_str: str = "", record_uid: str | None = None, maximal_set: bool = True, # whether unvalidated categoricals cause validation failure. 
schema: Schema = None, ) -> None: self._values_getter = values_getter self._values_setter = values_setter self._field = field self._key = key self._source = source self._validated: None | list[str] = None self._non_validated: None | list[str] = None self._synonyms: None | dict[str, str] = None self._record_uid = record_uid self._subtype_query_set = None self._cat_manager = cat_manager self.feature = feature self.records = None self._maximal_set = maximal_set self._type_record = None self._registry = self._field.field.model self._field_name = self._field.field.name self._filter_kwargs = {} self._schema = schema if filter_str and filter_str != "unsaved": self._filter_kwargs.update( resolve_relation_filters( parse_filter_string(filter_str), self._registry ) # type: ignore ) if self._registry.__base__.__name__ == "BioRecord": if self._source is not None: self._filter_kwargs["source"] = self._source organism_record = get_organism_record_from_field( field=self._field, organism=self._filter_kwargs.get("organism"), values=self.values, ) if organism_record is not None: self._filter_kwargs["organism"] = organism_record self._filter_kwargs = get_current_filter_kwargs( self._registry, self._filter_kwargs ) # get the dtype associated record based on the record_uid if self._record_uid: self._type_record = get_record_type_from_uid( self._registry, self._record_uid, ) if hasattr(self._registry, "_name_field"): label_ref_is_name = self._field_name == self._registry._name_field else: label_ref_is_name = self._field_name == "name" self.label_ref_is_name = label_ref_is_name @property def values(self): """Get the current values using the getter function.""" if callable(self._values_getter): return self._values_getter() return self._values_getter @values.setter def values(self, new_values): """Set new values using the setter function if available.""" if callable(self._values_setter): self._values_setter(new_values) else: # If values_getter is not callable, it's a direct reference we can update self._values_getter = new_values @property def is_validated(self) -> bool: """Whether the vector is validated.""" # if nothing was validated, something likely is fundamentally wrong # should probably add a setting `at_least_one_validated` result = True if len(self.values) > 0 and len(self.values) == len(self._non_validated): logger.warning(f"no values were validated for {self._key}!") # len(self._non_validated) != 0 # if maximal_set is True, return False # if maximal_set is False, return True # len(self._non_validated) == 0 # return True if len(self._non_validated) != 0: if self._maximal_set: result = False return result def _replace_synonyms(self) -> list[str]: """Replace synonyms in the vector with standardized values.""" def process_value(value, syn_mapper): """Helper function to process values recursively.""" if isinstance(value, list): # Handle list - recursively process each item return [process_value(item, syn_mapper) for item in value] else: # Handle single value return syn_mapper.get(value, value) syn_mapper = self._synonyms # replace the values in df std_values = self.values.map( lambda unstd_val: process_value(unstd_val, syn_mapper) ) # remove the standardized values from self.non_validated non_validated = [i for i in self._non_validated if i not in syn_mapper] if len(non_validated) == 0: self._non_validated = [] else: self._non_validated = non_validated # type: ignore # logging n = len(syn_mapper) if n > 0: syn_mapper_print = _format_values( [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep="" ) s = "s" 
if n > 1 else "" logger.success( f'standardized {n} synonym{s} in "{self._key}": {colors.green(syn_mapper_print)}' ) return std_values def __repr__(self) -> str: if self._non_validated is None: status = "unvalidated" else: status = ( "validated" if len(self._non_validated) == 0 else f"non-validated ({len(self._non_validated)})" ) field_name = getattr(self._field, "name", str(self._field)) values_count = len(self.values) if hasattr(self.values, "__len__") else "?" return f"CatVector(key='{self._key}', field='{field_name}', values={values_count}, {status})" def _add_validated(self) -> tuple[list, list]: """Save features or labels records in the default instance.""" from lamindb.models.has_parents import keep_topmost_matches from lamindb.models.save import save as ln_save model_field = self._registry.__get_name_with_module__() values = [ value for value in self.values if (isinstance(value, str) and value) or ( isinstance(value, (int, float)) and not isinstance(value, bool) and value == value ) or (isinstance(value, list) and value) or ( isinstance(value, np.ndarray) and value.size > 0 and value.dtype != bool ) ] if not values: return [], [] # if a value is a list, we need to flatten it str_values = _flatten_unique(values) # if values are SQLRecord, we don't need to validate them if all(isinstance(v, SQLRecord) for v in str_values): assert all(v._state.adding is False for v in str_values), ( "All records must be saved." ) self.records = str_values # type: ignore validated_values = str_values # type: ignore return validated_values, [] # get all field specs for union types if self.feature: results = parse_dtype(self.feature._dtype_str) else: results = [None] all_validated = [] all_records = [] remaining_values = str_values for result in results: if not remaining_values: break # pragma: no cover if result is not None: field = result["field"] registry = field.field.model field_name = field.field.name filter_kwargs: dict[str, str | SQLRecord] = {} filter_str = result.get("filter_str", "") if filter_str: parsed_filters = parse_filter_string(filter_str) filter_kwargs.update( resolve_relation_filters(parsed_filters, registry) ) if registry.__base__.__name__ == "BioRecord": organism_record = get_organism_record_from_field( field=field, organism=None, values=remaining_values, ) if organism_record is not None: filter_kwargs["organism"] = organism_record # Merge in self._filter_kwargs (contains cat_filters from Feature) if self._filter_kwargs: filter_kwargs.update(self._filter_kwargs) filter_kwargs = get_current_filter_kwargs(registry, filter_kwargs) else: field = self._field registry = self._registry field_name = self._field_name filter_kwargs = self._filter_kwargs # inspect the default instance and save validated records from public if issubclass(registry, HasType): if self._type_record is None: # When we have a Schema with typed members, # scope the query to the types present in the schema's members (plus untyped features) # to avoid ambiguous matches across different feature types. 
qs = registry.filter() if self._schema and self._schema.n_members: type_ids = { m.type_id for m in self._schema.members if m.type_id is not None } if type_ids: qs = registry.filter( Q(type_id__in=type_ids) | Q(type_id__isnull=True) ) self._subtype_query_set = qs else: query_sub_types = getattr( self._type_record, f"query_{registry.__name__.lower()}s" ) self._subtype_query_set = query_sub_types() subtype_query_set = ( self._subtype_query_set.filter(**filter_kwargs) if filter_kwargs else self._subtype_query_set ) values_array = np.array(remaining_values) validated_mask = subtype_query_set.validate( values_array, field=field, mute=True ) validated_values, non_validated_values = ( list(set(values_array[validated_mask])), list(set(values_array[~validated_mask])), ) records = subtype_query_set.filter( **{f"{field_name}__in": validated_values} ).to_list() records = keep_topmost_matches(records) else: existing_and_public_records = _from_values( remaining_values, field=field, mute=True, **filter_kwargs, # type: ignore ) existing_and_public_values = [ getattr(r, field_name) for r in existing_and_public_records ] # public records that are not already in the database public_records = [ r for r in existing_and_public_records if r._state.adding ] if len(public_records) > 0: logger.info(f"saving validated records of '{self._key}'") ln_save(public_records) values_saved_public = [ getattr(r, field_name) for r in public_records ] # log the saved public labels # the term "transferred" stresses that this is always in the context of transferring # labels from a public ontology or a different instance to the present instance if len(values_saved_public) > 0: s = "s" if len(values_saved_public) > 1 else "" logger.success( f'added {len(values_saved_public)} record{s} {colors.green("from_public")} with {model_field} for "{self._key}": {_format_values(values_saved_public)}' ) # non-validated records from the default instance non_validated_values = [ i for i in remaining_values if i not in existing_and_public_values ] validated_values = existing_and_public_values records = existing_and_public_records all_validated.extend(validated_values) all_records.extend(records) remaining_values = non_validated_values self.records = all_records # validated values, non-validated values return all_validated, remaining_values def _add_new( self, values: list[str], df: pd.DataFrame | None = None, # remove when all users use schema dtype: str | None = None, **create_kwargs, ) -> None: """Add new labels to the registry.""" from lamindb.models.save import save as ln_save non_validated_records: SQLRecordList[Any] = [] # type: ignore if df is not None and self._registry == Feature: nonval_columns = Feature.inspect(df.columns, mute=True).non_validated non_validated_records = Feature.from_dataframe(df.loc[:, nonval_columns]) else: organism_record = self._filter_kwargs.get("organism", None) for value in values: init_kwargs = {self._field_name: value} if self._registry == Feature: init_kwargs["dtype"] = "cat" if dtype is None else dtype if self._type_record is not None: # if type_record is set, we need to set the type for new records init_kwargs["type"] = self._type_record if organism_record is not None: init_kwargs["organism"] = organism_record # here we create non-validated records skipping validation since we already ensured that they don't exist non_validated_records.append( self._registry( **init_kwargs, **create_kwargs, _skip_validation=True ) ) if len(non_validated_records) > 0: ln_save(non_validated_records) model_field = 
colors.italic(self._registry.__get_name_with_module__()) s = "s" if len(values) > 1 else "" logger.success( f'added {len(values)} record{s} with {model_field} for "{self._key}": {_format_values(values)}' ) def _validate( self, values: list[str], ) -> tuple[list[str], dict]: """Validate ontology terms using LaminDB registries.""" model_field = f"{self._registry.__name__}.{self._field_name}" # get all field specs for union types if self.feature: results = parse_dtype(self.feature._dtype_str) else: results = [{"field": self._field}] non_validated = values syn_mapper: dict[str, str] = {} for result in results: if not non_validated: break field = result["field"] registry = field.field.model filter_kwargs = self._filter_kwargs.copy() filter_str = result.get("filter_str", "") if filter_str: parsed_filters = parse_filter_string(filter_str) filter_kwargs.update(resolve_relation_filters(parsed_filters, registry)) registry_or_queryset = registry if self._subtype_query_set is not None and registry == self._registry: registry_or_queryset = self._subtype_query_set # first inspect against the registry inspect_result = registry_or_queryset.filter(**filter_kwargs).inspect( non_validated, field=field, mute=True, from_source=False, ) # here non_validated includes synonyms and new values non_validated = inspect_result.non_validated syn_mapper.update(inspect_result.synonyms_mapper) # logging messages if self._cat_manager is not None: slot = self._cat_manager._slot else: slot = None in_slot = f" in slot '{slot}'" if slot is not None else "" slot_prefix = f".slots['{slot}']" if slot is not None else "" non_validated_hint_print = ( f"curator{slot_prefix}.cat.add_new_from('{self._key}')" ) n_non_validated = len(non_validated) if n_non_validated == 0: logger.success( f'"{self._key}" is validated against {colors.italic(model_field)}' ) return [], {} else: s = "" if n_non_validated == 1 else "s" print_values = _format_values(non_validated) warning_message = f"{colors.red(f'{n_non_validated} term{s}')} not validated in feature '{self._key}'{in_slot}: {colors.red(print_values)}\n" # log synonyms if any if syn_mapper: s = "" if len(syn_mapper) == 1 else "s" syn_mapper_print = _format_values( [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep="" ) hint_msg = f'.standardize("{self._key}")' warning_message += f" {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\n → curate synonyms via: {colors.cyan(hint_msg)}" if n_non_validated > len(syn_mapper): if syn_mapper: warning_message += "\n for remaining terms:\n" check_organism = "" if ( self._registry.__base__.__name__ == "BioRecord" and self._registry.require_organism(field=self._field) ): organism = self._filter_kwargs.get("organism", None) check_organism = f"fix organism '{organism}', " warning_message += f" → {check_organism}fix typos, remove non-existent values, or save terms via: {colors.cyan(non_validated_hint_print)}" if self._subtype_query_set is not None and self._type_record: warning_message += f"\n → a valid label for subtype '{self._type_record.name}' has to be one of {self._subtype_query_set.to_list('name')}" logger.info(f'mapping "{self._key}" on {colors.italic(model_field)}') logger.warning(warning_message) if self._cat_manager is not None: self._cat_manager._validate_category_error_messages = strip_ansi_codes( warning_message ) return non_validated, syn_mapper def validate(self) -> None: """Validate the vector.""" # add source-validated values to the registry self._validated, self._non_validated = 
self._add_validated() self._non_validated, self._synonyms = self._validate(values=self._non_validated) def standardize(self) -> None: """Standardize the vector.""" if not hasattr(self._registry, "standardize"): return self.values if self._synonyms is None: self.validate() # get standardized values std_values = self._replace_synonyms() # update non_validated values self._non_validated = [ i for i in self._non_validated if i not in self._synonyms.keys() ] # remove synonyms since they are now standardized self._synonyms = {} # update the values with the standardized values self.values = std_values def add_new(self, **create_kwargs) -> None: """Add new values to the registry.""" if self._non_validated is None: self.validate() if len(self._synonyms) > 0: # raise error because .standardize modifies the input dataset raise ValidationError( "Please run `.standardize()` before adding new values." ) self._add_new( values=self._non_validated, **create_kwargs, ) # remove the non_validated values since they are now registered self._non_validated = [] class DataFrameCatManager: """Manage categoricals by updating registries. This class is accessible from within a `DataFrameCurator` via the `.cat` attribute. If you find non-validated values, you have two options: - new values found in the data can be registered via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.core.DataFrameCatManager.add_new_from` - non-validated values can be accessed via `DataFrameCurator.cat.add_new_from()` :meth:`~lamindb.curators.core.DataFrameCatManager.non_validated` and addressed manually """ def __init__( self, df: pd.DataFrame | Artifact, columns_field: FieldAttr = Feature.name, categoricals: list[Feature] | None = None, sources: dict[str, SQLRecord] | None = None, index: Feature | None = None, slot: str | None = None, maximal_set: bool = False, schema: Schema | None = None, ) -> None: self._non_validated = None self._index = index self._artifact: Artifact = None # pass the dataset as an artifact self._dataset: Any = df # pass the dataset as an AnyPathStr or data object if isinstance(self._dataset, Artifact): self._artifact = self._dataset self._dataset = self._dataset.load(is_run_input=False) self._is_validated: bool = False self._categoricals = categoricals or [] self._non_validated = None self._sources = sources or {} self._columns_field = columns_field self._validate_category_error_messages: str = "" self._cat_vectors: dict[str, CatVector] = {} self._slot = slot self._maximal_set = maximal_set columns = self._dataset.keys() if maximal_set: columns = [ col for col in columns if not re.match(LAMINDB_COLUMN_PREFIX_REGEX, col) ] self._cat_vectors["columns"] = CatVector( values_getter=lambda: columns, # lambda ensures the inplace update values_setter=lambda new_values: setattr( self._dataset, "columns", pd.Index(new_values) ) if isinstance(self._dataset, pd.DataFrame) else None, field=columns_field, key="columns" if isinstance(self._dataset, pd.DataFrame) else "keys", source=self._sources.get("columns"), cat_manager=self, maximal_set=self._maximal_set, filter_str="" if schema.flexible else "unsaved" if schema.id is None else f"schemas__id={schema.id}", schema=schema, ) for feature in self._categoricals: result = parse_dtype(feature._dtype_str)[0] key = feature.name # only create CatVector if the key exists in the DataFrame if key in self._dataset.columns: self._cat_vectors[key] = CatVector( values_getter=lambda k=key: self._dataset[ k ], # Capture key as default argument values_setter=lambda new_values, k=key: 
self._dataset.__setitem__( k, new_values ), field=result["field"], key=key, source=self._sources.get(key), feature=feature, cat_manager=self, filter_str=result["filter_str"], record_uid=result.get("record_uid"), ) if index is not None and index._dtype_str.startswith("cat"): result = parse_dtype(index._dtype_str)[0] key = "index" self._cat_vectors[key] = CatVector( values_getter=self._dataset.index, values_setter=lambda new_values: setattr( self._dataset, "index", new_values ), field=result["field"], key=key, feature=index, cat_manager=self, filter_str=result["filter_str"], record_uid=result.get("record_uid"), ) @property def non_validated(self) -> dict[str, list[str]]: """Return the non-validated features and labels.""" if self._non_validated is None: raise ValidationError("Please run validate() first!") return { key: cat_vector._non_validated for key, cat_vector in self._cat_vectors.items() if cat_vector._non_validated and key != "columns" } @property def categoricals(self) -> list[Feature]: """The categorical features.""" return self._categoricals def __repr__(self) -> str: cls_name = colors.green(self.__class__.__name__) status_str = ( f"{colors.green('validated')}" if self._is_validated else f"{colors.yellow('unvalidated')}" ) info_parts = [] cat_count = len(self._categoricals) if cat_count > 0: info_parts.append(f"categorical_features={cat_count}") if self._slot: info_parts.append(f"slot: {colors.italic(self._slot)}") info_str = ", ".join(info_parts) if info_str: return f"{cls_name}({info_str}, {status_str})" else: return f"{cls_name}({status_str})" def lookup(self, public: bool = False) -> CatLookup: """Lookup categories. Args: public: If "public", the lookup is performed on the public reference. """ return CatLookup( categoricals=self._categoricals, slots={"columns": self._columns_field}, public=public, sources=self._sources, ) def validate(self) -> bool: """Validate variables and categorical observations.""" self._validate_category_error_messages = "" # reset the error messages validated = True for key, cat_vector in self._cat_vectors.items(): logger.info(f"validating vector {key}") cat_vector.validate() validated &= cat_vector.is_validated self._is_validated = validated self._non_validated = {} # type: ignore if self._index is not None: # cat_vector.validate() populates validated labels # the index should become part of the feature set corresponding to the dataframe if self._cat_vectors["columns"].records is not None: self._cat_vectors["columns"].records.insert(0, self._index) # type: ignore else: self._cat_vectors["columns"].records = [self._index] # type: ignore return self._is_validated def standardize(self, key: str) -> None: """Replace synonyms with standardized values. Modifies the input dataset inplace. Args: key: The key referencing the column in the DataFrame to standardize. """ if self._artifact is not None: raise RuntimeError( "Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator." ) if key == "all": logger.warning( "'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!" ) for k in self.non_validated.keys(): self._cat_vectors[k].standardize() else: self._cat_vectors[key].standardize() def add_new_from(self, key: str, **kwargs): """Add validated & new categories. Args: key: The key referencing the slot in the DataFrame from which to draw terms. 
**kwargs: Additional keyword arguments to pass to create new records """ if len(kwargs) > 0 and key == "all": raise ValueError("Cannot pass additional arguments to 'all' key!") if key == "all": logger.warning( "'all' is deprecated, please pass a single key from `.non_validated.keys()` instead!" ) for k in self.non_validated.keys(): self._cat_vectors[k].add_new(**kwargs) else: self._cat_vectors[key].add_new(**kwargs) def get_current_filter_kwargs( registry: type[SQLRecord], kwargs: dict[str, str | SQLRecord] ) -> dict: """Make sure the source and organism are saved in the same database as the registry.""" db = registry.filter().db filter_kwargs = kwargs.copy() for key, value in kwargs.items(): if isinstance(value, SQLRecord) and value._state.db != "default": if db is None or db == "default": value_default = copy.copy(value) value_default.save() filter_kwargs[key] = value_default return filter_kwargs def annotate_artifact( artifact: Artifact, *, curator: SlotsCurator | None = None, cat_vectors: dict[str, CatVector] | None = None, ) -> Artifact: from .. import settings from ..models.artifact import add_labels from ..models.schema import ArtifactSchema if cat_vectors is None: cat_vectors = {} # annotate with labels for key, cat_vector in cat_vectors.items(): if ( cat_vector._registry == Feature or key == "columns" or key == "var_index" or cat_vector.records is None ): continue if len(cat_vector.records) > settings.annotation.n_max_records: logger.important( f"not annotating with {len(cat_vector.records)} labels for feature {key} as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)" ) continue add_labels( artifact, records=cat_vector.records, feature=cat_vector.feature, from_curator=True, ) # annotate with inferred schemas aka feature sets if ( artifact.otype == "DataFrame" and getattr(curator, "_schema", None) is None ): # Prevent overwriting user-defined schemas that contain slots features = cat_vectors["columns"].records if features is not None: index_feature = artifact.schema.index index_feature_id = None if index_feature is None else index_feature.id feature_set = Schema( features=[ f for f in features if index_feature_id is None or f.id != index_feature_id ], itype=artifact.schema.itype, index=index_feature, minimal_set=artifact.schema.minimal_set, maximal_set=artifact.schema.maximal_set, coerce=artifact.schema.coerce, ordered_set=artifact.schema.ordered_set, ) if ( feature_set._state.adding and len(features) > settings.annotation.n_max_records ): logger.important( f"not annotating with {len(features)} features as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)" ) itype = ( Feature.name if artifact.schema.itype == "Composite" # backward compat else parse_cat_dtype(artifact.schema.itype, is_itype=True)["field"] ) feature_set = Schema(itype=itype, n_members=len(features)) ArtifactSchema.objects.update_or_create( artifact=artifact, slot="columns", defaults={"schema": feature_set.save()}, ) else: for slot, slot_curator in curator._slots.items(): # var_index is backward compat (2025-05-01) name = ( "var_index" if (slot == "var" and "var_index" in slot_curator.cat._cat_vectors) else "columns" ) features = slot_curator.cat._cat_vectors[name].records if features is None: logger.warning(f"no features found for slot {slot}") continue validating_schema = slot_curator._schema index_feature = validating_schema.index index_feature_id = None if index_feature is None else index_feature.id feature_set = Schema( features=[ f for f in 
features if index_feature_id is None or f.id != index_feature_id ], itype=validating_schema.itype, index=index_feature, minimal_set=validating_schema.minimal_set, maximal_set=validating_schema.maximal_set, coerce=validating_schema.coerce, ordered_set=validating_schema.ordered_set, ) if ( feature_set._state.adding and len(features) > settings.annotation.n_max_records ): logger.important( f"not annotating with {len(features)} features for slot {slot} as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)" ) itype = ( Feature.name if artifact.schema.slots[slot].itype == "Composite" # backward compat else parse_cat_dtype( artifact.schema.slots[slot].itype, is_itype=True )["field"] ) feature_set = Schema(itype=itype, n_members=len(features)) ArtifactSchema.objects.update_or_create( artifact=artifact, slot=slot, defaults={"schema": feature_set.save()} ) slug = ln_setup.settings.instance.slug if ln_setup.settings.instance.is_remote: # pdagma: no cover ui_url = ln_setup.settings.instance.ui_url logger.important(f"go to {ui_url}/{slug}/artifact/{artifact.uid}") return artifact def _flatten_unique(series: pd.Series[list[Any] | Any]) -> list[Any]: """Flatten a Pandas series containing lists or single items into a unique list of elements. The order of elements in the result list preserves the order they first appear in the input series. """ # Use dict.fromkeys to preserve order while ensuring uniqueness result: dict = {} for item in series: if isinstance(item, list | np.ndarray): # Add each element to the dict (only first occurrence is kept) for element in item: result[element] = None else: result[item] = None # Return the keys as a list, preserving order return list(result.keys()) ================================================ FILE: lamindb/errors.py ================================================ """Errors. Django. .. autoexception:: ObjectDoesNotExist .. autoexception:: MultipleObjectsReturned LaminDB. .. autoexception:: ValidationError .. autoexception:: InvalidArgument .. autoexception:: NotebookNotSaved .. autoexception:: UnknownStorageLocation .. autoexception:: MissingContextUID .. autoexception:: UpdateContext .. autoexception:: IntegrityError .. autoexception:: FieldValidationError .. autoexception:: NoWriteAccess .. autoexception:: BlobHashNotFound .. autoexception:: FileNotInDevDir .. autoexception:: BranchAlreadyExists """ # ------------------------------------------------------------------------------------- # Django # ------------------------------------------------------------------------------------- from django.core.exceptions import ( MultipleObjectsReturned, # noqa: F401 ObjectDoesNotExist, # noqa: F401 ) ObjectDoesNotExist.__doc__ = """Object does not exist. This is an alias for `django.core.exceptions.ObjectDoesNotExist`. """ DoesNotExist = ObjectDoesNotExist # backward compat MultipleObjectsReturned.__doc__ = """Multiple objects returned. This is an alias for `django.core.exceptions.MultipleObjectsReturned`. 
""" MultipleResultsFound = MultipleObjectsReturned # backward compat # ------------------------------------------------------------------------------------- # lamindb # ------------------------------------------------------------------------------------- class ValidationError(Exception): """Validation error.""" pass class InvalidArgument(Exception): """Invalid method or function argument.""" pass class TrackNotCalled(Exception): """`ln.track()` wasn't called.""" pass class NotebookNotSaved(Exception): """Notebook wasn't saved.""" pass class UnknownStorageLocation(Exception): """Path is not contained in any known storage location.""" pass class NoStorageLocationForSpace(Exception): """No storage location found for space.""" pass class InconsistentKey(Exception): """Inconsistent transform or artifact `key`.""" pass class FieldValidationError(Exception): """Field validation error.""" pass # ------------------------------------------------------------------------------------- # run context # ------------------------------------------------------------------------------------- class IntegrityError(Exception): """Integrity error. For instance, it's not allowed to delete artifacts outside managed storage locations. """ pass class MissingContextUID(Exception): """User didn't define transform settings.""" pass class UpdateContext(Exception): """Transform settings require update.""" pass class BlobHashNotFound(Exception): """Blob hash not found in git or storage.""" pass # ------------------------------------------------------------------------------------- # CRUD # ------------------------------------------------------------------------------------- class NoWriteAccess(Exception): """No write access to a space.""" pass class FileNotInDevDir(Exception): """File path is not within the configured dev directory.""" pass class BranchAlreadyExists(Exception): """Branch already exists. Raised when creating a branch with `ln.setup.switch(..., create=True)` and a branch with the given name or uid already exists. Consistent with `git switch -c`. """ pass ================================================ FILE: lamindb/examples/__init__.py ================================================ """Examples. .. autosummary:: :toctree: . schemas datasets cellxgene croissant mlflow wandb """ from . import croissant, datasets, mlflow, schemas, wandb from .cellxgene import _cellxgene ================================================ FILE: lamindb/examples/cellxgene/__init__.py ================================================ """CELLxGENE utilities. .. autofunction:: save_cellxgene_defaults .. autofunction:: create_cellxgene_schema """ from ._cellxgene import ( create_cellxgene_schema, save_cellxgene_defaults, ) ================================================ FILE: lamindb/examples/cellxgene/_cellxgene.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Collection, Literal, NamedTuple if TYPE_CHECKING: from lamindb.base.types import FieldAttr from lamindb.models import Registry, Schema CELLxGENEOrganisms = Literal[ "human", "mouse", "zebra danio", "rhesus macaquedomestic pig", "chimpanzee", "white-tufted-ear marmoset", "sars-2", ] FieldType = Literal["ontology_id", "name"] def save_cellxgene_defaults() -> None: """Save default values of the CELLxGENE schema to the instance. 
Adds CELLxGENE specific (control) values that are not available in the ontologies: - "normal" Disease - "na" Ethnicity - "unknown" entries for DevelopmentalStage, Phenotype, and CellType - "tissue", "organoid", "primary cell culture", and "cell line" ULabels (tissue_type) - "cell", "nucleus", "na" ULabels (suspension_type) """ import bionty as bt from lamindb.models import ULabel # "normal" in Disease normal = bt.Phenotype.from_source( ontology_id="PATO:0000461", source=bt.Source.get(name="pato", currently_used=True), ) bt.Disease( uid=normal.uid, name=normal.name, ontology_id=normal.ontology_id, description=normal.description, source=normal.source, # not sure ).save() # na, unknown for model, name in zip( [ bt.Ethnicity, bt.Ethnicity, bt.DevelopmentalStage, bt.Phenotype, bt.CellType, ], ["na", "unknown", "unknown", "unknown", "unknown"], ): model(ontology_id=name, name=name, description="From CellxGene schema.").save() # tissue_type tissue_type = ULabel( name="TissueType", is_type=True, description='From CellxGene schema. Is "tissue", "organoid", "primary cell culture", or "cell line".', ).save() for name in ["tissue", "organoid", "primary cell culture", "cell line"]: ULabel(name=name, type=tissue_type, description="From CellxGene schema.").save() # suspension_type suspension_type = ULabel( name="SuspensionType", is_type=True, description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".', ).save() for name in ["cell", "nucleus", "na"]: ULabel( name=name, type=suspension_type, description="From CellxGene schema." ).save() # organisms taxonomy_ids = [ "NCBITaxon:9606", # Homo sapiens (Human) "NCBITaxon:10090", # Mus musculus (House mouse) "NCBITaxon:9544", # Macaca mulatta (Rhesus monkey) "NCBITaxon:9825", # Sus scrofa domesticus (Domestic pig) "NCBITaxon:9598", # Pan troglodytes (Chimpanzee) "NCBITaxon:9483", # Callithrix jacchus (White-tufted-ear marmoset) "NCBITaxon:7955", # Danio rerio (Zebrafish) ] for ontology_id in taxonomy_ids: bt.Organism.from_source( ontology_id=ontology_id, source=bt.Source.get(name="ncbitaxon", currently_used=True), ).save() def create_cellxgene_schema( *, field_types: FieldType | Collection[FieldType] = "ontology_id", spatial_library_id: str | None = None, organism: CELLxGENEOrganisms = "human", ) -> Schema: """Generates a :class:`~lamindb.Schema` for a specific CELLxGENE schema version. Args: field_types: One or several of 'ontology_id', 'name'. organism: The organism of the Schema. library_id: Identifier for the spatial library. Specifying this value enables curation against spatial requirements. 
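Example (a minimal sketch; ``adata`` is an illustrative AnnData object assumed to follow CELLxGENE conventions)::

    import lamindb as ln

    # register the CELLxGENE control values once per instance
    ln.examples.cellxgene.save_cellxgene_defaults()

    # build a schema validating ontology-id columns, then curate
    schema = ln.examples.cellxgene.create_cellxgene_schema(field_types="ontology_id")
    curator = ln.curators.AnnDataCurator(adata, schema)
    curator.validate()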
""" import bionty as bt from lamindb.models import Feature, Schema, ULabel class CategorySpec(NamedTuple): field: str | FieldAttr | list[Registry] default: str | None needs_organism: bool = False categoricals_to_spec: dict[str, CategorySpec] = { "assay": CategorySpec(bt.ExperimentalFactor.name, None, False), "assay_ontology_term_id": CategorySpec( bt.ExperimentalFactor.ontology_id, None, False ), "cell_type": CategorySpec(bt.CellType.name, "unknown", False), "cell_type_ontology_term_id": CategorySpec( bt.CellType.ontology_id, None, False ), "development_stage": CategorySpec(bt.DevelopmentalStage.name, "unknown", True), "development_stage_ontology_term_id": CategorySpec( bt.DevelopmentalStage.ontology_id, None, True ), "disease": CategorySpec(bt.Disease.name, "normal", False), "disease_ontology_term_id": CategorySpec(bt.Disease.ontology_id, None, False), "self_reported_ethnicity": CategorySpec(bt.Ethnicity.name, "unknown", False), "self_reported_ethnicity_ontology_term_id": CategorySpec( bt.Ethnicity.ontology_id, None, False ), "sex": CategorySpec(bt.Phenotype.name, "unknown", False), "sex_ontology_term_id": CategorySpec(bt.Phenotype.ontology_id, None, False), "suspension_type": CategorySpec(ULabel.name, "cell", False), "tissue": CategorySpec(bt.Tissue.name, None, False), "tissue_ontology_term_id": CategorySpec( [bt.Tissue.ontology_id, bt.CellType.ontology_id], None, False ), "tissue_type": CategorySpec(ULabel.name, "tissue", False), "organism": CategorySpec(bt.Organism.scientific_name, None, False), "organism_ontology_term_id": CategorySpec(bt.Organism.ontology_id, None, False), "donor_id": CategorySpec(str, "unknown", False), } def _get_source_cat_filters( field: str | FieldAttr | type[Registry], *, needs_organism: bool | None = None ) -> dict | None: """Some ontology are organism specific and their Features therefore need a `cat_filter`.""" if isinstance(field, str) or not needs_organism: return None registry = field.field.model if hasattr(field, "field") else field entity = f"bionty.{registry.__name__}" filters = {"entity": entity, "currently_used": True} if needs_organism: filters["organism"] = organism return {"source": bt.Source.filter(**filters).one()} field_types_set = ( {field_types} if isinstance(field_types, str) else set(field_types) ) if field_types_set == {"ontology_id"}: categoricals = { k: v.field for k, v in categoricals_to_spec.items() if k.endswith("_ontology_term_id") or k == "donor_id" } elif field_types_set == {"name"}: categoricals = { k: v.field for k, v in categoricals_to_spec.items() if not k.endswith("_ontology_term_id") and k != "donor_id" } elif field_types_set == {"name", "ontology_id"}: categoricals = {k: v.field for k, v in categoricals_to_spec.items()} else: raise ValueError( f"Invalid field_types: {field_types}. Must contain 'ontology_id', 'name', or both." 
) organism_fields = {"organism", "organism_ontology_term_id"} obs_categoricals = { k: v for k, v in categoricals.items() if k not in organism_fields } var_schema = Schema( name="var of CELLxGENE", index=Feature( name="var_index", dtype=bt.Gene.ensembl_gene_id, cat_filters=_get_source_cat_filters( bt.Gene.ensembl_gene_id, needs_organism=True ), ).save(), itype=Feature, features=[Feature(name="feature_is_filtered", dtype=bool).save()], dtype="DataFrame", coerce=True, ).save() obs_features = [] for field in obs_categoricals: if field == "var_index": continue dtype = obs_categoricals[field] needs_organism = categoricals_to_spec[field].needs_organism cat_filters: dict | list[dict] | None if isinstance(dtype, list): cat_filters = ( [ _get_source_cat_filters(d, needs_organism=needs_organism) for d in dtype ] if needs_organism else None ) elif not isinstance(dtype, str): cat_filters = _get_source_cat_filters(dtype, needs_organism=needs_organism) else: cat_filters = None obs_features.append( Feature( # type: ignore name=field, dtype=dtype, default_value=categoricals_to_spec[field].default, cat_filters=cat_filters, # type: ignore ).save() ) for name in ["is_primary_data", "suspension_type", "tissue_type"]: obs_features.append(Feature(name=name, dtype=ULabel.name).save()) obs_schema = Schema( name=f"obs of CELLxGENE of {field_types}", features=obs_features, otype="DataFrame", minimal_set=True, coerce=True, ).save() slots = {"var": var_schema, "obs": obs_schema} uns_categoricals = {k: v for k, v in categoricals.items() if k in organism_fields} uns_features = [ Feature( name=field, dtype=uns_categoricals[field], default_value=categoricals_to_spec[field].default, ).save() for field in uns_categoricals ] uns_schema = Schema( name="uns of CELLxGENE version", features=uns_features, otype="DataFrame", minimal_set=True, coerce=True, ).save() slots["uns"] = uns_schema # Add spatial validation if library_id is provided if spatial_library_id: scalefactors_schema = Schema( name=f"scalefactors of spatial {spatial_library_id}", features=[ Feature(name="spot_diameter_fullres", dtype=float).save(), Feature(name="tissue_hires_scalef", dtype=float).save(), ], ).save() spatial_schema = Schema( name="CELLxGENE spatial metadata", features=[ Feature( name="is_single", dtype=bool, description="True if dataset represents single spatial unit (tissue section for Visium, array for Slide-seqV2)", ).save() ], ).save() slots["uns:spatial"] = spatial_schema slots[f"uns:spatial:{spatial_library_id}:scalefactors"] = scalefactors_schema # Spatial library ID must be in the name # Otherwise, we have lookup side effects where other existing Spatial Library IDs make it into the Schema schema_name = f"CELLxGENE AnnData of {', '.join(field_types) if isinstance(field_types, list) else field_types}" if spatial_library_id: schema_name += f" ({spatial_library_id})" full_cxg_schema = Schema( name=schema_name, otype="AnnData", minimal_set=True, coerce=True, slots=slots, ).save() return full_cxg_schema ================================================ FILE: lamindb/examples/croissant/__init__.py ================================================ """Examples for MLCommons Croissant files, which are used to store metadata about datasets. .. autofunction:: mini_immuno """ import json from pathlib import Path def mini_immuno( n_files: int = 1, filepath_prefix: str = "", strip_version: bool = False ) -> list[Path]: """Return paths to the mini immuno dataset and its metadata as a Croissant file. Args: n_files: Number of files inside the croissant file. 
filepath_prefix: Move the dataset and references to it in a specific directory. Example :: croissant_path, dataset1_path = ln.examples.croissant.mini_immuno() croissant_path, dataset1_path, dataset2_path = ln.examples.croissant.mini_immuno(n_files=2) """ from ..datasets import file_mini_csv from ..datasets.mini_immuno import get_dataset1 adata = get_dataset1(otype="AnnData") if filepath_prefix: dataset1_path = Path(filepath_prefix) / "mini_immuno.anndata.zarr" else: dataset1_path = Path("mini_immuno.anndata.zarr") adata.write_zarr(dataset1_path) orig_croissant_path = ( Path(__file__).parent / "mini_immuno.anndata.zarr_metadata.json" ) with open(orig_croissant_path, encoding="utf-8") as f: data = json.load(f) if filepath_prefix: assert data["distribution"][0]["@id"] == "mini_immuno.anndata.zarr" # noqa: S101 data["distribution"][0]["@id"] = str(Path(filepath_prefix) / dataset1_path.name) if strip_version: data.pop("version", None) if n_files == 2: file_mini_csv() if filepath_prefix: dataset2_path = Path(filepath_prefix) / "mini.csv" else: dataset2_path = Path("mini.csv") data["distribution"].append( { "@type": "sc:FileObject", "@id": dataset2_path.as_posix(), "name": "mini.csv", "encodingFormat": "text/csv", } ) croissant_path = Path("mini_immuno.anndata.zarr_metadata.json") with open(croissant_path, "w", encoding="utf-8") as f: json.dump(data, f, indent=2) result: list[Path] = [croissant_path, dataset1_path] if n_files == 1: return result result.append(dataset2_path) return result ================================================ FILE: lamindb/examples/croissant/mini_immuno.anndata.zarr_metadata.json ================================================ { "@context": { "@vocab": "https://schema.org/", "cr": "https://mlcommons.org/croissant/", "ml": "http://ml-schema.org/", "sc": "https://schema.org/", "dct": "http://purl.org/dc/terms/", "data": "https://mlcommons.org/croissant/data/", "rai": "https://mlcommons.org/croissant/rai/", "format": "https://mlcommons.org/croissant/format/", "citeAs": "https://mlcommons.org/croissant/citeAs/", "conformsTo": "https://mlcommons.org/croissant/conformsTo/", "@language": "en", "repeated": "https://mlcommons.org/croissant/repeated/", "field": "https://mlcommons.org/croissant/field/", "examples": "https://mlcommons.org/croissant/examples/", "recordSet": "https://mlcommons.org/croissant/recordSet/", "fileObject": "https://mlcommons.org/croissant/fileObject/", "fileSet": "https://mlcommons.org/croissant/fileSet/", "source": "https://mlcommons.org/croissant/source/", "references": "https://mlcommons.org/croissant/references/", "key": "https://mlcommons.org/croissant/key/", "parentField": "https://mlcommons.org/croissant/parentField/", "isLiveDataset": "https://mlcommons.org/croissant/isLiveDataset/", "separator": "https://mlcommons.org/croissant/separator/", "extract": "https://mlcommons.org/croissant/extract/", "subField": "https://mlcommons.org/croissant/subField/", "regex": "https://mlcommons.org/croissant/regex/", "column": "https://mlcommons.org/croissant/column/", "path": "https://mlcommons.org/croissant/path/", "fileProperty": "https://mlcommons.org/croissant/fileProperty/", "md5": "https://mlcommons.org/croissant/md5/", "jsonPath": "https://mlcommons.org/croissant/jsonPath/", "transform": "https://mlcommons.org/croissant/transform/", "replace": "https://mlcommons.org/croissant/replace/", "dataType": "https://mlcommons.org/croissant/dataType/", "includes": "https://mlcommons.org/croissant/includes/", "excludes": "https://mlcommons.org/croissant/excludes/" 
}, "@type": "Dataset", "name": "Mini immuno dataset", "description": "A few samples from the immunology dataset", "url": "https://lamin.ai/laminlabs/lamindata/artifact/tCUkRcaEjTjhtozp0000", "creator": { "@type": "Person", "name": "falexwolf" }, "dateCreated": "2025-07-16", "cr:projectName": "Mini Immuno Project", "datePublished": "2025-07-16", "version": "1.0", "license": "https://creativecommons.org/licenses/by/4.0/", "citation": "Please cite this dataset as: mini immuno (2025)", "encodingFormat": "zarr", "distribution": [ { "@type": "cr:FileSet", "@id": "mini_immuno.anndata.zarr", "containedIn": { "@id": "directory" }, "encodingFormat": "zarr" } ], "cr:recordSet": [ { "@type": "cr:RecordSet", "@id": "#samples", "name": "samples", "description": "my sample" } ] } ================================================ FILE: lamindb/examples/datasets/__init__.py ================================================ """Example datasets. The mini immuno dataset ----------------------- .. autosummary:: :toctree: . mini_immuno Small in-memory datasets ------------------------ .. autofunction:: anndata_with_obs Files ----- .. autofunction:: file_fcs .. autofunction:: file_fcs_alpert19 .. autofunction:: file_tsv_rnaseq_nfcore_salmon_merged_gene_counts .. autofunction:: file_jpg_paradisi05 .. autofunction:: file_tiff_suo22 .. autofunction:: file_fastq .. autofunction:: file_bam .. autofunction:: file_mini_csv Directories ----------- .. autofunction:: dir_scrnaseq_cellranger .. autofunction:: dir_iris_images Dictionary, Dataframe, AnnData, MuData, SpatialData ---------------------------------------------------- .. autofunction:: dict_cellxgene_uns .. autofunction:: df_iris .. autofunction:: df_iris_in_meter .. autofunction:: df_iris_in_meter_study1 .. autofunction:: df_iris_in_meter_study2 .. autofunction:: anndata_mouse_sc_lymph_node .. autofunction:: anndata_human_immune_cells .. autofunction:: anndata_pbmc68k_reduced .. autofunction:: anndata_file_pbmc68k_test .. autofunction:: anndata_pbmc3k_processed .. autofunction:: anndata_suo22_Visium10X .. autofunction:: anndata_visium_mouse_cellxgene .. autofunction:: mudata_papalexi21_subset .. autofunction:: schmidt22_crispra_gws_IFNG .. autofunction:: schmidt22_perturbseq .. autofunction:: spatialdata_blobs Other ----- .. autofunction:: fake_bio_notebook_titles """ import importlib.util import sys from typing import TYPE_CHECKING if TYPE_CHECKING: from . 
import mini_immuno from ._core import ( anndata_file_pbmc68k_test, anndata_human_immune_cells, anndata_mouse_sc_lymph_node, anndata_pbmc3k_processed, anndata_pbmc68k_reduced, anndata_suo22_Visium10X, anndata_visium_mouse_cellxgene, df_iris, df_iris_in_meter, df_iris_in_meter_study1, df_iris_in_meter_study2, dict_cellxgene_uns, dir_iris_images, dir_scrnaseq_cellranger, file_bam, file_fastq, file_fcs, file_fcs_alpert19, file_jpg_paradisi05, file_mini_csv, file_tiff_suo22, file_tsv_rnaseq_nfcore_salmon_merged_gene_counts, mudata_papalexi21_subset, schmidt22_crispra_gws_IFNG, schmidt22_perturbseq, spatialdata_blobs, ) from ._fake import fake_bio_notebook_titles from ._small import anndata_with_obs, small_dataset3_cellxgene from .mini_immuno import get_dataset1 as small_dataset1 from .mini_immuno import get_dataset2 as small_dataset2 def __getattr__(name: str): """Lazy-import datasets to avoid loading pandas/anndata at package import.""" if name == "mini_immuno": # Use importlib to avoid __getattr__ recursion when importing submodule spec = importlib.util.find_spec( "lamindb.examples.datasets.mini_immuno", package="lamindb.examples.datasets", ) if spec is None or spec.loader is None: raise ImportError("Could not find module mini_immuno") module = importlib.util.module_from_spec(spec) sys.modules["lamindb.examples.datasets.mini_immuno"] = module spec.loader.exec_module(module) return module if name in ("small_dataset1", "small_dataset2"): mini_immuno = importlib.import_module( ".mini_immuno", package="lamindb.examples.datasets" ) return ( mini_immuno.get_dataset1 if name == "small_dataset1" else mini_immuno.get_dataset2 ) _core_names = ( "anndata_file_pbmc68k_test", "anndata_human_immune_cells", "anndata_mouse_sc_lymph_node", "anndata_pbmc3k_processed", "anndata_pbmc68k_reduced", "anndata_suo22_Visium10X", "df_iris", "df_iris_in_meter", "df_iris_in_meter_study1", "df_iris_in_meter_study2", "dict_cellxgene_uns", "dir_iris_images", "dir_scrnaseq_cellranger", "file_bam", "file_fastq", "file_fcs", "file_fcs_alpert19", "file_jpg_paradisi05", "file_mini_csv", "file_tiff_suo22", "file_tsv_rnaseq_nfcore_salmon_merged_gene_counts", "mudata_papalexi21_subset", "schmidt22_crispra_gws_IFNG", "schmidt22_perturbseq", "spatialdata_blobs", "anndata_visium_mouse_cellxgene", ) if name in _core_names: _core = importlib.import_module("._core", package="lamindb.examples.datasets") return getattr(_core, name) if name in ("anndata_with_obs", "small_dataset3_cellxgene"): _small = importlib.import_module("._small", package="lamindb.examples.datasets") return getattr(_small, name) if name == "fake_bio_notebook_titles": _fake = importlib.import_module("._fake", package="lamindb.examples.datasets") return _fake.fake_bio_notebook_titles raise AttributeError(f"module {__name__!r} has no attribute {name!r}") __all__ = [ "mini_immuno", "small_dataset1", "small_dataset2", "small_dataset3_cellxgene", "anndata_with_obs", "anndata_file_pbmc68k_test", "anndata_human_immune_cells", "anndata_mouse_sc_lymph_node", "anndata_pbmc3k_processed", "anndata_pbmc68k_reduced", "anndata_suo22_Visium10X", "anndata_visium_mouse_cellxgene", "df_iris", "df_iris_in_meter", "df_iris_in_meter_study1", "df_iris_in_meter_study2", "dict_cellxgene_uns", "dir_iris_images", "dir_scrnaseq_cellranger", "fake_bio_notebook_titles", "file_bam", "file_fastq", "file_fcs", "file_fcs_alpert19", "file_jpg_paradisi05", "file_mini_csv", "file_tiff_suo22", "file_tsv_rnaseq_nfcore_salmon_merged_gene_counts", "mudata_papalexi21_subset", "schmidt22_crispra_gws_IFNG", 
"schmidt22_perturbseq", "spatialdata_blobs", ] ================================================ FILE: lamindb/examples/datasets/_core.py ================================================ from __future__ import annotations from pathlib import Path from typing import TYPE_CHECKING, Any from urllib.request import urlretrieve import anndata as ad import pandas as pd from upath import UPath from lamindb.base.uids import base62 from lamindb.core._settings import settings if TYPE_CHECKING: from mudata import MuData from spatialdata import SpatialData def file_fcs() -> Path: """Example FCS artifact.""" filepath, _ = urlretrieve( "https://lamindb-dev-datasets.s3.amazonaws.com/.lamindb/DBNEczSgBui0bbzBXMGH.fcs", "example.fcs", ) return Path(filepath) def file_fcs_alpert19(populate_registries: bool = False) -> Path: """FCS file from Alpert19. Args: populate_registries: pre-populate metadata records to simulate existing registries # noqa """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/Alpert19-070314-Mike-Study+15-2013-plate+1-15-004-1-13_cells_found.fcs", "Alpert19.fcs", ) if populate_registries: import bionty as bt import readfcs import lamindb as ln verbosity = ln.settings.verbosity ln.settings.verbosity = "error" adata = readfcs.read(filepath) std = bt.CellMarker.public().standardize(adata.var.index) ln.save( bt.CellMarker.from_values( bt.CellMarker.public().inspect(std, "name").validated, "name" ) ) ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save() # type: ignore ln.Feature(name="organism", dtype=[bt.Organism]).save() # type: ignore ln.settings.verbosity = verbosity return Path(filepath) def file_jpg_paradisi05() -> Path: """JPG file example. Originally from: https://upload.wikimedia.org/wikipedia/commons/2/28/Laminopathic_nuclei.jpg """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/Laminopathic_nuclei.jpg", "paradisi05_laminopathic_nuclei.jpg", ) return Path(filepath) def file_tsv_rnaseq_nfcore_salmon_merged_gene_counts( populate_registries: bool = False, ) -> Path: """Gene counts table from nf-core RNA-seq pipeline. Output of: https://nf-co.re/rnaseq """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/salmon.merged.gene_counts.tsv", "salmon.merged.gene_counts.tsv", ) if populate_registries: import bionty as bt import lamindb as ln verbosity = ln.settings.verbosity ln.settings.verbosity = "error" ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save() # type: ignore ln.Feature(name="organism", dtype=[bt.Organism]).save() # type: ignore bt.ExperimentalFactor.from_source(ontology_id="EFO:0008896").save() ln.settings.verbosity = verbosity return Path(filepath) def file_fastq(in_storage_root=False) -> Path: """Mini mock fastq artifact.""" basedir = Path() if not in_storage_root else settings.storage.root filepath = basedir / "input.fastq.gz" with open(filepath, "w") as f: f.write("Mock fastq artifact.") return filepath def file_bam(in_storage_root=False) -> Path: """Mini mock bam artifact.""" basedir = Path() if not in_storage_root else settings.storage.root filepath = basedir / "output.bam" with open(filepath, "w") as f: f.write("Mock bam artifact.") return filepath def file_mini_csv(in_storage_root=False) -> Path: """Mini csv artifact.""" basedir = Path() if not in_storage_root else settings.storage.root filepath = basedir / "mini.csv" df = pd.DataFrame([1, 2, 3], columns=["test"]) df.to_csv(filepath, index=False) return filepath def file_tiff_suo22() -> Path: """Image file from Suo22. 
Pair with anndata_suo22_Visium10X """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/F121_LP1_4LIV.tiff", "F121_LP1_4LIV.tiff", ) Path("suo22/").mkdir(exist_ok=True) filepath = Path(filepath).rename("suo22/F121_LP1_4LIV.tiff") # type: ignore return Path(filepath) def dir_iris_images() -> UPath: """Directory with 3 studies of the Iris flower: 405 images & metadata. Provenance: https://lamin.ai/laminlabs/lamindata/transform/3q4MpQxRL2qZ5zKv The problem is that the same artifact was also ingested by the downstream demo notebook: https://lamin.ai/laminlabs/lamindata/transform/NJvdsWWbJlZS5zKv This is why on the UI, the artifact shows up as output of the downstream demo notebook rather than the upstream curation notebook. The lineage information should still be captured by https://github.com/laminlabs/lnschema-core/blob/a90437e91dfbd6b9002f18c3e978bd0f9c9a632d/lamindb/models.py#L2050-L2052 but we don't use this in the UI yet. """ return UPath("s3://lamindata/iris_studies") def anndata_mouse_sc_lymph_node( populate_registries: bool = False, ) -> ad.AnnData: """Mouse lymph node scRNA-seq collection from EBI. Subsampled to 10k genes. From: https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-8414/ Args: populate_registries: pre-populate metadata records to simulate existing registries # noqa """ filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/E-MTAB-8414.h5ad") adata = ad.read_h5ad(filepath) # The column names are a bit lengthy, let's abbreviate them: adata.obs.columns = ( adata.obs.columns.str.replace("Sample Characteristic", "") .str.replace("Factor Value ", "Factor Value:", regex=True) .str.replace("Factor Value\\[", "Factor Value:", regex=True) .str.replace(" Ontology Term\\[", "ontology_id:", regex=True) .str.strip("[]") .str.replace("organism part", "tissue") .str.replace("organism", "organism") .str.replace("developmental stage", "developmental_stage") .str.replace("cell type", "cell_type") # the last one could be interesting, too # .str.replace("Factor Value:Ontology Term[inferred cell_type - authors labels", "cell_type_authors") ) # subset columns to only the ones with names columns = [ col for col in adata.obs.columns if not col.startswith("ontology_id") and not col.startswith("Factor Value") and col != "strain" ] adata.obs = adata.obs[columns] # pre-populate registries if populate_registries: import bionty as bt import lamindb as ln verbosity = ln.settings.verbosity ln.settings.verbosity = "error" # strain bt.ExperimentalFactor.from_source(ontology_id="EFO:0004472").save() # developmental stage bt.ExperimentalFactor.from_source(ontology_id="EFO:0001272").save() # tissue bt.Tissue.from_source(ontology_id="UBERON:0001542").save() # cell types ln.save(bt.CellType.from_values(["CL:0000115", "CL:0000738"], "ontology_id")) # assays ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save() # type: ignore bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save() # genes validated = bt.Gene.public(organism="mouse").validate( adata.var.index, field="ensembl_gene_id" ) ln.save( bt.Gene.from_values( adata.var.index[validated][:-19], field="ensembl_gene_id", organism="mouse", ) ) # labels labels = [] for col in ["sex", "age", "genotype", "immunophenotype"]: labels += [ln.ULabel(name=name) for name in adata.obs[col]] ln.save(labels) ln.settings.verbosity = verbosity return adata def anndata_pbmc68k_reduced() -> ad.AnnData: """Modified from scanpy.collections.pbmc68k_reduced(). 
This code was run:: pbmc68k = sc.datasets.pbmc68k_reduced() pbmc68k.obs.rename(columns={"bulk_labels": "cell_type"}, inplace=True) pbmc68k.obs["cell_type"] = pbmc68k.obs["cell_type"].cat.rename_categories( {"Dendritic": "Dendritic cells", "CD14+ Monocyte": "CD14+ Monocytes"} ) del pbmc68k.obs["G2M_score"] del pbmc68k.obs["S_score"] del pbmc68k.obs["phase"] del pbmc68k.obs["n_counts"] del pbmc68k.var["dispersions"] del pbmc68k.var["dispersions_norm"] del pbmc68k.var["means"] del pbmc68k.uns["rank_genes_groups"] del pbmc68k.uns["bulk_labels_colors"] sc.pp.subsample(pbmc68k, fraction=0.1, random_state=123) pbmc68k.write("scrnaseq_pbmc68k_tiny.h5ad") """ filepath, _ = urlretrieve( "https://lamindb-dev-datasets.s3.amazonaws.com/scrnaseq_pbmc68k_tiny.h5ad" ) return ad.read_h5ad(filepath) def anndata_file_pbmc68k_test() -> Path: """Modified from scanpy.datasets.pbmc68k_reduced(). Additional slots were added for testing purposes. Returns the filepath. To reproduce:: pbmc68k = ln.examples.datasets.anndata_pbmc68k_reduced() pbmc68k_test = pbmc68k[:30, :200].copy() pbmc68k_test.raw = pbmc68k_test[:, :100] pbmc68k_test.obsp["test"] = sparse.eye(pbmc68k_test.shape[0], format="csr") pbmc68k_test.varp["test"] = sparse.eye(pbmc68k_test.shape[1], format="csr") pbmc68k_test.layers["test"] = sparse.csr_matrix(pbmc68k_test.shape) pbmc68k_test.layers["test"][0] = 1. pbmc68k_test.write("pbmc68k_test.h5ad") """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/pbmc68k_test.h5ad", "pbmc68k_test.h5ad" ) return Path(filepath) def anndata_pbmc3k_processed() -> ad.AnnData: """Modified from scanpy.datasets.pbmc3k_processed().""" filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/scrnaseq_scanpy_pbmc3k_processed.h5ad" ) pbmc3k = ad.read_h5ad(filepath) pbmc3k.obs.rename(columns={"louvain": "cell_type"}, inplace=True) return pbmc3k def anndata_human_immune_cells( populate_registries: bool = False, ) -> ad.AnnData: """Cross-tissue immune cell analysis reveals tissue-specific features in humans.
From: https://cellxgene.cziscience.com/collections/62ef75e4-cbea-454e-a0ce-998ec40223d3 Collection: Global To reproduce the subsample:: >>> adata = sc.read('Global.h5ad') >>> adata.obs = adata.obs[['donor_id', 'tissue', 'cell_type', 'assay', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id']].copy() >>> sc.pp.subsample(adata, fraction=0.005) >>> del adata.uns["development_cache_ontology_term_id_colors"] >>> del adata.uns["sex_ontology_term_id_colors"] >>> adata.write('human_immune.h5ad') """ filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/human_immune.h5ad") adata = ad.read_h5ad(filepath) adata.var.drop(columns=["gene_symbols", "feature_name"], inplace=True) adata.uns.pop("cell_type_ontology_term_id_colors") adata.uns.pop("title") adata.uns.pop("schema_version") adata.obs.columns = adata.obs.columns.str.replace("donor_id", "donor") columns = [col for col in adata.obs.columns if "ontology_term" not in col] adata.obs = adata.obs[columns] if populate_registries: import bionty as bt import lamindb as ln ln.save( bt.Gene.from_values( adata.var.index, field="ensembl_gene_id", organism="human" ) ) ln.save(bt.CellType.from_values(adata.obs.cell_type, field="name")) ln.save(bt.ExperimentalFactor.from_values(adata.obs.assay, field="name")) ln.save(bt.Tissue.from_values(adata.obs.tissue, field="name")) ln.Feature(name="cell_type", dtype=[bt.CellType]).save() # type: ignore ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save() # type: ignore ln.Feature(name="tissue", dtype=[bt.Tissue]).save() # type: ignore ln.Feature(name="organism", dtype=[bt.Organism]).save() # type: ignore ln.Feature(name="donor", dtype=[ln.ULabel]).save() # type: ignore bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save() ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()]) return adata def anndata_suo22_Visium10X(): """AnnData from Suo22 generated by 10x Visium.""" import anndata as ad filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/suo22_Visium10X_data_LI_subset.h5ad", "Visium10X_data_LI_subset.h5ad", ) Path("suo22/").mkdir(exist_ok=True) filepath = Path(filepath).rename("suo22/Visium10X_data_LI_subset.h5ad") return ad.read_h5ad(filepath) def mudata_papalexi21_subset(with_uns: bool = False) -> MuData: """A subsetted MuData from papalexi21. 
To reproduce the subsetting: >>> !wget https://figshare.com/ndownloader/files/36509460 >>> import mudata as md >>> import scanpy as sc >>> mdata = md.read_h5mu("36509460") >>> mdata = sc.pp.subsample(mdata, n_obs=200, copy=True)[0] >>> mdata[:, -300:].copy().write("papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu") """ import mudata as md md.set_options(pull_on_update=False) filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/papalexi21_subset_200x300_lamindb_demo_2023-07-25.h5mu", "papalexi21_subset.h5mu", ) mdata = md.read_h5mu(filepath) mdata.pull_obs() # The MuData object is malformed with duplicated information # Drop all columns for the modalities and add them again correspondingly for mod in ["rna", "adt", "hto", "gdo"]: mdata[mod].obs.drop(mdata[mod].obs.columns, axis=1, inplace=True) for col in mdata.obs.columns: for mod in ["rna", "adt", "hto", "gdo"]: if col.endswith(f"_{mod.upper()}"): new_col = col.replace(f"{mod}:", "") if new_col != col: mdata[mod].obs[new_col] = mdata.obs.pop(col) else: new_col = col.replace(f"{mod}:", "") if new_col not in mdata.obs.columns and col in mdata.obs.columns: mdata.obs[new_col] = mdata.obs.pop(col) for col in mdata.obs.columns: for mod in ["rna", "adt", "hto", "gdo"]: if col.endswith(f"_{mod.upper()}"): del mdata.obs[col] for col in [ "orig.ident", "MULTI_ID", "NT", "S.Score", "G2M.Score", "Phase", "gene_target", "guide_ID", "HTO_classification", ]: del mdata.obs[col] mdata.push_obs(["percent.mito"], mods=["rna"], drop=True) mdata["hto"].obs["technique"] = "cell hashing" mdata["hto"].obs["technique"] = mdata["hto"].obs["technique"].astype("category") mdata.pull_obs(["technique"], mods="hto") if with_uns: mdata.uns["study_metadata"] = { "temperature": 21.6, "experiment": "Experiment 1", } mdata["rna"].uns["site_metadata"] = {"pos": 99.9, "site_id": "SITE001"} return mdata def dict_cellxgene_uns() -> dict[str, Any]: """An example CELLxGENE AnnData `.uns` dictionary.""" uns = { "organism_ontology_term_id": "NCBITaxon:9606", "spatial": { "is_single": True, "library_1": { # Dynamic library_id key "images": { "fullres": "path/to/fullres.jpg", "hires": "path/to/hires.jpg", }, "scalefactors": { "spot_diameter_fullres": 89.43, "tissue_hires_scalef": 0.177, }, }, "library_2": { # Another dynamic library_id key "images": { "fullres": "path/to/fullres_2.jpg", "hires": "path/to/hires_2.jpg", }, "scalefactors": { "spot_diameter_fullres": 120.34, "tissue_hires_scalef": 0.355, }, }, }, } return uns def df_iris() -> pd.DataFrame: """The iris collection as in sklearn. 
Original code:: sklearn.collections.load_iris(as_frame=True).frame """ filepath, _ = urlretrieve("https://lamindb-test.s3.amazonaws.com/iris.parquet") return pd.read_parquet(filepath) def df_iris_in_meter() -> pd.DataFrame: """The iris collection with lengths in meter.""" df = df_iris() # rename columns df.rename( columns={ "sepal length (cm)": "sepal_length", "sepal width (cm)": "sepal_width", "petal length (cm)": "petal_length", "petal width (cm)": "petal_width", }, inplace=True, ) df[["sepal_length", "sepal_width", "petal_length", "petal_width"]] /= 100 df["iris_organism_name"] = df["target"].map( {0: "setosa", 1: "versicolor", 2: "virginica"} ) del df["target"] return df def df_iris_in_meter_study1() -> pd.DataFrame: """The iris collection with lengths in meter.""" df_iris = df_iris_in_meter() return df_iris.iloc[: len(df_iris) // 2] def df_iris_in_meter_study2() -> pd.DataFrame: """The iris collection with lengths in meter.""" df_iris = df_iris_in_meter() return df_iris.iloc[len(df_iris) // 2 :] def dir_scrnaseq_cellranger( sample_name: str, basedir: str | Path = "./", output_only: bool = True ) -> Path: """Mock cell ranger outputs. Args: sample_name: name of the sample basedir: run directory output_only: only return output files """ basedir = Path(basedir) if not output_only: fastqdir = basedir / "fastq" fastqdir.mkdir(parents=True, exist_ok=True) fastqfile1 = fastqdir / f"{sample_name}_R1_001.fastq.gz" with open(fastqfile1, "w") as f: f.write(f"{base62(n_char=6)}") fastqfile2 = fastqdir / f"{sample_name}_R2_001.fastq.gz" fastqfile2.touch(exist_ok=True) with open(fastqfile2, "w") as f: f.write(f"{base62(n_char=6)}") sampledir = basedir / f"{sample_name}" for folder in ["raw_feature_bc_matrix", "filtered_feature_bc_matrix", "analysis"]: filedir = sampledir / folder filedir.mkdir(parents=True, exist_ok=True) for filename in [ "web_summary.html", "metrics_summary.csv", "possorted_genome_bam.bam", "possorted_genome_bam.bam.bai", "molecule_info.h5", "cloupe.cloupe", "raw_feature_bc_matrix.h5", "raw_feature_bc_matrix/barcodes.tsv.gz", "raw_feature_bc_matrix/features.tsv.gz", "raw_feature_bc_matrix/matrix.mtx.gz", "filtered_feature_bc_matrix.h5", "filtered_feature_bc_matrix/barcodes.tsv.gz", "filtered_feature_bc_matrix/features.tsv.gz", "filtered_feature_bc_matrix/matrix.mtx.gz", "analysis/analysis.csv", ]: file = sampledir / filename with open(file, "w") as f: f.write(f"{base62(n_char=6)}") return sampledir def schmidt22_crispra_gws_IFNG(basedir=".") -> Path: """CRISPRi screen collection of Schmidt22. Originally from: https://zenodo.org/record/5784651 """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/schmidt22-crispra-gws-IFNG.csv", "schmidt22-crispra-gws-IFNG.csv", ) return Path(filepath).rename(Path(basedir) / filepath) def schmidt22_perturbseq(basedir=".") -> Path: """Perturb-seq collection of Schmidt22. 
Subsampled and converted to h5ad from R file: https://zenodo.org/record/5784651 To reproduce the subsample: >>> adata = sc.read('HuTcellsCRISPRaPerturbSeq_Re-stimulated.h5ad') >>> adata.obs = adata.obs[['cluster_name']] >>> del adata.obsp >>> del adata.var['features'] >>> del adata.obsm['X_pca'] >>> del adata.uns >>> del adata.raw >>> del adata.varm >>> adata.obs = adata.obs.reset_index() >>> del adata.obs['index'] >>> sc.pp.subsample(adata, 0.03) >>> adata.write('schmidt22_perturbseq.h5ad') """ filepath, _ = urlretrieve( "https://lamindb-test.s3.amazonaws.com/schmidt22_perturbseq.h5ad", "schmidt22_perturbseq.h5ad", ) return Path(filepath).rename(Path(basedir) / filepath) def anndata_visium_mouse_cellxgene() -> ad.AnnData: """Visium samples of thymus from wild type B6 mice 3-6 weeks old. The dataset is a CELLxGENE schema 7.0.0 validated dataset. """ filepath, _ = urlretrieve( "https://datasets.cellxgene.cziscience.com/74f5c380-081f-41e4-9f05-346831fb67e8.h5ad", "zhang_2024_pcw56_visium.h5ad", ) return ad.read_h5ad(filepath) def spatialdata_blobs() -> SpatialData: """Example SpatialData dataset for tutorials.""" from spatialdata.datasets import blobs sdata = blobs() sdata.attrs["bio"] = { "disease": "Alzheimer disease", "developmental_stage": "adult stage", } sdata.attrs["tech"] = { "assay": "Visium Spatial Gene Expression", } sdata.attrs["random_int"] = 20 sdata.tables["table"].var.index = [ "ENSG00000139618", # BRCA2 "ENSG00000157764", # BRAF "ENSG00000999999", # Does not exist ] sdata.tables["table"].obs["sample_region"] = pd.Categorical( ["sample region 1"] * 13 + ["sample region 2"] * 13 ) return sdata ================================================ FILE: lamindb/examples/datasets/_fake.py ================================================ from __future__ import annotations def fake_bio_notebook_titles(n=100) -> list[str]: """A fake collection of study titles.""" from faker import Faker fake = Faker() from faker_biology.mol_biol import Antibody from faker_biology.physiology import CellType, Organ, Organelle fake.add_provider(CellType) fake.add_provider(Organ) fake.add_provider(Organelle) fake.add_provider(Antibody) my_words = [ "study", "investigate", "research", "result", "cluster", "rank", "candidate", "visualize", "efficiency", "classify", ] my_words += [fake.organ() for i in range(5)] + ["intestine", "intestinal"] my_words += [fake.celltype() for i in range(10)] my_words += [fake.antibody_isotype() for i in range(20)] my_notebook_titles = [fake.sentence(ext_word_list=my_words) for i in range(n)] return my_notebook_titles ================================================ FILE: lamindb/examples/datasets/_small.py ================================================ from __future__ import annotations from typing import Any, Literal import anndata as ad import numpy as np import pandas as pd def small_dataset3_cellxgene( otype: Literal["DataFrame", "AnnData"] = "AnnData", *, with_obs_defaults: bool = False, with_var_typo: bool = False, with_obs_typo: bool = False, with_uns_organism: bool = False, with_uns_spatial: bool = False, ) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData: var_id = "invalid_ensembl_id" if with_var_typo else "ENSG00000000457" var_ids = [var_id, "ENSG00000000419", "ENSG00000139618"] lung_id = "UBERON:0002048XXX" if with_obs_typo else "UBERON:0002048" obs_data = { "disease_ontology_term_id": [ "MONDO:0004975", "MONDO:0004980", "MONDO:0004980", ], "development_stage_ontology_term_id": ["unknown", "unknown", "unknown"], "sex_ontology_term_id": ["PATO:0000383", 
"PATO:0000384", "unknown"], "tissue_ontology_term_id": [lung_id, lung_id, "UBERON:0000948"], "cell_type": ["T cell", "B cell", "B cell"], "self_reported_ethnicity": ["South Asian", "South Asian", "South Asian"], "donor_id": ["-1", "1", "2"], "is_primary_data": [False, False, False], "suspension_type": ["cell", "cell", "cell"], "tissue_type": ["tissue", "tissue", "tissue"], } obs_df = pd.DataFrame( obs_data, index=["barcode1", "barcode2", "barcode3"], ) var_df = pd.DataFrame( index=var_ids, data={"feature_is_filtered": [False, False, False]} ) X = pd.DataFrame( { var_ids[0]: [2, 3, 3], var_ids[1]: [3, 4, 5], var_ids[2]: [4, 2, 3], }, index=["barcode1", "barcode2", "barcode3"], dtype="float32", ) obs_df["donor_id"] = obs_df["donor_id"].astype("category") if otype == "DataFrame": return pd.concat([X, obs_df], axis=1) else: adata = ad.AnnData(X=X, obs=obs_df, var=var_df) adata.uns["title"] = "CELLxGENE example" adata.obsm["X_pca"] = np.array( [[-1.2, 0.8], [0.5, -0.3], [0.7, -0.5]], dtype="float32" ) # CELLxGENE requires the `.raw` slot to be set - https://github.com/chanzuckerberg/single-cell-curation/issues/1304 adata.raw = adata.copy() adata.raw.var.drop(columns="feature_is_filtered", inplace=True) if with_obs_defaults: adata.obs["cell_type_ontology_term_id"] = [ "CL:0000084", "CL:0000236", "CL:0000236", ] adata.obs["self_reported_ethnicity_ontology_term_id"] = "na" adata.obs["assay_ontology_term_id"] = "EFO:1001982" adata.obs["assay"] = "single-cell RNA sequencing" if with_uns_organism: adata.uns["organism_ontology_term_id"] = "NCBITaxon:9606" adata.uns["organism"] = "Homo sapiens" else: adata.obs["organism_ontology_term_id"] = "NCBITaxon:9606" obs_data["organism"] = ["Homo sapiens", "Homo sapiens", "Homo sapiens"] if with_uns_spatial: adata.uns["spatial"] = { "is_single": True, "library_123": { "scalefactors": { "spot_diameter_fullres": 165.0, "tissue_hires_scalef": 0.5, }, "images": { "hires": np.random.default_rng().integers( 0, 255, (2000, 2000, 3), dtype=np.uint8 ) }, }, } return adata def anndata_with_obs() -> ad.AnnData: """Create a mini anndata with cell_type, disease and tissue.""" import anndata as ad import bionty.base as bionty_base celltypes = ["T cell", "hematopoietic stem cell", "hepatocyte", "my new cell type"] celltype_ids = ["CL:0000084", "CL:0000037", "CL:0000182", ""] diseases = [ "chronic kidney disease", "liver lymphoma", "cardiac ventricle disorder", "Alzheimer disease", ] tissues = ["kidney", "liver", "heart", "brain"] df = pd.DataFrame() df["cell_type"] = celltypes * 10 df["cell_type_id"] = celltype_ids * 10 df["tissue"] = tissues * 10 df["disease"] = diseases * 10 df.index = "obs" + df.index.astype(str) adata = ad.AnnData(X=np.zeros(shape=(40, 100), dtype=np.float32), obs=df) bionty_genes = bionty_base.Gene() # backwards compatible adata.var.index = ( ( bionty_genes.to_dataframe() if hasattr(bionty_genes, "to_dataframe") else bionty_genes.df() ) .head(100)["ensembl_gene_id"] .values ) return adata ================================================ FILE: lamindb/examples/datasets/define_mini_immuno_features_labels.py ================================================ import bionty as bt import lamindb as ln # define valid labels perturbation_type = ln.Record(name="Perturbation", is_type=True).save() ln.Record(name="DMSO", type=perturbation_type).save() ln.Record(name="IFNG", type=perturbation_type).save() bt.CellType.from_source(name="B cell").save() bt.CellType.from_source(name="T cell").save() # define valid features ln.Feature(name="perturbation", 
dtype=perturbation_type).save() ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save() ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save() ln.Feature(name="assay_oid", dtype=bt.ExperimentalFactor.ontology_id).save() ln.Feature(name="concentration", dtype=str).save() ln.Feature(name="treatment_time_h", dtype="num", coerce=True).save() ln.Feature(name="donor", dtype=str, nullable=True).save() ln.Feature(name="donor_ethnicity", dtype=list[bt.Ethnicity]).save() ================================================ FILE: lamindb/examples/datasets/define_mini_immuno_schema_flexible.py ================================================ import lamindb as ln schema = ln.Schema( name="Mini immuno schema", features=[ ln.Feature.get(name="perturbation"), ln.Feature.get(name="cell_type_by_model"), ln.Feature.get(name="assay_oid"), ln.Feature.get(name="donor"), ln.Feature.get(name="concentration"), ln.Feature.get(name="treatment_time_h"), ], flexible=True, # _additional_ columns in a dataframe are validated & annotated ).save() ================================================ FILE: lamindb/examples/datasets/mini_immuno.py ================================================ """Two "mini immuno" datasets. Datasets -------- .. autofunction:: get_dataset1 .. autofunction:: get_dataset2 Schemas ------- .. autofunction:: define_features_labels .. autofunction:: define_mini_immuno_schema_flexible Utilities --------- .. autofunction:: save_mini_immuno_datasets """ from __future__ import annotations from datetime import date from typing import TYPE_CHECKING, Literal import anndata as ad import pandas as pd if TYPE_CHECKING: from lamindb.models import Schema def define_features_labels() -> None: """Features & labels to validate the mini immuno datasets. .. literalinclude:: scripts/define_mini_immuno_features_labels.py :language: python """ from . import define_mini_immuno_features_labels # noqa def define_mini_immuno_schema_flexible() -> Schema: """Features & labels to validate the mini immuno datasets. .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py :language: python """ from lamindb.models import Schema define_features_labels() from . import define_mini_immuno_schema_flexible # noqa return Schema.get(name="Mini immuno schema") def save_mini_immuno_datasets(): """Save the two "mini immuno" datasets. .. literalinclude:: scripts/save_mini_immuno_datasets.py :language: python """ from . 
import save_mini_immuno_datasets # noqa def get_dataset1( otype: Literal["DataFrame", "AnnData"] = "DataFrame", gene_symbols_in_index: bool = False, with_typo: bool = False, with_cell_type_synonym: bool = False, with_cell_type_typo: bool = False, with_gene_typo: bool = False, with_outdated_gene: bool = False, with_wrong_subtype: bool = False, with_index_type_mismatch: bool = False, with_date_as_iso_string: bool = True, ) -> pd.DataFrame | ad.AnnData: """A small tabular dataset measuring expression & metadata.""" # define the data in the dataset # it's a mix of numerical measurements and observation-level metadata ifng = "IFNJ" if with_typo else "IFNG" thing = "ulabel_but_not_perturbation" if with_wrong_subtype else "DMSO" if gene_symbols_in_index: var_ids = ["CD8A", "CD4", "CD14" if not with_gene_typo else "GeneTypo"] else: var_ids = [ "ENSG00000153563", "ENSG00000010610", "ENSG00000170458" if not with_gene_typo else "GeneTypo" if not with_outdated_gene else "ENSG00000278198", ] abt_cell = ( "CD8-pos alpha-beta T cell" if with_cell_type_typo else "CD8-positive, alpha-beta T cell" ) dataset_dict = { var_ids[0]: [1, 2, 3], var_ids[1]: [3, 4, 5], var_ids[2]: [5, 6, 7], "perturbation": pd.Categorical(["DMSO", ifng, thing]), "sample_note": ["was ok", "looks naah", "pretty! 🤩"], "cell_type_by_expert": pd.Categorical( ["B-cell" if with_cell_type_synonym else "B cell", abt_cell, abt_cell] ), "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]), "assay_oid": pd.Categorical(["EFO:0008913", "EFO:0008913", "EFO:0008913"]), "concentration": ["0.1%", "200 nM", "0.1%"], "treatment_time_h": [24, 24, 6], "donor": ["D0001", "D0002", None], "donor_ethnicity": [ ["Chinese", "Singaporean Chinese"], ["Chinese", "Han Chinese"], ["Chinese"], ], } # define the dataset-level metadata metadata = { "temperature": 21.6, "experiment": "Experiment 1", "date_of_study": "2024-12-01" if with_date_as_iso_string else date(2024, 12, 1), "study_note": "We had a great time performing this study and the results look compelling.", } # the dataset as DataFrame dataset_df = pd.DataFrame( dataset_dict, index=["sample1", "sample2", 0] # type: ignore if with_index_type_mismatch else ["sample1", "sample2", "sample3"], ) if otype == "DataFrame": for key, value in metadata.items(): dataset_df.attrs[key] = value return dataset_df else: del dataset_df[ "donor_ethnicity" ] # remove the donor_ethnicity because AnnData save will error dataset_ad = ad.AnnData( dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata ) return dataset_ad def get_dataset2( otype: Literal["DataFrame", "AnnData"] = "DataFrame", gene_symbols_in_index: bool = False, with_date_as_iso_string: bool = True, ) -> pd.DataFrame | ad.AnnData: """A second small tabular dataset measuring expression & metadata.""" if gene_symbols_in_index: var_ids = ["CD8A", "CD4", "CD38"] else: var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"] dataset_dict = { var_ids[0]: [2, 3, 3], var_ids[1]: [3, 4, 5], var_ids[2]: [4, 2, 3], "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]), "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]), "concentration": ["0.1%", "200 nM", "0.1%"], "treatment_time_h": [24, 24, 6], "donor": ["D0003", "D0003", "D0004"], } metadata = { "temperature": 22.6, "experiment": "Experiment 2", "date_of_study": "2025-02-13" if with_date_as_iso_string else date(2025, 2, 13), } dataset_df = pd.DataFrame( dataset_dict, index=["sample4", "sample5", "sample6"], ) ad.AnnData( dataset_df[var_ids], 
obs=dataset_df[["perturbation", "cell_type_by_model"]], ) if otype == "DataFrame": for key, value in metadata.items(): dataset_df.attrs[key] = value return dataset_df else: dataset_ad = ad.AnnData( dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata ) return dataset_ad ================================================ FILE: lamindb/examples/datasets/save_mini_immuno_datasets.py ================================================ from datetime import date import bionty as bt import lamindb as ln ## define valid labels ln.Record.from_values(["DMSO", "IFNG"], create=True).save() ln.Record.from_values(["Experiment 1", "Experiment 2"], create=True).save() bt.CellType.from_values(["B cell", "T cell"]).save() # observation-level metadata ln.Feature(name="perturbation", dtype=ln.Record).save() ln.Feature(name="sample_note", dtype=str).save() ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save() ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save() # dataset-level metadata ln.Feature(name="temperature", dtype=float).save() ln.Feature(name="experiment", dtype=ln.Record).save() ln.Feature(name="date_of_study", dtype=date, coerce=True).save() ln.Feature(name="study_note", dtype=str).save() ln.Feature(name="study_metadata", dtype=dict).save() schema = ln.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs() ## Ingest dataset1 adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") artifact = ln.Artifact.from_anndata( adata, key="examples/dataset1.h5ad", schema=schema, ).save() adhoc = {"study_metadata": {"detail1": "123", "detail2": 1}} dataset_metadata = adata.uns dataset_metadata.update(adhoc) artifact.features.add_values(dataset_metadata) # type: ignore # Ingest dataset2 adata2 = ln.examples.datasets.mini_immuno.get_dataset2(otype="AnnData") artifact2 = ln.Artifact.from_anndata( adata2, key="examples/dataset2.h5ad", schema=schema, ).save() adhoc2 = {"study_metadata": {"detail1": "456", "detail2": 2}} dataset_metadata2 = adata2.uns dataset_metadata2.update(adhoc2) artifact2.features.add_values(dataset_metadata2) # type: ignore ================================================ FILE: lamindb/examples/fixtures/__init__.py ================================================ ================================================ FILE: lamindb/examples/fixtures/sheets.py ================================================ import bionty as bt import pandas as pd import pytest import lamindb as ln @pytest.fixture(scope="module") def populate_sheets_compound_treatment(): # Compounds --------------------------- compound_type = ln.Record(name="Compound", is_type=True).save() # features for compounds structure = ln.Feature(name="structure", dtype="str").save() # drug1 drug1 = ln.Record(name="drug1", type=compound_type).save() ln.models.RecordJson(record=drug1, feature=structure, value="12345").save() # drug2 drug2 = ln.Record(name="drug2", type=compound_type).save() ln.models.RecordJson(record=drug2, feature=structure, value="45678").save() # Treatments --------------------------- treatment_type = ln.Record(name="Treatment", is_type=True).save() # features for treatments compound = ln.Feature(name="compound", dtype=compound_type).save() concentration = ln.Feature(name="concentration", dtype="num").save() # a sheet for treatments treatments_sheet = ln.Record( name="My treatments 2025-05", type=treatment_type, is_type=True ).save() # sheet without validating schema # populate treatment1 treatment1 = ln.Record(name="treatment1", type=treatments_sheet).save() 
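# link treatment1 to drug1 through the "compound" feature (a record-to-record link) and store the scalar concentration as a JSON value; the asserts check that the record link is traversable from both sides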
ln.models.RecordRecord(record=treatment1, feature=compound, value=drug1).save() assert drug1 in treatment1.linked_records.all() assert treatment1 in drug1.linked_in_records.all() ln.models.RecordJson(record=treatment1, feature=concentration, value="2nM").save() # populate treatment2 treatment2 = ln.Record(name="treatment2", type=treatments_sheet).save() ln.models.RecordRecord(record=treatment2, feature=compound, value=drug2).save() ln.models.RecordJson(record=treatment2, feature=concentration, value="4nM").save() # Samples --------------------------- # features named id, uid or name conflict with django field names, we test them here id_feature = ln.Feature(name="id", dtype=int).save() uid_feature = ln.Feature(name="uid", dtype=str).save() name_feature = ln.Feature(name="name", dtype=str).save() project = ln.Feature(name="project", dtype=ln.Project).save() project1 = ln.Project(name="Project 1").save() sample_type = ln.Record(name="BioSample", is_type=True).save() treatment = ln.Feature(name="treatment", dtype=treatment_type).save() cell_line = ln.Feature(name="cell_line", dtype=bt.CellLine).save() preparation_date = ln.Feature(name="preparation_date", dtype="datetime").save() cell_line._dtype_str = ( "cat[bionty.CellLine]" # might have previously been set to "cat" ) cell_line.save() sample_schema1 = ln.Schema( name="My samples schema 2025-06", features=[ id_feature, uid_feature, name_feature, treatment, cell_line, preparation_date, project, ], ).save() sample_sheet1 = ln.Record( name="My samples 2025-06", schema=sample_schema1, type=sample_type ).save() # values for cell lines hek293t = bt.CellLine.from_source("HEK293T").save() # populate sample1 sample1 = ln.Record(name="sample1", type=sample_sheet1).save() ln.models.RecordJson(record=sample1, feature=id_feature, value=1).save() ln.models.RecordJson(record=sample1, feature=uid_feature, value="S1").save() ln.models.RecordJson(record=sample1, feature=name_feature, value="Sample 1").save() ln.models.RecordRecord(record=sample1, feature=treatment, value=treatment1).save() bt.models.RecordCellLine(record=sample1, feature=cell_line, value=hek293t).save() ln.models.RecordJson( record=sample1, feature=preparation_date, value="2025-06-01T05:00:00" ).save() ln.models.RecordProject(record=sample1, feature=project, value=project1).save() # populate sample2 sample2 = ln.Record(name="sample2", type=sample_sheet1).save() ln.models.RecordJson(record=sample2, feature=id_feature, value=2).save() ln.models.RecordJson(record=sample2, feature=uid_feature, value="S2").save() ln.models.RecordJson(record=sample2, feature=name_feature, value="Sample 2").save() ln.models.RecordRecord(record=sample2, feature=treatment, value=treatment2).save() bt.models.RecordCellLine(record=sample2, feature=cell_line, value=hek293t).save() ln.models.RecordJson( record=sample2, feature=preparation_date, value="2025-06-01T06:00:00" ).save() ln.models.RecordProject(record=sample2, feature=project, value=project1).save() # another sheet for samples sample_note = ln.Feature(name="sample_note", dtype="str").save() sample_schema2 = ln.Schema( name="My samples schema 2025-07", features=[treatment, cell_line, sample_note, project], ).save() # the sheet sample_sheet2 = ln.Record( name="My samples 2025-07", schema=sample_schema2, type=sample_type ).save() # populate sample3 sample3 = ln.Record(type=sample_sheet2).save() # no name ln.models.RecordRecord(record=sample3, feature=treatment, value=treatment1).save() bt.models.RecordCellLine(record=sample3, feature=cell_line, 
value=hek293t).save() ln.models.RecordJson( record=sample3, feature=preparation_date, value="2025-06-02T05:00:00Z" ).save() ln.models.RecordProject(record=sample3, feature=project, value=project1).save() # populate sample4 sample4 = ln.Record(type=sample_sheet2).save() ln.models.RecordRecord(record=sample4, feature=treatment, value=treatment2).save() bt.models.RecordCellLine(record=sample4, feature=cell_line, value=hek293t).save() ln.models.RecordJson( record=sample4, feature=preparation_date, value="2025-06-02T06:00:00Z" ).save() ln.models.RecordProject(record=sample4, feature=project, value=project1).save() yield treatments_sheet, sample_sheet1 sample4.delete(permanent=True) sample3.delete(permanent=True) sample_sheet2.delete(permanent=True) sample_schema2.delete(permanent=True) sample_note.delete(permanent=True) sample2.delete(permanent=True) sample1.delete(permanent=True) # hek293t.delete(permanent=True) # not for now sample_sheet1.delete(permanent=True) sample_schema1.delete(permanent=True) preparation_date.delete(permanent=True) cell_line.delete(permanent=True) # sample_type.delete(permanent=True) # not for now treatment2.delete(permanent=True) treatment1.delete(permanent=True) treatments_sheet.delete(permanent=True) treatment_type.delete(permanent=True) concentration.delete(permanent=True) drug2.delete(permanent=True) drug1.delete(permanent=True) structure.delete(permanent=True) compound.delete(permanent=True) compound_type.delete(permanent=True) @pytest.fixture(scope="module") def populate_nextflow_sheet_with_samples(): # Biosample schema and type samples_schema = ln.Schema( name="Biosample test schema", features=[ ln.Feature(name="species", dtype="cat[bionty.Organism]").save(), ln.Feature(name="cell_type", dtype="cat[bionty.CellType]").save(), ln.Feature(name="tissue", dtype="cat[bionty.Tissue]").save(), ], ).save() biosample_type = ln.Record(name="BioSample", is_type=True).save() # Biosamples sheet samples_sheet = ln.Record( name="My samples 2025-04", schema=samples_schema, type=biosample_type ).save() sample_x = ln.Record(name="Sample_X", type=samples_sheet).save() sample_y = ln.Record(name="Sample_Y", type=samples_sheet).save() organism_human = bt.Organism.from_source(name="human").save() celltype_tcell = bt.CellType.from_source(name="T cell").save() tissue_blood = bt.Tissue.from_source(name="blood").save() features = ln.Feature.lookup() for sample in [sample_x, sample_y]: bt.models.RecordOrganism( record=sample, feature=features.species, value=organism_human ).save() bt.models.RecordCellType( record=sample, feature=features.cell_type, value=celltype_tcell ).save() bt.models.RecordTissue( record=sample, feature=features.tissue, value=tissue_blood ).save() # Nextflow samplesheet schema nextflow_schema = ln.Schema( name="RNA-seq standard", features=[ ln.Feature(name="sample", dtype=biosample_type).save(), ln.Feature(name="fastq_1", dtype=str).save(), ln.Feature(name="fastq_2", dtype=str).save(), ln.Feature(name="expected_cells", dtype=int).save(), ln.Feature(name="seq_center", dtype=str).save().with_config(optional=True), ], ordered_set=True, ).save() nextflowsample_type = ln.Record(name="NextflowSample", is_type=True).save() nextflow_sheet = ln.Record( schema=nextflow_schema, name="RNA-seq nextflow samplesheet 001", type=nextflowsample_type, is_type=True, ).save() sample_data = { "sample": ["Sample_X", "Sample_Y", "Sample_Y"], "fastq_1": [ "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz", 
"https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R1_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R1_001.fastq.gz", ], "fastq_2": [ "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R2_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R2_001.fastq.gz", ], "expected_cells": [5000, 5000, 5000], } df = pd.DataFrame(sample_data) features = ln.Feature.lookup() nextflow_samples = [] for _, row in df.iterrows(): sample = ln.Record(type=nextflow_sheet).save() nextflow_samples.append(sample) ln.models.RecordRecord( record=sample, feature=features.sample, value=ln.Record.get(name=row["sample"]), ).save() ln.models.RecordJson( record=sample, feature=features.fastq_1, value=row["fastq_1"] ).save() ln.models.RecordJson( record=sample, feature=features.fastq_2, value=row["fastq_2"] ).save() ln.models.RecordJson( record=sample, feature=features.expected_cells, value=row["expected_cells"] ).save() yield nextflow_sheet # Delete in reverse order of creation # Delete nextflow samples for sample in reversed(nextflow_samples): sample.delete(permanent=True) # Delete nextflow sheet and schema nextflow_sheet.delete(permanent=True) nextflowsample_type.delete(permanent=True) nextflow_schema.delete(permanent=True) # Delete samples sheet and schema samples_sheet.records.all().delete(permanent=True) samples_sheet.delete(permanent=True) # biosample_type.delete(permanent=True) # not for now (shared with first fixture) samples_schema.delete(permanent=True) print(ln.Schema.to_dataframe()) # Delete nextflow schema features features = ln.Feature.lookup() features.seq_center.delete(permanent=True) features.expected_cells.delete(permanent=True) features.fastq_2.delete(permanent=True) features.fastq_1.delete(permanent=True) features.sample.delete(permanent=True) # Delete biosamples sample_y.delete(permanent=True) sample_x.delete(permanent=True) # Delete biosample schema features features.tissue.delete(permanent=True) features.cell_type.delete(permanent=True) features.species.delete(permanent=True) # Note: organism_human, celltype_tcell, tissue_blood are from bionty # and might be shared, so not deleting them (similar to hek293t in first fixture) ================================================ FILE: lamindb/examples/mlflow/__init__.py ================================================ """Examples and utilities for Mlflow. .. autofunction:: save_mlflow_features """ import lamindb as ln def save_mlflow_features(): """Saves all MLflow experiment and run related features. 
Saves the following features: - mlflow_run_id - mlflow_run_name - mlflow_experiment_id - mlflow_experiment_name - mlflow_user_id - mlflow_status - mlflow_lifecycle_stage - mlflow_artifact_uri - mlflow_start_time - mlflow_end_time """ mlflow_type = ln.Feature(name="MLflow", is_type=True).save() ln.Feature(name="mlflow_run_id", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_run_name", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_experiment_id", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_experiment_name", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_user_id", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_status", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_lifecycle_stage", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_artifact_uri", dtype=str, type=mlflow_type).save() ln.Feature(name="mlflow_start_time", dtype=int, type=mlflow_type).save() ln.Feature(name="mlflow_end_time", dtype=int, type=mlflow_type).save() ================================================ FILE: lamindb/examples/schemas/__init__.py ================================================ """Example schemas. .. autofunction:: valid_features .. autofunction:: anndata_ensembl_gene_ids_and_valid_features_in_obs """ from ._anndata import anndata_ensembl_gene_ids_and_valid_features_in_obs from ._simple import valid_features ================================================ FILE: lamindb/examples/schemas/_anndata.py ================================================ from __future__ import annotations import importlib from typing import TYPE_CHECKING if TYPE_CHECKING: from ... import Schema def anndata_ensembl_gene_ids_and_valid_features_in_obs() -> Schema: """An `AnnData` schema validating Ensembl gene IDs and valid features in obs. .. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py :language: python """ from ... import Schema try: return Schema.get(name="anndata_ensembl_gene_ids_and_valid_features_in_obs") except Schema.DoesNotExist: from . import define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs # noqa try: return Schema.get(name="anndata_ensembl_gene_ids_and_valid_features_in_obs") except Schema.DoesNotExist: importlib.reload( define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs ) return Schema.get(name="anndata_ensembl_gene_ids_and_valid_features_in_obs") ================================================ FILE: lamindb/examples/schemas/_simple.py ================================================ from __future__ import annotations import importlib from typing import TYPE_CHECKING if TYPE_CHECKING: from ... import Schema def valid_features() -> Schema: """A `DataFrame` schema that validates that columns map on existing features. .. literalinclude:: scripts/define_valid_features.py :language: python """ from ... import Schema try: return Schema.get(name="valid_features") except Schema.DoesNotExist: try: from . 
import define_valid_features # noqa return Schema.get(name="valid_features") except Schema.DoesNotExist: importlib.reload(define_valid_features) return Schema.get(name="valid_features") ================================================ FILE: lamindb/examples/schemas/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py ================================================ import bionty as bt import lamindb as ln obs_schema = ln.examples.schemas.valid_features() varT_schema = ln.Schema( name="valid_ensembl_gene_ids", itype=bt.Gene.ensembl_gene_id ).save() schema = ln.Schema( name="anndata_ensembl_gene_ids_and_valid_features_in_obs", otype="AnnData", slots={"obs": obs_schema, "var.T": varT_schema}, ).save() ================================================ FILE: lamindb/examples/schemas/define_valid_features.py ================================================ import lamindb as ln schema = ln.Schema(name="valid_features", itype=ln.Feature).save() ================================================ FILE: lamindb/examples/wandb/__init__.py ================================================ """Examples and utilities for Weights & Biases. .. autofunction:: save_wandb_features """ import lamindb as ln def save_wandb_features(): """Saves all Weights & Biases project and run related features. Saves the following features: - wandb_run_id - wandb_run_name - wandb_run_entity - wandb_project - wandb_state - wandb_url - wandb_tags - wandb_group - wandb_job_type - timestamp - runtime """ wandb_type = ln.Feature(name="Weights & Biases", is_type=True).save() ln.Feature(name="wandb_run_id", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_run_name", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_run_entity", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_project", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_state", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_url", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_tags", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_group", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_job_type", dtype=str, type=wandb_type).save() ln.Feature(name="wandb_timestamp", dtype=float, type=wandb_type).save() ln.Feature(name="wandb_runtime", dtype=float, type=wandb_type).save() ================================================ FILE: lamindb/integrations/__init__.py ================================================ """Integrations. Modules ------- .. autosummary:: :toctree: . lightning Functions --------- .. autofunction:: save_vitessce_config .. autofunction:: save_tiledbsoma_experiment .. 
autofunction:: curate_from_croissant """ from ._croissant import curate_from_croissant from ._vitessce import save_vitessce_config __all__ = [ "lightning", "save_tiledbsoma_experiment", "curate_from_croissant", "save_vitessce_config", ] def __getattr__(name: str): """Lazy-import save_tiledbsoma_experiment to avoid loading storage at package import.""" if name == "save_tiledbsoma_experiment": from lamindb.core.storage import save_tiledbsoma_experiment return save_tiledbsoma_experiment raise AttributeError(f"module {__name__!r} has no attribute {name!r}") ================================================ FILE: lamindb/integrations/_croissant.py ================================================ from __future__ import annotations import json from pathlib import Path from typing import TYPE_CHECKING, Any import lamindb_setup as ln_setup from lamin_utils import logger from lamindb_setup.core.upath import UPath if TYPE_CHECKING: from lamindb_setup.types import AnyPathStr import lamindb as ln def curate_from_croissant( croissant_data: AnyPathStr | dict[str, Any], run: ln.Run | None = None, ) -> ln.Artifact | ln.Collection: """Create annotated artifacts from a CroissantML file. Returns a collection if multiple files are found in `croissant_data`, otherwise a single artifact. Args: croissant_data: Path to CroissantML JSON file or dictionary. Example: :: artifact = ln.integrations.curate_from_croissant("dataset_metadata.json") """ import lamindb as ln from ..models.artifact import check_path_in_existing_storage # Load CroissantML data if isinstance(croissant_data, (str, Path, UPath)): croissant_path = UPath(croissant_data) if not croissant_path.exists(): raise FileNotFoundError(f"File not found: {croissant_data}") with croissant_path.open(encoding="utf-8") as f: data = json.load(f) elif isinstance(croissant_data, dict): data = croissant_data else: raise ValueError( "croissant_data must be a file path, JSON string, or dictionary" ) # Validate basic structure if data.get("@type") != "Dataset": raise ValueError("CroissantML @type must be 'Dataset'") if "name" not in data: raise ValueError("CroissantML must have a 'name' field") # Extract basic metadata dataset_name = data["name"] description = data.get("description", None) version = data.get("version", None) license_info = data.get("license", None) project_name = data.get("cr:projectName", None) # Create license feature and label if license info exists license_label = None if license_info: license_label_type = ln.ULabel.filter(name="License", is_type=True).first() if not license_label_type: license_label_type = ln.ULabel(name="License", is_type=True).save() license_label = ln.ULabel.filter(name=license_info).first() if not license_label: license_label = ln.ULabel( name=license_info, description="Dataset license", type=license_label_type, ).save() project_label = None if project_name: project_label = ln.Project.filter(name=project_name).first() if not project_label: project_label = ln.Project(name=project_name).save() # Extract file distributions artifacts = [] file_distributions = data.get("distribution", []) if not file_distributions: raise ValueError("No file distributions found in croissant data") for dist in file_distributions: file_id = dist.get("@id", "") if UPath(file_id).exists(): file_path = file_id else: content_url = dist.get("contentUrl", "") file_path = content_url or data.get("url", "") if not file_path: raise ValueError(f"No file path found in croissant distribution: {dist}") if not UPath(file_path).exists(): raise ValueError(f"Inferred 
file path does not exist: {file_path}") result = check_path_in_existing_storage( file_path, check_hub_register_storage=ln_setup.settings.instance.is_on_hub ) if isinstance(result, ln.Storage): key = None # will automatically use existing storage key else: current_storage_location = ( ln.settings.storage if not ln.setup.settings.instance.keep_artifacts_local else ln.settings.local_storage ) logger.warning( f"file path {file_path} is not part of a known storage location, will be duplicated to: {current_storage_location}" ) key = file_id if len(file_distributions) == 1: # it doesn't make sense to have the dataset name on the individual # artifact if it's part of a collection artifact_description = dataset_name if description is not None: artifact_description += f" - {description}" else: artifact_description = None artifact = ln.Artifact( # type: ignore file_path, key=key, description=artifact_description, version=version, kind="dataset", run=run, ).save() if license_label: artifact.ulabels.add(license_label) if project_label: artifact.projects.add(project_label) artifacts.append(artifact) if len(artifacts) == 1: return artifacts[0] else: collection = ln.Collection( # type: ignore artifacts, key=dataset_name, description=description, version=version ).save() if license_label: collection.ulabels.add(license_label) if project_label: collection.projects.add(project_label) return collection ================================================ FILE: lamindb/integrations/_vitessce.py ================================================ from __future__ import annotations import json from datetime import datetime, timezone from typing import TYPE_CHECKING import lamindb_setup as ln_setup from lamin_utils import logger from lamindb.models.artifact import Artifact from lamindb.models.collection import Collection from lamindb.models.run import Run from lamindb.models.transform import Transform if TYPE_CHECKING: from vitessce import VitessceConfig # "unit test": https://github.com/laminlabs/lamindb/blob/main/docs/storage/vitessce.ipynb # integration test & context: https://github.com/laminlabs/lamin-spatial/blob/main/docs/vitessce.ipynb def save_vitessce_config( vitessce_config: VitessceConfig, key: str | None = None, description: str | None = None, ) -> Artifact: """Validates and saves a `VitessceConfig` object. If the `VitessceConfig` object references multiple artifacts, automatically creates a `Collection` and displays the "Vitessce button" next to it. The `VitessceConfig` artifact has `.suffix = ".vitessce.json"` and `.kind = "__lamindb_config__"`, which is by default hidden on the hub UI. Guide: :doc:`docs:vitessce`. Args: vitessce_config: A `VitessceConfig` object. key: A `key` for the `VitessceConfig` artifact. description: A `description` for the `VitessceConfig` artifact. Is additionally used as `key` for a `Collection` in case the `VitessceConfig` object references multiple artifacts.
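Example: a minimal sketch; building the full `VitessceConfig` (which must reference at least one lamindb-backed artifact) is abbreviated, and `schema_version`, `name`, and `key` are placeholders::

    import lamindb as ln
    from vitessce import VitessceConfig

    vc = VitessceConfig(schema_version="1.0.15", name="My dataset view")
    # ... add datasets whose files are backed by lamindb artifacts ...
    config_artifact = ln.integrations.save_vitessce_config(vc, key="my_view.vitessce.json")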
""" # can only import here because vitessce is not a dependency from vitessce import VitessceConfig assert isinstance(vitessce_config, VitessceConfig) # noqa: S101 vc_dict = vitessce_config.to_dict() try: url_to_artifact_dict = vitessce_config.get_artifacts() except AttributeError as e: raise SystemExit( "save_vitessce_config() requires vitessce>=3.4.0: pip install vitessce>=3.4.0" ) from e dataset_artifacts = list(url_to_artifact_dict.values()) message = "\n".join([artifact.__repr__() for artifact in dataset_artifacts]) logger.important(f"VitessceConfig references these artifacts:\n{message}") assert len(dataset_artifacts) > 0 # noqa: S101 # the below will be replaced with a `ln.step()` decorator soon transform = Transform( # type: ignore uid="kup03MJBsIVa0002", key="save_vitessce_config", type="function", version="3", ).save() run = Run(transform=transform).save() run.input_artifacts.set(dataset_artifacts) collection = None if len(dataset_artifacts) > 1: # if we have more datasets, we should create a collection # and attach an action to the collection # consicious use of description for key, see here # https://github.com/laminlabs/lamindb/pull/2997 collection = Collection(dataset_artifacts, key=description).save() # create a JSON export config_file_local_path = ln_setup.settings.cache_dir / "config.vitessce.json" with open(config_file_local_path, "w") as file: json.dump(vc_dict, file) vitessce_config_artifact = Artifact( config_file_local_path, key=key, description=description, run=run, kind="__lamindb_config__", ).save() slug = ln_setup.settings.instance.slug logger.important( f"VitessceConfig: https://lamin.ai/{slug}/artifact/{vitessce_config_artifact.uid}" ) if collection is None: # we have one and only one dataset artifact, hence the following line is OK dataset_artifacts[0]._actions.add(vitessce_config_artifact) logger.important( f"Dataset: https://lamin.ai/{slug}/artifact/{dataset_artifacts[0].uid}" ) else: collection._actions.add(vitessce_config_artifact) logger.important( f"Collection: https://lamin.ai/{slug}/collection/{collection.uid}" ) run.finished_at = datetime.now(timezone.utc) run.save() return vitessce_config_artifact ================================================ FILE: lamindb/integrations/lightning.py ================================================ """PyTorch Lightning integration for LaminDB. The public API has two layers: - :class:`Checkpoint` is the concrete LaminDB implementation that persists checkpoint, config, and `hparams.yaml` files as :class:`~lamindb.Artifact` objects and annotates them with :class:`~lamindb.Feature` objects. - :class:`ArtifactPublishingModelCheckpoint` is the generic extension layer adding checkpoint artifact lifecycle hooks without implementing Lamin persistence details yet. External integrations can either subclass :class:`Checkpoint` directly or attach an :class:`ArtifactObserver` to react to saved and removed artifacts. Here is a guide: :doc:`lightning`. Main API -------- .. autoclass:: Checkpoint .. autofunction:: save_lightning_features Auxiliary classes ----------------- .. autoclass:: ArtifactPublishingModelCheckpoint .. autoclass:: SaveConfigCallback .. autoclass:: ArtifactSavedEvent .. 
autoclass:: ArtifactRemovedEvent """ from __future__ import annotations import warnings from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any, Final, Literal, Protocol import lightning.pytorch as pl from lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint from lightning.pytorch.cli import SaveConfigCallback as _SaveConfigCallback import lamindb as ln from lamindb.models.artifact import track_run_input if TYPE_CHECKING: from datetime import timedelta from lightning.fabric.utilities.types import _PATH _RUN_AUTO_FEATURES: Final = frozenset( { "logger_name", "logger_version", "max_epochs", "max_steps", "precision", "accumulate_grad_batches", "gradient_clip_val", "monitor", "mode", } ) _ARTIFACT_AUTO_FEATURES: Final = frozenset( { "is_best_model", "is_last_model", "score", "model_rank", "save_weights_only", "monitor", "mode", } ) _SUPPORTED_AUTO_FEATURES: Final = _RUN_AUTO_FEATURES | _ARTIFACT_AUTO_FEATURES ArtifactKind = Literal["checkpoint", "config", "hparams"] @dataclass(frozen=True) class ArtifactEvent: """Common metadata emitted when a checkpoint-related artifact changes. The event records the logical artifact key, the local path Lightning wrote, and the trainer that triggered the lifecycle event. """ kind: ArtifactKind key: str local_path: Path trainer: pl.Trainer @dataclass(frozen=True) class ArtifactSavedEvent(ArtifactEvent): """Metadata emitted after a checkpoint-related artifact has been persisted. `artifact` is intentionally typed generically so downstream integrations can expose their own persisted object while still using the common lifecycle API. `storage_uri` is the stable hand-off value for registries such as ClearML. """ artifact: Any storage_uri: str @dataclass(frozen=True) class ArtifactRemovedEvent(ArtifactEvent): """Metadata emitted after a local checkpoint file has been removed. Removal currently applies to checkpoint files. Config and hparams artifacts are save-only in the current Lightning integration. """ artifact: Any | None = None storage_uri: str | None = None class ArtifactObserver(Protocol): """Observer notified about checkpoint artifact lifecycle events. This is the preferred composition hook for downstream integrations that need to register checkpoints elsewhere after Lamin persistence completes. """ def on_artifact_saved(self, event: ArtifactSavedEvent) -> None: ... def on_artifact_removed(self, event: ArtifactRemovedEvent) -> None: ... class ArtifactPublisher(Protocol): """Persistence backend for checkpoint-related artifacts. :class:`ArtifactPublishingModelCheckpoint` manages the artifact lifecycle, while publishers encapsulate backend-specific save behavior and storage URI resolution. """ def create_artifact( self, local_path: Path | str, *, key: str, description: str, kind: str | None = None, add_as_input_to_run: bool = False, skip_hash_lookup: bool = False, ) -> Any: ... def storage_uri(self, artifact: Any) -> str: ... class LaminArtifactPublisher: """Persist checkpoint-related artifacts into LaminDB. This service is intentionally separate from :class:`Checkpoint` so that the checkpoint callback can focus on Lightning behavior and feature handling while persistence details remain replaceable. 
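A minimal sketch (the checkpoint path, key, and description are illustrative placeholders)::

    publisher = LaminArtifactPublisher()
    artifact = publisher.create_artifact(
        "checkpoints/epoch=2-step=300.ckpt",
        key="models/epoch=2-step=300.ckpt",
        description="checkpoint written by the Lightning integration",
    )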
""" def create_artifact( self, local_path: Path | str, *, key: str, description: str, kind: str | None = None, add_as_input_to_run: bool = False, skip_hash_lookup: bool = False, ) -> ln.Artifact: artifact_kwargs: dict[str, Any] = {"key": key, "description": description} if kind is not None: artifact_kwargs["kind"] = kind if add_as_input_to_run: artifact_kwargs["run"] = False if skip_hash_lookup: artifact_kwargs["skip_hash_lookup"] = True artifact = ln.Artifact(local_path, **artifact_kwargs) artifact.save() if add_as_input_to_run: track_run_input(artifact, is_run_input=True) return artifact def storage_uri(self, artifact: ln.Artifact) -> str: return str(artifact.path) def save_lightning_features() -> None: """Save features to auto-track lightning parameters & metrics. Creates the following features under the `lamindb.lightning` feature type if they do not already exist: Artifact-level features: - `is_best_model` (bool): Whether this checkpoint is the best model. - `is_last_model` (bool): Whether this checkpoint is the most recently saved model. - `score` (float): The monitored metric score. - `model_rank` (int): Rank among all checkpoints (0 = best). - `save_weights_only` (bool): Whether this checkpoint only stores model weights. - `monitor` (str): Metric name this checkpoint uses for comparison. - `mode` (str): Optimization mode (`min` or `max`) used for checkpoint ranking. Run-level features: - `logger_name` (str): Name from the first Lightning logger. - `logger_version` (str): Version from the first Lightning logger. - `max_epochs` (int): Maximum number of epochs. - `max_steps` (int): Maximum number of training steps. - `precision` (str): Training precision (e.g., "32", "16-mixed", "bf16"). - `accumulate_grad_batches` (int): Number of batches to accumulate gradients over. - `gradient_clip_val` (float): Gradient clipping value. - `monitor` (str): Metric name being monitored. - `mode` (str): Optimization mode ("min" or "max"). Args: None. 
Example: Save the features to the database:: from lamindb.integrations import lightning as ll ll.save_lightning_features() """ # normal matching fails because of non-matching dtype (__lamindb_lightning__ vs None) if ( lightning_feature_type := ln.Feature.filter( name="lamindb.lightning" ).one_or_none() ) is None: lightning_feature_type = ln.Feature( # type: ignore[call-overload] name="lamindb.lightning", description="Auto-generated features tracking lightning parameters & metrics", is_type=True, ) lightning_feature_type._dtype_str = "__lamindb_lightning__" lightning_feature_type.save() ln.Feature(name="is_best_model", dtype=bool, type=lightning_feature_type).save() ln.Feature(name="is_last_model", dtype=bool, type=lightning_feature_type).save() ln.Feature(name="score", dtype=float, type=lightning_feature_type).save() ln.Feature(name="model_rank", dtype=int, type=lightning_feature_type).save() ln.Feature(name="logger_name", dtype=str, type=lightning_feature_type).save() ln.Feature(name="logger_version", dtype=str, type=lightning_feature_type).save() ln.Feature(name="max_epochs", dtype=int, type=lightning_feature_type).save() ln.Feature(name="max_steps", dtype=int, type=lightning_feature_type).save() ln.Feature(name="precision", dtype=str, type=lightning_feature_type).save() ln.Feature( name="accumulate_grad_batches", dtype=int, type=lightning_feature_type ).save() ln.Feature( name="gradient_clip_val", dtype=float, type=lightning_feature_type ).save() ln.Feature(name="monitor", dtype=str, type=lightning_feature_type).save() ln.Feature(name="save_weights_only", dtype=bool, type=lightning_feature_type).save() ln.Feature(name="mode", dtype=str, type=lightning_feature_type).save() class FeatureAnnotator: """Manages Lightning feature discovery, collection, and annotation. This helper encapsulates all feature-related state and logic used by :class:`Checkpoint`. It handles: - Validation of user-specified features at setup time - Discovery of auto-features created by :func:`save_lightning_features` - Collection of run-level and checkpoint-level feature values - Best-model flag management and model rank updates The annotator is decoupled from `ModelCheckpoint` state — checkpoint-specific values (`best_model_path`, `current_score`, `mode`, etc.) are passed as explicit arguments to collection methods. """ def __init__( self, features: dict[Literal["run", "artifact"], dict[str, Any]] | None = None, ) -> None: user_features = features or {} if invalid_keys := set(user_features) - {"run", "artifact"}: # type: ignore raise ValueError( f"Invalid feature keys: {invalid_keys}. Use 'run' and/or 'artifact'." ) self._run_features: dict[str, Any] = user_features.get("run", {}) self._artifact_features: dict[str, Any] = user_features.get("artifact", {}) self._auto_features: dict[str, ln.Feature] = {} self._hparam_features_available: set[str] = set() self._run_features_saved = False def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: """Validate user features and discover auto-features. Must be called during `Checkpoint.setup()` while `trainer.is_global_zero` is `True`. 
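# --- hedged example (not part of the source): after calling
# save_lightning_features(), the auto-features can be looked up by name, mirroring
# the filter calls used elsewhere in this module.
import lamindb as ln
from lamindb.integrations import lightning as ll

ll.save_lightning_features()
feature_type = ln.Feature.filter(name="lamindb.lightning", is_type=True).one_or_none()
score_feature = ln.Feature.filter(name="score").one_or_none()
assert feature_type is not None and score_feature is not None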
""" self._validate_user_features() self._attach_user_run_features() self._discover_auto_features() self._discover_hparam_features(trainer, pl_module) def _attach_user_run_features(self) -> None: """Attach user-specified run features to the active LaminDB run.""" if ln.context.run and self._run_features: ln.context.run.features.add_values(self._run_features) def _validate_user_features(self) -> None: """Ensure all user-specified feature names exist in the database.""" all_feature_names = set(self._run_features) | set(self._artifact_features) if not all_feature_names: return existing = set( ln.Feature.filter(name__in=all_feature_names).values_list("name", flat=True) ) missing = [n for n in all_feature_names if n not in existing] if missing: s = "s" if len(missing) > 1 else "" raise ValueError( f"Feature{s} {', '.join(missing)} missing. " f"Create {'them' if len(missing) > 1 else 'it'} first." ) def _discover_auto_features(self) -> None: """Load auto-features scoped to the `lamindb.lightning` feature type.""" lightning_feature_type = ln.Feature.filter( name="lamindb.lightning", is_type=True ).one_or_none() self._auto_features.clear() if lightning_feature_type is not None: self._auto_features = { f.name: f for f in ln.Feature.filter( name__in=_SUPPORTED_AUTO_FEATURES, type=lightning_feature_type, ) } def _discover_hparam_features( self, trainer: pl.Trainer, pl_module: pl.LightningModule ) -> None: """Find which hyperparameter names have matching Features in the DB.""" hparam_names = self._collect_hparam_names(pl_module, trainer.datamodule) self._hparam_features_available = ( set(ln.Feature.filter(name__in=hparam_names).values_list("name", flat=True)) if hparam_names else set() ) @staticmethod def _collect_hparam_names(*sources: Any) -> set[str]: """Gather hyperparameter names from one or more sources.""" names: set[str] = set() for source in sources: if source is not None and hasattr(source, "hparams") and source.hparams: names.update(source.hparams.keys()) return names def get(self, name: str) -> ln.Feature | None: """Return the typed auto-feature for *name*, or `None`.""" return self._auto_features.get(name) def _set(self, target: dict[str | ln.Feature, Any], name: str, value: Any) -> None: """Add *value* to *target* if the auto-feature *name* is tracked and *value* is not `None`.""" if (feature := self.get(name)) and value is not None: target[feature] = value def save_run_features( self, trainer: pl.Trainer, monitor: str | None, mode: str, ) -> None: """Collect and attach run-level features once per run. Idempotent — subsequent calls are no-ops. 
""" if not ln.context.run or self._run_features_saved: return run_features = self._collect_run_features(trainer, monitor, mode) if run_features: ln.context.run.features.add_values(run_features) self._run_features_saved = True def _collect_run_features( self, trainer: pl.Trainer, monitor: str | None, mode: str, ) -> dict[str | ln.Feature, Any]: """Build the dict of run-level feature values (pure, no DB writes).""" run_features: dict[str | ln.Feature, Any] = {} if trainer.loggers: self._set(run_features, "logger_name", trainer.loggers[0].name) version = trainer.loggers[0].version self._set( run_features, "logger_version", version if isinstance(version, str) else f"version_{version}", ) # Trainer config values self._add_trainer_config_features(run_features, trainer, monitor, mode) # Hyperparameters self._add_hparam_features( run_features, trainer.lightning_module, trainer.datamodule ) return run_features def _add_trainer_config_features( self, target: dict[str | ln.Feature, Any], trainer: pl.Trainer, monitor: str | None, mode: str, ) -> None: """Append trainer configuration values to *target*.""" self._set(target, "max_epochs", trainer.max_epochs) self._set(target, "max_steps", trainer.max_steps) self._set(target, "precision", str(trainer.precision)) self._set(target, "accumulate_grad_batches", trainer.accumulate_grad_batches) self._set(target, "gradient_clip_val", trainer.gradient_clip_val) self._set(target, "monitor", monitor) self._set(target, "mode", mode) def _add_hparam_features( self, target: dict[str | ln.Feature, Any], *sources: Any, ) -> None: """Append hyperparameter values from one or more sources to *target*.""" for source in sources: if source is None: continue if hasattr(source, "hparams") and source.hparams: for name, value in source.hparams.items(): if name in self._hparam_features_available: target[name] = value def collect_checkpoint_features( self, trainer: pl.Trainer, is_best: bool, current_score: Any | None, save_weights_only: bool, monitor: str | None, mode: str, ) -> dict[str | ln.Feature, Any]: """Collect feature values for a checkpoint artifact. All `ModelCheckpoint` state is passed as explicit arguments so the annotator stays decoupled from the callback class hierarchy. Does **not** mutate existing artifacts — call :meth:`clear_best_model_flags` or :meth:`clear_last_model_flags` separately when needed. 
""" feature_values: dict[str | ln.Feature, Any] = {} self._set(feature_values, "is_best_model", is_best) self._set(feature_values, "is_last_model", True) if current_score is not None: score = current_score if hasattr(score, "item"): score = score.item() self._set(feature_values, "score", float(score)) self._set(feature_values, "save_weights_only", save_weights_only) self._set(feature_values, "monitor", monitor) self._set(feature_values, "mode", mode) # User-specified artifact features for name, value in self._artifact_features.items(): if value is not None: feature_values[name] = value elif hasattr(trainer, name): feature_values[name] = getattr(trainer, name) elif name in trainer.callback_metrics: metric = trainer.callback_metrics[name] feature_values[name] = ( metric.item() if hasattr(metric, "item") else float(metric) ) return feature_values def clear_best_model_flags(self, checkpoint_key_prefix: str) -> None: """Set `is_best_model=False` on previous best checkpoints.""" self._clear_flagged_model_feature("is_best_model", checkpoint_key_prefix) def clear_last_model_flags(self, checkpoint_key_prefix: str) -> None: """Set `is_last_model=False` on previous latest checkpoints.""" self._clear_flagged_model_feature("is_last_model", checkpoint_key_prefix) def _clear_flagged_model_feature( self, feature_name: Literal["is_best_model", "is_last_model"], checkpoint_key_prefix: str, ) -> None: """Set a boolean model flag to `False` on previously flagged checkpoints.""" feature = self.get(feature_name) if feature is None: return feature_rows = self._get_artifact_feature_rows( {feature_name}, checkpoint_key_prefix ) artifact_ids = [ artifact_id for artifact_id, values in feature_rows.items() if values.get(feature_name) is True ] if not artifact_ids: return artifacts_by_id = {a.id: a for a in ln.Artifact.filter(id__in=artifact_ids)} for artifact_id in artifact_ids: if artifact_id not in artifacts_by_id: continue artifact = artifacts_by_id[artifact_id] artifact.features.remove_values(feature, value=True) artifact.features.add_values({feature: False}) def update_model_ranks(self, checkpoint_key_prefix: str, mode: str) -> None: """Re-rank all checkpoint artifacts under *checkpoint_key_prefix*.""" model_rank_feature = self.get("model_rank") if model_rank_feature is None: return feature_rows = self._get_artifact_feature_rows( {"score", "model_rank"}, checkpoint_key_prefix ) scored = [] for artifact_id, values in feature_rows.items(): if "score" in values: scored.append((values["score"], values.get("model_rank"), artifact_id)) scored.sort(key=lambda x: x[0], reverse=(mode == "max")) artifact_ids = [artifact_id for _, _, artifact_id in scored] artifacts_by_id = {a.id: a for a in ln.Artifact.filter(id__in=artifact_ids)} for rank, (_, old_rank, artifact_id) in enumerate(scored): if artifact_id not in artifacts_by_id: continue af = artifacts_by_id[artifact_id] if old_rank is not None: af.features.remove_values(model_rank_feature, value=old_rank) af.features.add_values({model_rank_feature: rank}) def _get_artifact_feature_rows( self, feature_names: set[str], checkpoint_key_prefix: str, ) -> dict[int, dict[str, Any]]: """Query feature values for checkpoint artifacts under *checkpoint_key_prefix*. Returns a dict keyed by artifact ID, where each value is a dict mapping feature name to its stored value. 
Example:: { 42: {"score": 0.95, "is_best_model": True}, 71: {"score": 0.87, "is_best_model": False, "model_rank": 1}, } """ feature_ids = [ feature.id for name in feature_names if (feature := self.get(name)) ] key_startswith = checkpoint_key_prefix + "/" if feature_ids: rows = ln.models.ArtifactJsonValue.filter( artifact__key__startswith=key_startswith, jsonvalue__feature_id__in=feature_ids, ).values_list("artifact_id", "jsonvalue__feature__name", "jsonvalue__value") else: rows = ln.models.ArtifactJsonValue.filter( artifact__key__startswith=key_startswith, jsonvalue__feature__name__in=feature_names, ).values_list("artifact_id", "jsonvalue__feature__name", "jsonvalue__value") result: dict[int, dict[str, Any]] = {} for artifact_id, feature_name, value in rows: if artifact_id not in result: result[artifact_id] = {} result[artifact_id][feature_name] = value return result class ArtifactPublishingModelCheckpoint(ModelCheckpoint): """ModelCheckpoint with observable artifact lifecycle hooks. This layer captures artifact kinds, observer registration, saved/removed events, latest artifact tracking, and key compatibility hooks. Concrete subclasses remain responsible for how artifacts are persisted. Subclasses are expected to implement: - :meth:`resolve_artifact_key` to map local files to logical artifact keys - :meth:`resolve_artifact_storage_uri` to expose a stable backend URI - :meth:`save_checkpoint_artifact`, :meth:`save_config_artifact`, and :meth:`save_hparams_artifact` to persist files :class:`SaveConfigCallback` only depends on this base class, which means a custom checkpoint callback can participate in config saving without inheriting from Lamin's concrete :class:`Checkpoint`. """ def __init__( self, *args: Any, artifact_observers: list[ArtifactObserver] | None = None, **kwargs: Any, ) -> None: super().__init__(*args, **kwargs) self._artifact_observers: list[ArtifactObserver] = list( artifact_observers or [] ) self._latest_artifacts: dict[ArtifactKind, Any | None] = { "checkpoint": None, "config": None, "hparams": None, } self._last_artifact_event: ArtifactSavedEvent | ArtifactRemovedEvent | None = ( None ) @property def last_checkpoint_artifact(self) -> Any | None: """The most recently saved checkpoint artifact handle.""" return self._latest_artifacts["checkpoint"] @property def last_config_artifact(self) -> Any | None: """The most recently saved config artifact handle.""" return self._latest_artifacts["config"] @property def last_hparams_artifact(self) -> Any | None: """The most recently saved hparams artifact handle.""" return self._latest_artifacts["hparams"] @property def last_artifact_event(self) -> ArtifactSavedEvent | ArtifactRemovedEvent | None: """The last artifact lifecycle event emitted by this callback.""" return self._last_artifact_event def get_last_artifact(self, kind: ArtifactKind) -> Any | None: """Return the most recently saved artifact for a given artifact kind.""" return self._latest_artifacts[kind] def add_artifact_observer(self, observer: ArtifactObserver) -> None: """Register an observer notified about artifact lifecycle events.""" self._artifact_observers.append(observer) def remove_artifact_observer(self, observer: ArtifactObserver) -> None: """Unregister a previously added artifact observer.""" self._artifact_observers.remove(observer) def resolve_artifact_storage_uri(self, artifact: Any) -> str: """Resolve the physical location for a persisted artifact.""" raise NotImplementedError def resolve_artifact_key( self, trainer: pl.Trainer, filepath: Path | str, kind: 
ArtifactKind, ) -> str: """Return the logical artifact key for a checkpoint-related file.""" raise NotImplementedError def _notify_artifact_saved( self, trainer: pl.Trainer, *, kind: ArtifactKind, key: str, artifact: Any, local_path: Path | str, ) -> ArtifactSavedEvent: event = ArtifactSavedEvent( kind=kind, key=key, local_path=Path(local_path), trainer=trainer, artifact=artifact, storage_uri=self.resolve_artifact_storage_uri(artifact), ) self._latest_artifacts[kind] = artifact self._last_artifact_event = event self.on_artifact_saved(event) self._notify_artifact_observers("on_artifact_saved", event) return event def _notify_artifact_removed( self, trainer: pl.Trainer, *, kind: ArtifactKind, key: str, local_path: Path | str, artifact: Any | None, ) -> ArtifactRemovedEvent: storage_uri = None if artifact is not None: storage_uri = self.resolve_artifact_storage_uri(artifact) event = ArtifactRemovedEvent( kind=kind, key=key, local_path=Path(local_path), trainer=trainer, artifact=artifact, storage_uri=storage_uri, ) self._last_artifact_event = event self.on_artifact_removed(event) self._notify_artifact_observers("on_artifact_removed", event) return event def _notify_artifact_observers( self, method_name: str, event: ArtifactSavedEvent | ArtifactRemovedEvent, ) -> None: for observer in tuple(self._artifact_observers): method = getattr(observer, method_name, None) if callable(method): method(event) def on_artifact_saved(self, event: ArtifactSavedEvent) -> None: """Hook for subclasses after an artifact has been saved.""" del event def on_artifact_removed(self, event: ArtifactRemovedEvent) -> None: """Hook for subclasses after a checkpoint file has been removed.""" del event def save_checkpoint_artifact( self, trainer: pl.Trainer, filepath: Path | str, *, feature_values: dict[str, Any] | None = None, ) -> Any: """Persist a checkpoint artifact and emit the corresponding event.""" del trainer, filepath, feature_values raise NotImplementedError def save_config_artifact(self, trainer: pl.Trainer, config_path: Path | str) -> Any: """Persist a config artifact and emit the corresponding event.""" del trainer, config_path raise NotImplementedError def save_hparams_artifact( self, trainer: pl.Trainer, hparams_path: Path | str ) -> Any | None: """Persist an hparams artifact and emit the corresponding event.""" del trainer, hparams_path raise NotImplementedError class Checkpoint(ArtifactPublishingModelCheckpoint): """A `ModelCheckpoint` that annotates `pytorch` `lightning` checkpoints. Extends `lightning`'s `ModelCheckpoint` with artifact creation & feature annotation. Each checkpoint is a separate artifact whose key is derived from either the explicit `dirpath` or the trainer's logger configuration. When `dirpath` is omitted (recommended), Lightning decides where to store checkpoints locally (typically `lightning_logs/version_N/checkpoints/`) and the artifact key is derived from the logger's `save_dir`, `name`, and `version`. When `dirpath` is provided, it is used directly as the key prefix. All artifacts are scoped under a single **base prefix**. Checkpoints (and `hparams.yaml`) live under `{base}/checkpoints/`; other artifacts (e.g. `config.yaml`) live directly under `{base}/`. Base prefix derivation (highest priority first): 1. `dirpath` provided → `{dirpath}` (logger is ignored for key purposes) 2. `dirpath` omitted, logger present → `{save_dir_basename}/{name}/{version}` 3. 
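# --- hedged example (not part of the source): subclassing the concrete Checkpoint
# and reacting in on_artifact_saved, as the class docstring suggests for secondary
# systems. The print call is a placeholder for e.g. a ClearML or MLflow client.
from lamindb.integrations.lightning import ArtifactSavedEvent, Checkpoint

class MirroringCheckpoint(Checkpoint):
    def on_artifact_saved(self, event: ArtifactSavedEvent) -> None:
        if event.kind == "checkpoint":
            # forward the stable storage URI to another registry (placeholder)
            print(f"mirror {event.key} from {event.storage_uri}")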
`dirpath` omitted, no logger → empty When `run_uid_is_version` is `True` (the default) and a Lamin run context is active, the run UID is incorporated into the base prefix: - Case 1/3: the run UID is appended as an extra path segment (e.g. `my/dir/{run_uid}`, or just `{run_uid}`). - Case 2: the logger's auto-incremented `version` is *replaced* by the run UID (`{save_dir_basename}/{name}/{run_uid}`). Resulting key layout (with run UID active):: {base}/checkpoints/epoch=0-step=100.ckpt {base}/checkpoints/hparams.yaml {base}/config.yaml If available in the database through `save_lightning_features()`, the following `lamindb.lightning` features are automatically tracked: - Artifact-level: `is_best_model`, `is_last_model`, `score`, `model_rank`, `save_weights_only`, `monitor`, `mode` - Run-level: `logger_name`, `logger_version`, `max_epochs`, `max_steps`, `precision`, `accumulate_grad_batches`, `gradient_clip_val`, `monitor`, `mode` Additionally, model hyperparameters (from `pl_module.hparams`) and datamodule hyperparameters (from `trainer.datamodule.hparams`) are captured if corresponding features exist. This is the concrete LaminDB implementation built on top of :class:`ArtifactPublishingModelCheckpoint`. Use it when you want LaminDB to be the persistence layer. For secondary systems such as ClearML, prefer attaching an :class:`ArtifactObserver` or subclassing :class:`Checkpoint` and reacting in :meth:`on_artifact_saved`. Args: dirpath: Directory for checkpoints. When provided, also used as the artifact key prefix. When omitted (recommended), Lightning picks the local directory and the key prefix is derived from the logger. features: Features to annotate runs and artifacts. Use "run" key for run-level features (static metadata). Use "artifact" key for artifact-level features (values can be static or None for auto-population from trainer metrics/attributes). monitor: Quantity to monitor for saving best checkpoint. verbose: Verbosity mode. save_last: Save a copy of the last checkpoint. save_top_k: Number of best checkpoints to keep. save_weights_only: Save only model weights (not optimizer state). mode: One of "min" or "max" for monitor comparison. auto_insert_metric_name: Include metric name in checkpoint filename. every_n_train_steps: Checkpoint every N training steps. train_time_interval: Checkpoint at time intervals. every_n_epochs: Checkpoint every N epochs. save_on_train_epoch_end: Run checkpointing at end of training epoch. enable_version_counter: Append version to filename to avoid collisions. run_uid_is_version: When `True` (default) and a Lamin run context is active, incorporate the run UID into the base prefix. For the logger case the logger's auto-incremented version is replaced; for the dirpath and no-logger cases the run UID is appended as an extra path segment. Prevents cross-run key collisions. artifact_observers: Optional observer objects notified when checkpoint, config, or hparams artifacts are saved or when checkpoint files are removed locally. Observers follow :class:`ArtifactObserver` and receive :class:`ArtifactSavedEvent` and :class:`ArtifactRemovedEvent`. 
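# --- hedged example (not part of the source): expected base prefixes and keys for
# the three derivation cases described above, with an illustrative run UID and
# run-UID scoping active.
run_uid = "9k2PqWv3x1Ab0000"  # placeholder UID
base_case_1 = f"deployments/my_model/{run_uid}"  # dirpath="deployments/my_model"
base_case_2 = f"logs/exp/{run_uid}"              # CSVLogger(save_dir="logs", name="exp"), no dirpath
base_case_3 = run_uid                            # no dirpath, no logger
checkpoint_key = f"{base_case_1}/checkpoints/epoch=0-step=100.ckpt"
hparams_key = f"{base_case_1}/checkpoints/hparams.yaml"
config_key = f"{base_case_1}/config.yaml"        # configs land directly under the base prefix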
Examples: Let Lightning decide where to store checkpoints (recommended):: import lightning as pl from lightning.pytorch.loggers import CSVLogger from lamindb.integrations import lightning as ll ll.save_lightning_features() callback = ll.Checkpoint(monitor="val_loss", save_top_k=3) logger = CSVLogger(save_dir="logs") trainer = pl.Trainer(callbacks=[callback], logger=logger) trainer.fit(model, dataloader) # Query checkpoints — key prefix is derived from the logger # e.g. "logs/lightning_logs/version_0/checkpoints/" ln.Artifact.filter(key__startswith=callback.checkpoint_key_prefix) Explicit `dirpath` for full control over the artifact key prefix:: callback = ll.Checkpoint( dirpath="deployments/my_model/", monitor="val_loss", save_top_k=3, ) trainer = pl.Trainer(callbacks=[callback]) trainer.fit(model, dataloader) # Query checkpoints ln.Artifact.filter(key__startswith=callback.checkpoint_key_prefix) Using the CLI:: # config.yaml trainer: callbacks: - class_path: lamindb.integrations.lightning.Checkpoint init_args: monitor: val_loss save_top_k: 3 # Run with: # python main.py fit --config config.yaml For more, see the guide: :doc:`lightning`. """ def __init__( self, dirpath: _PATH | None = None, *, features: dict[Literal["run", "artifact"], dict[str, Any]] | None = None, monitor: str | None = None, verbose: bool = False, save_last: bool | None = None, save_top_k: int = 1, save_weights_only: bool = False, mode: Literal["min", "max"] = "min", auto_insert_metric_name: bool = True, every_n_train_steps: int | None = None, train_time_interval: timedelta | None = None, every_n_epochs: int | None = None, save_on_train_epoch_end: bool | None = None, enable_version_counter: bool = True, run_uid_is_version: bool = True, artifact_observers: list[ArtifactObserver] | None = None, ) -> None: self._original_dirpath = dirpath super().__init__( dirpath=dirpath, monitor=monitor, verbose=verbose, save_last=save_last, save_top_k=save_top_k, save_weights_only=save_weights_only, mode=mode, auto_insert_metric_name=auto_insert_metric_name, every_n_train_steps=every_n_train_steps, train_time_interval=train_time_interval, every_n_epochs=every_n_epochs, save_on_train_epoch_end=save_on_train_epoch_end, enable_version_counter=enable_version_counter, artifact_observers=artifact_observers, ) self._feature_annotator = FeatureAnnotator(features) self._hparams_yaml_saved = False self._run_uid_is_version = run_uid_is_version self._trainer: pl.Trainer | None = None self._artifact_publisher: ArtifactPublisher = LaminArtifactPublisher() def setup( self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str ) -> None: """Validate user features and detect available auto-features.""" super().setup(trainer, pl_module, stage) self._trainer = trainer if self.save_last: warnings.warn( "save_last is not necessary with Lamin. Checkpoint metadata" " (is_best_model, is_last_model, model_rank, score) makes the latest checkpoint" " queryable without encoding this in the filename. Consider" " disabling save_last to avoid redundant checkpoint copies.", UserWarning, stacklevel=2, ) if trainer.is_global_zero: self._feature_annotator.setup(trainer, pl_module) def _base_prefix(self, trainer: pl.Trainer) -> str: """Compute the base artifact key prefix. The base prefix is the root namespace for all artifacts produced by this callback. Checkpoints live under `{base}/checkpoints/` and other files (config, hparams) directly under `{base}/`. Priority: explicit `dirpath` > logger > run UID > empty. 
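# --- hedged example (not part of the source): retrieving the persisted checkpoint
# after training, assuming `callback` is the Checkpoint instance from the examples
# above and trainer.fit(...) has completed. Artifact.cache() materializes a local
# copy; the query uses the key prefix exposed by the callback.
import lamindb as ln

ckpt_artifact = callback.last_checkpoint_artifact  # property from the base class
local_ckpt_path = ckpt_artifact.cache()            # local path to the .ckpt file
latest = (
    ln.Artifact.filter(key__startswith=callback.checkpoint_key_prefix)
    .order_by("-created_at")
    .first()
)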
""" run_uid = self._active_run_uid() if self._original_dirpath is not None: prefix = str(self._original_dirpath).rstrip("/") return f"{prefix}/{run_uid}" if run_uid else prefix if len(trainer.loggers) > 0: return self._logger_prefix(trainer, run_uid) return run_uid or "" def _active_run_uid(self) -> str | None: """Return the Lamin run UID when run-UID scoping is active.""" if self._run_uid_is_version and ln.context.run is not None: return ln.context.run.uid return None def _logger_prefix(self, trainer: pl.Trainer, run_uid: str | None) -> str: """Derive a key prefix from the trainer's first logger.""" assert trainer.loggers, "_logger_prefix requires at least one logger" logger = trainer.loggers[0] save_dir = logger.save_dir or trainer.default_root_dir name = str(logger.name).rstrip("/") if run_uid: version = run_uid else: version = logger.version version = version if isinstance(version, str) else f"version_{version}" return f"{Path(save_dir).name}/{name}/{version.rstrip('/')}" @property def base_prefix(self) -> str: """The base artifact key prefix for all artifacts from this callback. Checkpoints live under `{base_prefix}/checkpoints/` and configs directly under `{base_prefix}/`. Available after `setup()` has been called. """ assert self._trainer is not None, "base_prefix is only available after setup()" return self._base_prefix(self._trainer) @property def checkpoint_key_prefix(self) -> str: """The artifact key prefix used for checkpoint artifacts. Available after `setup()` has been called, for example once `trainer.fit()` has started. """ base = self.base_prefix return f"{base}/checkpoints" if base else "checkpoints" def resolve_artifact_storage_uri(self, artifact: ln.Artifact) -> str: """Resolve the physical artifact location for downstream registries. This is the stable abstraction external packages should use instead of reconstructing storage locations from Lamin internals. """ return self._artifact_publisher.storage_uri(artifact) def resolve_artifact_key( self, trainer: pl.Trainer, filepath: Path | str, kind: ArtifactKind, ) -> str: """Return the Lamin artifact key for a checkpoint-related file.""" base = self._base_prefix(trainer) if kind in {"checkpoint", "hparams"}: prefix = f"{base}/checkpoints" if base else "checkpoints" else: prefix = base if prefix: return f"{prefix}/{Path(filepath).name}" return Path(filepath).name def _create_lamin_artifact( self, local_path: Path | str, *, key: str, description: str, kind: str | None = None, add_as_input_to_run: bool = False, skip_hash_lookup: bool = False, ) -> ln.Artifact: return self._artifact_publisher.create_artifact( local_path, key=key, description=description, kind=kind, add_as_input_to_run=add_as_input_to_run, skip_hash_lookup=skip_hash_lookup, ) self._feature_annotator.clear_last_model_flags(self.checkpoint_key_prefix) def save_checkpoint_artifact( self, trainer: pl.Trainer, filepath: Path | str, *, feature_values: dict[str | ln.Feature, Any] | None = None, ) -> ln.Artifact: """Save a checkpoint artifact to Lamin and emit the corresponding event. This is the main persistence hook used by :meth:`_save_checkpoint`. It is a useful override point for subclasses that want to augment Lamin persistence while keeping the generic lifecycle behavior from the base class. 
""" key = self.resolve_artifact_key( trainer=trainer, filepath=filepath, kind="checkpoint" ) existing_artifact = ln.Artifact.filter(key=key).one_or_none() if existing_artifact is not None: existing_artifact.delete(permanent=True, storage=True) artifact = self._create_lamin_artifact( filepath, key=key, description="model checkpoint", kind="model", skip_hash_lookup=True, ) if feature_values: artifact.features.add_values(feature_values) self._notify_artifact_saved( trainer, kind="checkpoint", key=key, artifact=artifact, local_path=filepath, ) return artifact def save_config_artifact( self, trainer: pl.Trainer, config_path: Path | str ) -> ln.Artifact: """Save a Lightning CLI config artifact and emit the corresponding event. Config artifacts are routed through the same lifecycle surface as checkpoints so observers and subclasses see a unified event stream. """ key = self.resolve_artifact_key( trainer=trainer, filepath=config_path, kind="config" ) artifact = self._create_lamin_artifact( config_path, key=key, description="Lightning CLI config", kind="config", add_as_input_to_run=True, skip_hash_lookup=True, ) self._notify_artifact_saved( trainer, kind="config", key=key, artifact=artifact, local_path=config_path, ) return artifact def save_hparams_artifact( self, trainer: pl.Trainer, hparams_path: Path | str ) -> ln.Artifact | None: """Save Lightning's auto-generated hparams file and emit the event. Returns `None` if Lightning did not generate `hparams.yaml` for the current run. """ if not Path(hparams_path).exists(): return None key = self.resolve_artifact_key( trainer=trainer, filepath=hparams_path, kind="hparams" ) artifact = self._create_lamin_artifact( hparams_path, key=key, description="Lightning run hyperparameters", kind="config", skip_hash_lookup=True, ) self._notify_artifact_saved( trainer, kind="hparams", key=key, artifact=artifact, local_path=hparams_path, ) return artifact def _save_hparams_yaml(self, trainer: pl.Trainer) -> None: """Persist Lightning's auto-generated hparams file once per run.""" if self._hparams_yaml_saved: return log_dir = trainer.log_dir if not log_dir: return hparams_path = Path(log_dir) / "hparams.yaml" if not hparams_path.exists(): return if self.save_hparams_artifact(trainer, hparams_path) is not None: self._hparams_yaml_saved = True def _save_checkpoint(self, trainer: pl.Trainer, filepath: str) -> None: """Save checkpoint to the instance.""" super()._save_checkpoint(trainer, filepath) if not trainer.is_global_zero: return self._save_hparams_yaml(trainer) self._feature_annotator.save_run_features( trainer, monitor=self.monitor, mode=self.mode ) self._feature_annotator.clear_last_model_flags(self.checkpoint_key_prefix) is_best = self.best_model_path == str(filepath) feature_values = self._feature_annotator.collect_checkpoint_features( trainer, is_best=is_best, current_score=self.current_score, save_weights_only=self.save_weights_only, monitor=self.monitor, mode=self.mode, ) if is_best: self._feature_annotator.clear_best_model_flags(self.checkpoint_key_prefix) self.save_checkpoint_artifact(trainer, filepath, feature_values=feature_values) self._feature_annotator.update_model_ranks( self.checkpoint_key_prefix, mode=self.mode ) def _remove_checkpoint(self, trainer: pl.Trainer, filepath: str) -> None: """Remove the local checkpoint file and emit a removal event.""" artifact: ln.Artifact | None = None key = self.resolve_artifact_key( trainer=trainer, filepath=filepath, kind="checkpoint" ) if trainer.is_global_zero: artifact = 
ln.Artifact.filter(key=key).one_or_none() super()._remove_checkpoint(trainer, filepath) if trainer.is_global_zero: self._notify_artifact_removed( trainer, kind="checkpoint", key=key, local_path=filepath, artifact=artifact, ) if artifact is not None: artifact.delete(permanent=True, storage=True) class SaveConfigCallback(_SaveConfigCallback): """SaveConfigCallback that also saves config to the instance. Use with LightningCLI to save the resolved configuration file alongside checkpoints. The local config file is saved under `{save_dir}/{name}/{version}/` derived from the first logger, avoiding Lightning's `trainer.log_dir` which hardcodes an `isinstance` check for `TensorBoardLogger` / `CSVLogger` and silently changes the directory for other loggers. This callback looks for any :class:`ArtifactPublishingModelCheckpoint`, not just Lamin's concrete :class:`Checkpoint`. That keeps the config-save path aligned with custom subclasses built on the generic artifact-publishing base. Config artifacts are stored directly under the **base prefix** of the active :class:`Checkpoint` callback. The base prefix follows the same derivation rules as for checkpoints (dirpath > logger > empty), so configs are always co-located with their checkpoints: - `Checkpoint.dirpath` set → `{dirpath}/config.yaml` (`{dirpath}/{run_uid}/config.yaml` with run-UID scoping) - Logger present, no `dirpath` → `{save_dir_basename}/{name}/{version}/config.yaml` - Neither → `config.yaml` (or `{run_uid}/config.yaml` with run-UID scoping) Example:: from lightning.pytorch.cli import LightningCLI from lamindb.integrations import lightning as ll cli = LightningCLI( MyModel, MyDataModule, save_config_callback=ll.SaveConfigCallback, ) """ def setup( self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str ) -> None: """Save resolved configuration file alongside checkpoints.""" if self.already_saved: # type: ignore return if self.save_to_log_dir: config_path = self._config_path(trainer) if not self.overwrite: file_exists = config_path.exists() if trainer.is_global_zero else False file_exists = trainer.strategy.broadcast(file_exists) if file_exists: raise RuntimeError(f"Config file already exists: {config_path}") if trainer.is_global_zero: config_path.parent.mkdir(exist_ok=True, parents=True) self.parser.save( self.config, config_path, skip_none=False, overwrite=self.overwrite, multifile=self.multifile, ) self._save_config(trainer, config_path) if trainer.is_global_zero: self.save_config(trainer, pl_module, stage) self.already_saved = True self.already_saved = trainer.strategy.broadcast(self.already_saved) def _config_path(self, trainer: pl.Trainer) -> Path: """Derive the local config file path from the first logger. We intentionally avoid `trainer.log_dir` because Lightning hardcodes an `isinstance` check against `TensorBoardLogger` and `CSVLogger` there. For those two loggers it uses `logger.log_dir` (which appends name/version), while for every other logger it falls back to `logger.save_dir` (no name/version). This means the config file location silently changes depending on which logger happens to be first — making it unpredictable for third-party loggers. This method always uses `logger.save_dir` + `name` + `version`, giving a consistent directory layout regardless of logger type. 
""" if len(trainer.loggers) > 0: first = trainer.loggers[0] save_dir = ( first.save_dir if first.save_dir is not None else trainer.default_root_dir ) name = first.name version = first.version version = version if isinstance(version, str) else f"version_{version}" return Path(save_dir) / str(name) / version / self.config_filename return Path(trainer.default_root_dir) / self.config_filename def _save_config(self, trainer: pl.Trainer, config_path: Path) -> None: """Persist the resolved config through the active artifact checkpoint. If no artifact-publishing checkpoint callback is registered, this becomes a no-op and only Lightning's local config file is written. """ checkpoint_cb = self._get_artifact_checkpoint_callback(trainer) if checkpoint_cb is None: return checkpoint_cb.save_config_artifact(trainer, config_path) def _get_artifact_checkpoint_callback( self, trainer: pl.Trainer ) -> ArtifactPublishingModelCheckpoint | None: """Find the artifact-publishing checkpoint callback if present.""" for cb in trainer.callbacks: if isinstance(cb, ArtifactPublishingModelCheckpoint): return cb return None # backwards compatibility # We keep the full class around because it's short and it's cumbersome to write # full backwards compatibility code because of the rather different interfaces and behavior class Callback(pl.Callback): """Saves checkpoints to LaminDB after each training epoch. .. deprecated:: Use :class:`Checkpoint` instead for new code. Args: path: A local path to the checkpoint. key: The `key` for the checkpoint artifact. features: Features to annotate the checkpoint. """ def __init__( self, path: str | Path, key: str, features: dict[str, Any] | None = None, ): warnings.warn( "ll.Callback is deprecated, use ll.Checkpoint instead", DeprecationWarning, stacklevel=2, ) self.path = Path(path) self.key = key self.features = features or {} def on_train_start( self, trainer: pl.Trainer, pl_module: pl.LightningModule ) -> None: """Validates that features exist for all specified params.""" missing = [ name for name in self.features if ln.Feature.filter(name=name).one_or_none() is None ] if missing: s = "s" if len(missing) > 1 else "" raise ValueError( f"Feature{s} {', '.join(missing)} missing. " f"Create {'them' if len(missing) > 1 else 'it'} first." 
) def on_train_epoch_end( self, trainer: pl.Trainer, pl_module: pl.LightningModule ) -> None: """Saves model checkpoint at the end of each epoch.""" trainer.save_checkpoint(self.path) artifact = ln.Artifact(self.path, key=self.key, kind="model").save() feature_values = dict(self.features) for name in self.features: if hasattr(trainer, name): feature_values[name] = getattr(trainer, name) elif name in trainer.callback_metrics: metric = trainer.callback_metrics[name] feature_values[name] = ( metric.item() if hasattr(metric, "item") else float(metric) ) if feature_values: artifact.features.add_values(feature_values) __all__ = [ "ArtifactObserver", "ArtifactEvent", "ArtifactPublisher", "ArtifactPublishingModelCheckpoint", "ArtifactRemovedEvent", "ArtifactSavedEvent", "Checkpoint", "LaminArtifactPublisher", "SaveConfigCallback", "save_lightning_features", ] ================================================ FILE: lamindb/migrations/0177_squashed.py ================================================ # Generated by Django 5.2 on 2026-01-10 23:06 import django.core.validators import django.db.models.deletion import django.db.models.functions.datetime import django.db.models.functions.text import pgtrigger.compiler import pgtrigger.migrations from django.db import connection, migrations, models import lamindb.base.fields import lamindb.base.uids import lamindb.base.users import lamindb.models.can_curate import lamindb.models.has_parents import lamindb.models.run import lamindb.models.sqlrecord CREATE_IS_VALID_RECORD_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_record_type(record_type_id INTEGER, record_is_type BOOLEAN) RETURNS BOOLEAN AS $$ BEGIN -- Record with no type is valid IF record_type_id IS NULL THEN RETURN TRUE; END IF; -- If current record is a type, it can only reference schema-less types IF record_is_type THEN RETURN EXISTS ( SELECT 1 FROM lamindb_record r WHERE r.id = record_type_id AND r.is_type AND r.schema_id IS NULL ); END IF; -- Regular records can reference any type RETURN EXISTS ( SELECT 1 FROM lamindb_record r WHERE r.id = record_type_id AND r.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_RECORD_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_record ADD CONSTRAINT record_type_is_valid_fk CHECK (is_valid_record_type(type_id, is_type)); """ CREATE_IS_VALID_FEATURE_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_feature_type(feature_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN -- Feature with no type is valid IF feature_type_id IS NULL THEN RETURN TRUE; END IF; -- Type must have is_type = TRUE RETURN EXISTS ( SELECT 1 FROM lamindb_feature f WHERE f.id = feature_type_id AND f.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_FEATURE_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_feature ADD CONSTRAINT feature_type_is_valid_fk CHECK (is_valid_feature_type(type_id)); """ CREATE_IS_VALID_SCHEMA_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_schema_type(schema_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF schema_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_schema s WHERE s.id = schema_type_id AND s.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_SCHEMA_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_schema ADD CONSTRAINT schema_type_is_valid_fk CHECK (is_valid_schema_type(type_id)); """ CREATE_IS_VALID_PROJECT_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_project_type(project_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF project_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM 
lamindb_project p WHERE p.id = project_type_id AND p.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_PROJECT_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_project ADD CONSTRAINT project_type_is_valid_fk CHECK (is_valid_project_type(type_id)); """ CREATE_IS_VALID_REFERENCE_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_reference_type(reference_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF reference_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_reference r WHERE r.id = reference_type_id AND r.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_REFERENCE_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_reference ADD CONSTRAINT reference_type_is_valid_fk CHECK (is_valid_reference_type(type_id)); """ CREATE_IS_VALID_ULABEL_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_ulabel_type(ulabel_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF ulabel_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_ulabel u WHERE u.id = ulabel_type_id AND u.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_ULABEL_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_ulabel ADD CONSTRAINT ulabel_type_is_valid_fk CHECK (is_valid_ulabel_type(type_id)); """ def apply_constraints(apps, schema_editor): if schema_editor.connection.vendor == "postgresql": schema_editor.execute(CREATE_IS_VALID_RECORD_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_RECORD_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_FEATURE_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_FEATURE_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_SCHEMA_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_SCHEMA_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_PROJECT_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_PROJECT_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_REFERENCE_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_REFERENCE_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_ULABEL_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_ULABEL_TYPE_CONSTRAINT) class Migration(migrations.Migration): initial = True dependencies = [] # type: ignore operations = [ migrations.CreateModel( name="Migration", fields=[ ( "id", models.BigAutoField( auto_created=True, primary_key=True, serialize=False, verbose_name="ID", ), ), ( "app", lamindb.base.fields.CharField( blank=True, default=None, max_length=255 ), ), ( "name", lamindb.base.fields.CharField( blank=True, default=None, max_length=255 ), ), ("applied", lamindb.base.fields.DateTimeField(blank=True)), ], options={ "db_table": "django_migrations", "managed": False, }, ), migrations.CreateModel( name="Block", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", 
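# --- hedged example (not part of the source): what the is_valid_*_type CHECK
# constraints above enforce, expressed with lamindb objects. On PostgreSQL,
# pointing `type` at a record that is not itself a type violates the constraint;
# the exact exception surfaced to Python may differ. Names are illustrative.
import lamindb as ln

celltype = ln.ULabel(name="CellType", is_type=True).save()  # a type -> valid target for type_id
t_cell = ln.ULabel(name="T cell", type=celltype).save()     # OK: references a row with is_type=True
# ln.ULabel(name="B cell", type=t_cell).save()              # would violate ulabel_type_is_valid_fk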
models.JSONField(db_default=None, default=None, null=True)), ("key", models.CharField(db_index=True, max_length=1024)), ], ), migrations.CreateModel( name="Branch", fields=[ ("id", models.AutoField(primary_key=True, serialize=False)), ("name", models.CharField(db_index=True, max_length=100)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ], ), migrations.CreateModel( name="Space", fields=[ ("id", models.SmallAutoField(primary_key=True, serialize=False)), ("name", models.CharField(db_index=True, max_length=100)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ], ), migrations.CreateModel( name="Artifact", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=20, unique=True, ), ), ( "key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=1024, null=True, ), ), ( "_real_key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=1024, null=True, ), ), ( "description", lamindb.base.fields.TextField( blank=True, db_index=True, default=None, null=True ), ), ( "suffix", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=30, ), ), ( "kind", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=20, null=True, ), ), ( "otype", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=64, null=True, ), ), ( "size", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, editable=False, null=True, ), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=22, null=True, ), ), ( "n_files", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, editable=False, null=True, ), ), ( "n_observations", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, editable=False, null=True, ), ), ( "_hash_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, 
max_length=30, null=True, ), ), ( "_key_is_virtual", lamindb.base.fields.BooleanField(blank=True, default=None), ), ( "_overwrite_versions", lamindb.base.fields.BooleanField(blank=True, default=None), ), ( "_actions", models.ManyToManyField( related_name="_action_targets", to="lamindb.artifact" ), ), ], options={ "abstract": False, }, ), migrations.CreateModel( name="ArtifactArtifact", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.artifact", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_value", to="lamindb.artifact", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="artifacts", field=models.ManyToManyField( related_name="linked_by_artifacts", through="lamindb.ArtifactArtifact", to="lamindb.artifact", ), ), migrations.CreateModel( name="BlockProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "block", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.block", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="block", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="artifact", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.CreateModel( name="Collection", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_20, editable=False, max_length=20, unique=True, ), ), ( "key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=22, null=True, ), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, 
max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "_actions", models.ManyToManyField(related_name="+", to="lamindb.artifact"), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "meta_artifact", lamindb.base.fields.OneToOneField( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_meta_of_collection", to="lamindb.artifact", ), ), ], options={ "abstract": False, }, ), migrations.CreateModel( name="CollectionArtifact", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.artifact", ), ), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.collection", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="collection", name="artifacts", field=models.ManyToManyField( related_name="collections", through="lamindb.CollectionArtifact", to="lamindb.artifact", ), ), migrations.CreateModel( name="CollectionProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.collection", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="CollectionReference", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.collection", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="Feature", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150 ), ), ( "_dtype_str", 
lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "unit", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ("array_rank", models.SmallIntegerField(db_index=True, default=0)), ("array_size", models.IntegerField(db_index=True, default=0)), ( "array_shape", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "synonyms", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "default_value", lamindb.base.fields.JSONField(blank=True, default=None, null=True), ), ( "nullable", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "coerce", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="features", to="lamindb.feature", ), ), ], options={ "abstract": False, }, bases=(lamindb.models.can_curate.CanCurate, models.Model), ), migrations.CreateModel( name="CollectionRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.collection", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collectionrecord", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactRun", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_run", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactrun", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactReference", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactreference", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, 
db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactrecord", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactproject", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifactartifact", name="feature", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactartifact", to="lamindb.feature", ), ), migrations.CreateModel( name="FeatureProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="JsonValue", fields=[ ( "id", models.BigAutoField( auto_created=True, primary_key=True, serialize=False, verbose_name="ID", ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("value", models.JSONField()), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=22, null=True, ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="values", to="lamindb.feature", ), ), ], options={ "abstract": False, "base_manager_name": "objects", }, ), migrations.CreateModel( name="ArtifactJsonValue", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_jsonvalue", to="lamindb.artifact", ), ), ( "jsonvalue", 
lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.jsonvalue", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="json_values", field=models.ManyToManyField( related_name="artifacts", through="lamindb.ArtifactJsonValue", to="lamindb.jsonvalue", ), ), migrations.CreateModel( name="Project", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "abbr", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32, null=True, ), ), ( "url", lamindb.base.fields.URLField( blank=True, default=None, max_length=255, null=True ), ), ( "start_date", lamindb.base.fields.DateField(blank=True, default=None, null=True), ), ( "end_date", lamindb.base.fields.DateField(blank=True, default=None, null=True), ), ("_status_code", models.SmallIntegerField(db_index=True, default=0)), ( "artifacts", models.ManyToManyField( related_name="projects", through="lamindb.ArtifactProject", to="lamindb.artifact", ), ), ( "blocks", models.ManyToManyField( related_name="projects", through="lamindb.BlockProject", to="lamindb.block", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collections", models.ManyToManyField( related_name="projects", through="lamindb.CollectionProject", to="lamindb.collection", ), ), ( "features", models.ManyToManyField( related_name="projects", through="lamindb.FeatureProject", to="lamindb.feature", ), ), ( "parents", models.ManyToManyField( related_name="children", to="lamindb.project" ), ), ( "predecessors", models.ManyToManyField( related_name="successors", to="lamindb.project" ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="projects", to="lamindb.project", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.can_curate.CanCurate, models.Model, lamindb.models.sqlrecord.ValidateFields, ), ), migrations.AddField( model_name="featureproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_feature", to="lamindb.project", ), ), migrations.AddField( model_name="collectionproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.project", ), 
), migrations.AddField( model_name="blockproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_block", to="lamindb.project", ), ), migrations.AddField( model_name="artifactproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.project", ), ), migrations.CreateModel( name="Record", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=16, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ("extra_data", models.JSONField(null=True)), ( "artifacts", models.ManyToManyField( related_name="records", through="lamindb.ArtifactRecord", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collections", models.ManyToManyField( related_name="records", through="lamindb.CollectionRecord", to="lamindb.collection", ), ), ( "parents", models.ManyToManyField( related_name="children", to="lamindb.record" ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="records", to="lamindb.record", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.has_parents.HasParents, lamindb.models.can_curate.CanCurate, models.Model, ), ), migrations.CreateModel( name="ProjectRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_projectrecord", to="lamindb.feature", ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.project", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.record", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="records", 
field=models.ManyToManyField( related_name="projects", through="lamindb.ProjectRecord", to="lamindb.record", ), ), migrations.AddField( model_name="collectionrecord", name="record", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.record", ), ), migrations.AddField( model_name="artifactrecord", name="record", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.record", ), ), migrations.CreateModel( name="RecordArtifact", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordartifact", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_artifact", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.artifact", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="artifact", name="linked_in_records", field=models.ManyToManyField( related_name="linked_artifacts", through="lamindb.RecordArtifact", to="lamindb.record", ), ), migrations.CreateModel( name="RecordCollection", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordcollection", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_collection", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.collection", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="collection", name="linked_in_records", field=models.ManyToManyField( related_name="linked_collections", through="lamindb.RecordCollection", to="lamindb.record", ), ), migrations.CreateModel( name="RecordProject", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordproject", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_project", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.project", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="project", name="linked_in_records", field=models.ManyToManyField( related_name="linked_projects", through="lamindb.RecordProject", to="lamindb.record", ), ), migrations.CreateModel( name="RecordRecord", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, 
on_delete=django.db.models.deletion.PROTECT, related_name="links_recordrecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_record", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.record", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="record", name="linked_records", field=models.ManyToManyField( related_name="linked_in_records", through="lamindb.RecordRecord", to="lamindb.record", ), ), migrations.CreateModel( name="RecordReference", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordreference", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_reference", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordRun", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordrun", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_run", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordTransform", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordtransform", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_transform", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordULabel", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordulabel", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_ulabel", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordUser", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recorduser", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_user", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="Reference", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( 
blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "abbr", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32, null=True, ), ), ( "url", lamindb.base.fields.URLField(blank=True, db_index=True, null=True), ), ( "pubmed_id", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, null=True ), ), ( "doi", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, validators=[ django.core.validators.RegexValidator( message="Must be a DOI (e.g., 10.1000/xyz123 or https://doi.org/10.1000/xyz123)", regex="^(?:https?://(?:dx\\.)?doi\\.org/|doi:|DOI:)?10\\.\\d+/.*$", ) ], ), ), ( "text", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "date", lamindb.base.fields.DateField(blank=True, default=None, null=True), ), ( "artifacts", models.ManyToManyField( related_name="references", through="lamindb.ArtifactReference", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collections", models.ManyToManyField( related_name="references", through="lamindb.CollectionReference", to="lamindb.collection", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_references", through="lamindb.RecordReference", to="lamindb.record", ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="references", to="lamindb.reference", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.can_curate.CanCurate, models.Model, lamindb.models.sqlrecord.ValidateFields, ), ), migrations.AddField( model_name="recordreference", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.reference", ), ), migrations.AddField( model_name="project", name="references", field=models.ManyToManyField( related_name="projects", to="lamindb.reference" ), ), migrations.AddField( model_name="collectionreference", name="reference", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.reference", ), ), migrations.AddField( model_name="artifactreference", name="reference", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.reference", ), ), migrations.CreateModel( name="ReferenceRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", 
models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_referencerecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.record", ), ), ( "reference", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.reference", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="reference", name="records", field=models.ManyToManyField( related_name="references", through="lamindb.ReferenceRecord", to="lamindb.record", ), ), migrations.CreateModel( name="Run", fields=[ ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "entrypoint", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "started_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "finished_at", lamindb.base.fields.DateTimeField( blank=True, db_index=True, default=None, null=True ), ), ("params", models.JSONField(null=True)), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "cli_args", lamindb.base.fields.CharField( blank=True, default=None, max_length=1024, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_is_consecutive", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "_status_code", models.SmallIntegerField( db_default=-3, db_index=True, default=-3, null=True ), ), ( "artifacts", models.ManyToManyField( related_name="runs", through="lamindb.ArtifactRun", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "environment", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_environment_of", to="lamindb.artifact", ), ), ( "initiated_by_run", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="initiated_runs", to="lamindb.run", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_runs", through="lamindb.RecordRun", to="lamindb.record", ), ), ( "report", lamindb.base.fields.ForeignKey( blank=True, 
default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_report_of", to="lamindb.artifact", ), ), ], ), migrations.AddField( model_name="referencerecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="reference", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="recordrun", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.run", ), ), migrations.AddField( model_name="record", name="input_of_runs", field=models.ManyToManyField( related_name="input_records", to="lamindb.run" ), ), migrations.AddField( model_name="record", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, editable=False, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="output_records", to="lamindb.run", ), ), migrations.AddField( model_name="projectrecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="project", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="jsonvalue", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="featureproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="feature", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionreference", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionrecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionartifact", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collection", 
name="input_of_runs", field=models.ManyToManyField( related_name="input_collections", to="lamindb.run" ), ), migrations.AddField( model_name="collection", name="recreating_runs", field=models.ManyToManyField( related_name="recreated_collections", to="lamindb.run" ), ), migrations.AddField( model_name="collection", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="output_collections", to="lamindb.run", ), ), migrations.AddField( model_name="blockproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.CreateModel( name="ArtifactUser", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_user", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactuser", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactULabel", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactulabel", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifactrun", name="run", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.run", ), ), migrations.AddField( model_name="artifactreference", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactrecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactjsonvalue", name="run", 
field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactartifact", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifact", name="input_of_runs", field=models.ManyToManyField( related_name="input_artifacts", to="lamindb.run" ), ), migrations.AddField( model_name="artifact", name="recreating_runs", field=models.ManyToManyField( related_name="recreated_artifacts", to="lamindb.run" ), ), migrations.AddField( model_name="artifact", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=None, editable=False, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="output_artifacts", to="lamindb.run", ), ), migrations.CreateModel( name="RunJsonValue", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "jsonvalue", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.jsonvalue", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_jsonvalue", to="lamindb.run", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="run", name="json_values", field=models.ManyToManyField( related_name="runs", through="lamindb.RunJsonValue", to="lamindb.jsonvalue", ), ), migrations.CreateModel( name="RunProject", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.run", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="project", name="runs", field=models.ManyToManyField( related_name="projects", through="lamindb.RunProject", to="lamindb.run" ), ), migrations.CreateModel( name="RunRecord", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_runrecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.record", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.run", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="record", name="runs", field=models.ManyToManyField( related_name="records", through="lamindb.RunRecord", 
to="lamindb.run" ), ), migrations.CreateModel( name="Schema", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=16, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "n_members", lamindb.base.fields.IntegerField( blank=True, default=None, null=True ), ), ( "coerce", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "flexible", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "itype", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=120, null=True, ), ), ( "otype", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=64, null=True, ), ), ( "_dtype_str", lamindb.base.fields.CharField( blank=True, default=None, editable=False, max_length=64, null=True, ), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=22, null=True, ), ), ( "minimal_set", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True, editable=False ), ), ( "ordered_set", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=False, editable=False ), ), ( "maximal_set", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=False, editable=False ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="schemas", to="lamindb.schema", ), ), ], options={ "abstract": False, }, bases=(lamindb.models.can_curate.CanCurate, models.Model), ), migrations.AddField( model_name="record", name="schema", field=lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="records", to="lamindb.schema", ), ), migrations.CreateModel( name="ArtifactSchema", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "slot", lamindb.base.fields.CharField( blank=True, default=None, max_length=255, null=True ), ), ( "feature_ref_is_semantic", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "artifact", 
lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="_links_schema", to="lamindb.artifact", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "schema", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="_links_artifact", to="lamindb.schema", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="schema", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="validated_artifacts", to="lamindb.schema", ), ), migrations.AddField( model_name="artifact", name="schemas", field=models.ManyToManyField( related_name="artifacts", through="lamindb.ArtifactSchema", to="lamindb.schema", ), ), migrations.CreateModel( name="SchemaComponent", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "slot", lamindb.base.fields.CharField( blank=True, default=None, max_length=255, null=True ), ), ( "component", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_composite", to="lamindb.schema", ), ), ( "composite", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_component", to="lamindb.schema", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="schema", name="components", field=models.ManyToManyField( related_name="composites", through="lamindb.SchemaComponent", to="lamindb.schema", ), ), migrations.CreateModel( name="SchemaFeature", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_schema", to="lamindb.feature", ), ), ( "schema", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_feature", to="lamindb.schema", ), ), ], options={ "unique_together": {("schema", "feature")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="feature", name="schemas", field=models.ManyToManyField( related_name="features", through="lamindb.SchemaFeature", to="lamindb.schema", ), ), migrations.CreateModel( name="SchemaProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_schema", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "schema", lamindb.base.fields.ForeignKey( blank=True, 
on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.schema", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="schemas", field=models.ManyToManyField( related_name="projects", through="lamindb.SchemaProject", to="lamindb.schema", ), ), migrations.AddField( model_name="schema", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="run", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="reference", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="record", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="project", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="jsonvalue", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="feature", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="collection", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="branch", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="block", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="artifact", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.CreateModel( name="Storage", fields=[ ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "root", 
lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, unique=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30 ), ), ( "region", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=64, null=True, ), ), ( "instance_uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=12, null=True, ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ], options={ "abstract": False, }, ), migrations.AddField( model_name="artifact", name="storage", field=lamindb.base.fields.ForeignKey( blank=True, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="artifacts", to="lamindb.storage", ), ), migrations.CreateModel( name="Transform", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=16, unique=True, ), ), ( "key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=1024 ), ), ( "description", lamindb.base.fields.TextField( blank=True, db_index=True, default=None, null=True ), ), ( "kind", lamindb.base.fields.CharField( blank=True, db_index=True, default="pipeline", max_length=20 ), ), ( "source_code", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=22, null=True, ), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "environment", models.ForeignKey( null=True, on_delete=django.db.models.deletion.CASCADE, related_name="_environment_of_transforms", to="lamindb.artifact", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_transforms", through="lamindb.RecordTransform", 
to="lamindb.record", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ], options={ "abstract": False, }, ), migrations.AddField( model_name="run", name="transform", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="runs", to="lamindb.transform", ), ), migrations.AddField( model_name="recordtransform", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.transform", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="transforms", field=models.ManyToManyField( related_name="projects", through="lamindb.TransformProject", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformRecord", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), editable=False, ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transformrecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.record", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.transform", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="record", name="transforms", field=models.ManyToManyField( related_name="records", through="lamindb.TransformRecord", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformReference", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "reference", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.reference", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", 
lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.transform", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="reference", name="transforms", field=models.ManyToManyField( related_name="references", through="lamindb.TransformReference", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformTransform", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ("config", models.JSONField(default=None, null=True)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), editable=False, ), ), ( "predecessor", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_successor", to="lamindb.transform", ), ), ( "successor", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_predecessor", to="lamindb.transform", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="transform", name="predecessors", field=models.ManyToManyField( related_name="successors", through="lamindb.TransformTransform", to="lamindb.transform", ), ), migrations.CreateModel( name="ULabel", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_8, editable=False, max_length=8, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "artifacts", models.ManyToManyField( related_name="ulabels", through="lamindb.ArtifactULabel", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_column="branch_id", db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_ulabels", through="lamindb.RecordULabel", to="lamindb.record", ), ), ( "parents", models.ManyToManyField( related_name="children", to="lamindb.ulabel" ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ( "type", 
lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="ulabels", to="lamindb.ulabel", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.has_parents.HasParents, lamindb.models.can_curate.CanCurate, models.Model, ), ), migrations.CreateModel( name="TransformULabel", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.transform", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.ulabel", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="transform", name="ulabels", field=models.ManyToManyField( related_name="transforms", through="lamindb.TransformULabel", to="lamindb.ulabel", ), ), migrations.CreateModel( name="RunULabel", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.run", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.ulabel", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="run", name="ulabels", field=models.ManyToManyField( related_name="runs", through="lamindb.RunULabel", to="lamindb.ulabel" ), ), migrations.AddField( model_name="recordulabel", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.ulabel", ), ), migrations.CreateModel( name="CollectionULabel", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.collection", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collectionulabel", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.ulabel", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="collection", name="ulabels", field=models.ManyToManyField( related_name="collections", through="lamindb.CollectionULabel", to="lamindb.ulabel", 
), ), migrations.AddField( model_name="artifactulabel", name="ulabel", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.ulabel", ), ), migrations.CreateModel( name="ULabelProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_ulabel", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.ulabel", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="ulabels", field=models.ManyToManyField( related_name="projects", through="lamindb.ULabelProject", to="lamindb.ulabel", ), ), migrations.CreateModel( name="User", fields=[ ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=8, unique=True, ), ), ( "handle", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "artifacts", models.ManyToManyField( related_name="users", through="lamindb.ArtifactUser", through_fields=("user", "artifact"), to="lamindb.artifact", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_users", through="lamindb.RecordUser", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.can_curate.CanCurate), ), migrations.AddField( model_name="ulabelproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ULabelBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "ulabel", models.ForeignKey( 
on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.ulabel", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="ulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformtransform", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformreference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="TransformBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ("line_number", models.IntegerField(null=True)), ( "transform", models.ForeignKey( null=True, on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.transform", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="transform", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="created_transforms", to="lamindb.user", ), ), migrations.AddField( model_name="storage", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", 
to="lamindb.user", ), ), migrations.CreateModel( name="SpaceBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "space", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.space", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="space", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="schemaproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="schemacomponent", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="SchemaBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "schema", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.schema", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="schema", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", 
to="lamindb.user", ), ), migrations.AddField( model_name="runrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runjsonvalue", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="RunBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "run", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.run", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="run", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.CASCADE, related_name="created_runs", to="lamindb.user", ), ), migrations.AddField( model_name="referencerecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="reference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="recorduser", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.user", ), ), migrations.CreateModel( name="RecordBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, 
default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "record", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.record", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="record", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="projectrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ProjectBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "project", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.project", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="project", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="jsonvalue", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="featureproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="FeatureBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( 
"kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "feature", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.feature", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="feature", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionreference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="CollectionBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "collection", models.ForeignKey( null=True, on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.collection", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="collectionartifact", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collection", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, 
on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="BranchBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.branch", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="branch", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="blockproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="block", name="created_by", field=models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactuser", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactuser", name="user", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.user", ), ), migrations.AddField( model_name="artifactulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactschema", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactrun", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactreference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactrecord", name="created_by", 
field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactjsonvalue", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ArtifactBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="mdpage", db_index=True, default="mdpage", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "artifact", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.artifact", ), ), ( "created_by", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="artifactartifact", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifact", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="created_artifacts", to="lamindb.user", ), ), migrations.CreateModel( name="RecordJson", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "value", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordjson", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_json", to="lamindb.record", ), ), ], options={ "unique_together": {("record", "feature")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AlterUniqueTogether( name="recordreference", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="recordrun", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="recordtransform", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="recordulabel", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( 
name="ulabelproject", unique_together={("ulabel", "project")}, ), migrations.AlterUniqueTogether( name="transformulabel", unique_together={("transform", "ulabel")}, ), migrations.AlterUniqueTogether( name="transformtransform", unique_together={("successor", "predecessor")}, ), migrations.AlterUniqueTogether( name="transformreference", unique_together={("transform", "reference")}, ), migrations.AlterUniqueTogether( name="transformrecord", unique_together={("transform", "record", "feature")}, ), migrations.AlterUniqueTogether( name="transformproject", unique_together={("transform", "project")}, ), migrations.AlterUniqueTogether( name="transform", unique_together={("key", "hash")}, ), migrations.AddConstraint( model_name="space", constraint=models.UniqueConstraint( django.db.models.functions.text.Lower("name"), name="unique_space_name_lower", ), ), migrations.AlterUniqueTogether( name="schemaproject", unique_together={("schema", "project")}, ), migrations.AlterUniqueTogether( name="schemacomponent", unique_together={("composite", "slot"), ("composite", "slot", "component")}, ), migrations.AlterUniqueTogether( name="runulabel", unique_together={("run", "ulabel")}, ), migrations.AlterUniqueTogether( name="runrecord", unique_together={("run", "record", "feature")}, ), migrations.AlterUniqueTogether( name="runproject", unique_together={("run", "project")}, ), migrations.AlterUniqueTogether( name="runjsonvalue", unique_together={("run", "jsonvalue")}, ), migrations.AlterUniqueTogether( name="referencerecord", unique_together={("reference", "feature", "record")}, ), migrations.AlterUniqueTogether( name="recorduser", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="projectrecord", unique_together={("project", "feature", "record")}, ), migrations.AlterUniqueTogether( name="jsonvalue", unique_together={("feature", "hash")}, ), migrations.AlterUniqueTogether( name="featureproject", unique_together={("feature", "project")}, ), migrations.AddConstraint( model_name="feature", constraint=models.CheckConstraint( condition=models.Q( ("is_type", True), ("_dtype_str__isnull", False), _connector="OR" ), name="feature_dtype_str_not_null_when_is_type_false", ), ), migrations.AlterUniqueTogether( name="collectionulabel", unique_together={("collection", "ulabel")}, ), migrations.AlterUniqueTogether( name="collectionreference", unique_together={("collection", "reference")}, ), migrations.AlterUniqueTogether( name="collectionrecord", unique_together={("collection", "record", "feature")}, ), migrations.AlterUniqueTogether( name="collectionproject", unique_together={("collection", "project")}, ), migrations.AlterUniqueTogether( name="collectionartifact", unique_together={("collection", "artifact")}, ), migrations.AddConstraint( model_name="collection", constraint=models.UniqueConstraint( fields=("key", "hash"), name="unique_collection_key_hash_not_null" ), ), migrations.AddConstraint( model_name="branch", constraint=models.UniqueConstraint( django.db.models.functions.text.Lower("name"), name="unique_branch_name_lower", ), ), migrations.AlterUniqueTogether( name="blockproject", unique_together={("block", "project")}, ), migrations.AlterUniqueTogether( name="artifactuser", unique_together={("artifact", "user", "feature")}, ), migrations.AlterUniqueTogether( name="artifactulabel", unique_together={("artifact", "ulabel", "feature")}, ), migrations.AlterUniqueTogether( name="artifactschema", unique_together={("artifact", "schema"), ("artifact", "slot")}, ), 
migrations.AlterUniqueTogether( name="artifactrun", unique_together={("artifact", "run", "feature")}, ), migrations.AlterUniqueTogether( name="artifactreference", unique_together={("artifact", "reference", "feature")}, ), migrations.AlterUniqueTogether( name="artifactrecord", unique_together={("artifact", "record", "feature")}, ), migrations.AlterUniqueTogether( name="artifactproject", unique_together={("artifact", "project", "feature")}, ), migrations.AlterUniqueTogether( name="artifactjsonvalue", unique_together={("artifact", "jsonvalue")}, ), migrations.AlterUniqueTogether( name="artifactartifact", unique_together={("artifact", "value", "feature")}, ), migrations.AddConstraint( model_name="artifact", constraint=models.UniqueConstraint( condition=models.Q(("key__isnull", False)), fields=("storage", "key", "hash"), name="unique_artifact_storage_key_hash_not_null", ), ), migrations.AddConstraint( model_name="artifact", constraint=models.UniqueConstraint( condition=models.Q(("key__isnull", True)), fields=("storage", "hash"), name="unique_artifact_storage_hash_null_key", ), ), migrations.RunPython(apply_constraints), ] if connection.vendor == "postgresql": Migration.operations += [ pgtrigger.migrations.AddTrigger( model_name="ulabel", trigger=pgtrigger.compiler.Trigger( name="prevent_ulabel_type_cycle", sql=pgtrigger.compiler.UpsertTriggerSql( condition="WHEN (NEW.type_id IS NOT NULL)", func="\n -- Check for direct self-reference\n IF NEW.type_id = NEW.id THEN\n RAISE EXCEPTION 'Cannot set type: ulabel cannot be its own type';\n END IF;\n\n -- Check for cycles in the type chain\n IF EXISTS (\n WITH RECURSIVE type_chain AS (\n SELECT type_id, 1 as depth\n FROM lamindb_ulabel\n WHERE id = NEW.type_id\n\n UNION ALL\n\n SELECT r.type_id, tc.depth + 1\n FROM lamindb_ulabel r\n INNER JOIN type_chain tc ON r.id = tc.type_id\n WHERE tc.depth < 100\n )\n SELECT 1 FROM type_chain WHERE type_id = NEW.id\n ) THEN\n RAISE EXCEPTION 'Cannot set type: would create a cycle';\n END IF;\n\n RETURN NEW;\n ", hash="53487a8e36a64748418457f7229de6d5cf31e6bd", operation="UPDATE OR INSERT", pgid="pgtrigger_prevent_ulabel_type_cycle_863ae", table="lamindb_ulabel", when="BEFORE", ), ), ), pgtrigger.migrations.AddTrigger( model_name="record", trigger=pgtrigger.compiler.Trigger( name="prevent_record_type_cycle", sql=pgtrigger.compiler.UpsertTriggerSql( condition="WHEN (NEW.type_id IS NOT NULL)", func="\n -- Check for direct self-reference\n IF NEW.type_id = NEW.id THEN\n RAISE EXCEPTION 'Cannot set type: record cannot be its own type';\n END IF;\n\n -- Check for cycles in the type chain\n IF EXISTS (\n WITH RECURSIVE type_chain AS (\n SELECT type_id, 1 as depth\n FROM lamindb_record\n WHERE id = NEW.type_id\n\n UNION ALL\n\n SELECT r.type_id, tc.depth + 1\n FROM lamindb_record r\n INNER JOIN type_chain tc ON r.id = tc.type_id\n WHERE tc.depth < 100\n )\n SELECT 1 FROM type_chain WHERE type_id = NEW.id\n ) THEN\n RAISE EXCEPTION 'Cannot set type: would create a cycle';\n END IF;\n\n RETURN NEW;\n ", hash="deaab832a066dfec76228f5b7a62a08f334876a9", operation="UPDATE OR INSERT", pgid="pgtrigger_prevent_record_type_cycle_56c18", table="lamindb_record", when="BEFORE", ), ), ), pgtrigger.migrations.AddTrigger( model_name="feature", trigger=pgtrigger.compiler.Trigger( name="update_feature_on_name_change", sql=pgtrigger.compiler.UpsertTriggerSql( condition="WHEN (OLD.name IS DISTINCT FROM NEW.name)", func="DECLARE\n old_renamed JSONB;\n new_renamed JSONB;\n ts TEXT;\nBEGIN\n -- Only proceed if name actually changed\n IF 
OLD.name IS DISTINCT FROM NEW.name THEN\n -- Update synonyms\n IF NEW.synonyms IS NULL OR NEW.synonyms = '' THEN\n NEW.synonyms := OLD.name;\n ELSIF position(OLD.name in NEW.synonyms) = 0 THEN\n NEW.synonyms := NEW.synonyms || '|' || OLD.name;\n END IF;\n\n -- Update _aux with rename history\n ts := TO_CHAR(NOW() AT TIME ZONE 'UTC', 'YYYY-MM-DD\"T\"HH24:MI:SS\"Z\"');\n\n -- Get existing renamed history or initialize empty object\n old_renamed := COALESCE((OLD._aux->>'renamed')::JSONB, '{}'::JSONB);\n\n -- Add old name with timestamp\n new_renamed := old_renamed || jsonb_build_object(ts, OLD.name);\n\n -- Update _aux with new renamed history\n IF NEW._aux IS NULL THEN\n NEW._aux := jsonb_build_object('renamed', new_renamed);\n ELSE\n NEW._aux := NEW._aux || jsonb_build_object('renamed', new_renamed);\n END IF;\n END IF;\n\n RETURN NEW;\nEND;\n", hash="5f2e7a65e42c34b0455f0840def52f078726e401", operation="UPDATE", pgid="pgtrigger_update_feature_on_name_change_6c32d", table="lamindb_feature", when="BEFORE", ), ), ), ] ================================================ FILE: lamindb/migrations/0178_v2_2.py ================================================ # Generated by Django 5.2 on 2026-02-15 11:25 import django.db.models.deletion from django.db import migrations, models import lamindb.base.fields import lamindb.models.sqlrecord class Migration(migrations.Migration): dependencies = [ ("lamindb", "0177_squashed"), ] operations = [ migrations.AddField( model_name="artifact", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="block", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="collection", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="feature", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="jsonvalue", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="project", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="record", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="reference", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="run", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="run", name="description", 
field=lamindb.base.fields.TextField(blank=True, default=None, null=True), ), migrations.AddField( model_name="run", name="plan", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_plan_for_runs", to="lamindb.artifact", ), ), migrations.AddField( model_name="schema", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="storage", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="transform", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="transform", name="plan", field=models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="_plan_for_transforms", to="lamindb.artifact", ), ), migrations.AddField( model_name="ulabel", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="artifact", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="block", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="collection", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="feature", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="jsonvalue", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="project", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="record", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="reference", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="run", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="schema", name="branch", 
field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="storage", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="transform", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="ulabel", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.CreateModel( name="BranchPlan", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branchplan", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_branchplan", to="lamindb.branch", ), ), ], options={ "unique_together": {("branch", "artifact")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="plans", field=models.ManyToManyField( related_name="_plan_for_branches", through="lamindb.BranchPlan", to="lamindb.artifact", ), ), migrations.CreateModel( name="BranchProject", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.branch", ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.project", ), ), ], options={ "unique_together": {("branch", "project")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="projects", field=models.ManyToManyField( related_name="branches", through="lamindb.BranchProject", to="lamindb.project", ), ), migrations.CreateModel( name="BranchULabel", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.branch", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.ulabel", ), ), ], options={ "unique_together": {("branch", "ulabel")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="ulabels", field=models.ManyToManyField( related_name="branches", through="lamindb.BranchULabel", to="lamindb.ulabel", ), ), migrations.CreateModel( name="BranchUser", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "role", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32 ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_user", to="lamindb.branch", ), ), ( "user", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.user", ), 
), ], options={ "unique_together": {("branch", "user", "role")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="users", field=models.ManyToManyField( related_name="branches", through="lamindb.BranchUser", to="lamindb.user" ), ), migrations.CreateModel( name="ProjectUser", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "role", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32 ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_user", to="lamindb.project", ), ), ( "user", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_project", to="lamindb.user", ), ), ], options={ "unique_together": {("project", "user", "role")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="project", name="users", field=models.ManyToManyField( related_name="projects", through="lamindb.ProjectUser", to="lamindb.user", ), ), migrations.CreateModel( name="RunArtifact", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_runartifact", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_runartifact", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_runartifact", to="lamindb.run", ), ), ], options={ "unique_together": {("run", "artifact", "feature")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="run", name="linked_artifacts", field=models.ManyToManyField( related_name="linked_by_runs", through="lamindb.RunArtifact", to="lamindb.artifact", ), ), ] ================================================ FILE: lamindb/migrations/0179_v2_2_part_2.py ================================================ # Generated by Django 5.2 on 2026-02-15 14:12 import django.db.models.deletion from django.db import migrations, models import lamindb.base.fields import lamindb.models.sqlrecord class Migration(migrations.Migration): dependencies = [ ("lamindb", "0178_v2_2"), ] operations = [ migrations.RemoveField( model_name="branch", name="plans", ), migrations.CreateModel( name="BranchArtifact", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.branch", ), ), ], options={ "unique_together": {("branch", "artifact")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="artifacts", field=models.ManyToManyField( related_name="linked_by_branches", through="lamindb.BranchArtifact", to="lamindb.artifact", ), ), migrations.DeleteModel( name="BranchPlan", ), ] ================================================ FILE: lamindb/migrations/0180_v2_2_part_3.py ================================================ # Generated by Django 5.2 on 2026-02-15 14:29 import django.db.models.deletion 
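# This follow-up migration alters the two RunArtifact foreign keys, renaming their
# reverse accessors: RunArtifact.artifact becomes reachable from Artifact as
# `links_in_run` and RunArtifact.run from Run as `values_artifact`.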
from django.db import migrations

import lamindb.base.fields


class Migration(migrations.Migration):
    dependencies = [
        ("lamindb", "0179_v2_2_part_2"),
    ]

    operations = [
        migrations.AlterField(
            model_name="runartifact",
            name="artifact",
            field=lamindb.base.fields.ForeignKey(blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_run", to="lamindb.artifact"),
        ),
        migrations.AlterField(
            model_name="runartifact",
            name="run",
            field=lamindb.base.fields.ForeignKey(blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_artifact", to="lamindb.run"),
        ),
    ]


================================================
FILE: lamindb/migrations/0181_v2_2_part_4.py
================================================
# Generated by Django 5.2 on 2026-02-15 15:43

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [
        ("lamindb", "0180_v2_2_part_3"),
    ]

    operations = [
        migrations.AddField(
            model_name="block",
            name="anchor",
            field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.PROTECT, related_name="children", to="lamindb.block"),
        ),
        migrations.AlterField(
            model_name="block",
            name="key",
            field=models.CharField(db_index=True, max_length=1024, null=True),
        ),
    ]


================================================
FILE: lamindb/migrations/0182_v2_2_part_5.py
================================================
# Generated by Django 5.2 on 2026-02-17 16:33

import django.db.models.deletion
from django.db import migrations, models

import lamindb.base.fields
import lamindb.base.users


class Migration(migrations.Migration):
    dependencies = [
        ("lamindb", "0181_v2_2_part_4"),
    ]

    operations = [
        migrations.AddField(
            model_name="branch",
            name="_status_code",
            field=models.SmallIntegerField(db_index=True, default=0),
        ),
        migrations.AlterField(
            model_name="artifactblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="artifactblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="block",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="block",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="branch",
            name="created_by",
            field=lamindb.base.fields.ForeignKey(blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="branchblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="branchblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="collectionblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="collectionblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="featureblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="featureblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="projectblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="projectblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="recordblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="recordblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="runblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="runblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="schemablock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="schemablock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="spaceblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="spaceblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="transformblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="transformblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AlterField(
            model_name="ulabelblock",
            name="created_by",
            field=models.ForeignKey(default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user"),
        ),
        migrations.AlterField(
            model_name="ulabelblock",
            name="kind",
            field=models.CharField(db_default="readme", db_index=True, default="readme", max_length=22),
        ),
        migrations.AddField(
            model_name="artifactblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="block",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="branchblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="collectionblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="featureblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="projectblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="recordblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="runblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="schemablock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="spaceblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="transformblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AddField(
            model_name="ulabelblock",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AlterField(
            model_name="branch",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AlterField(
            model_name="project",
            name="_status_code",
            field=models.SmallIntegerField(db_default=0, db_index=True, default=0),
        ),
        migrations.AlterField(
            model_name="run",
            name="_status_code",
            field=models.SmallIntegerField(db_default=-3, db_index=True, default=-3),
        ),
    ]


================================================
FILE: lamindb/migrations/0183_v2_2_part_6.py
================================================
# Generated by Django 5.2 on 2026-02-17 23:04

from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        ("lamindb", "0182_v2_2_part_5"),
    ]

    operations = [
        migrations.RemoveField(
            model_name="branch",
            name="artifacts",
        ),
        migrations.DeleteModel(
            name="BranchArtifact",
        ),
    ]


================================================
FILE: lamindb/migrations/0184_alter_transformrecord_feature.py
================================================
# Generated by Django 5.2 on 2026-03-07 12:16

import django.db.models.deletion
from django.db import migrations

import lamindb.base.fields


class Migration(migrations.Migration):
    dependencies = [
        ("lamindb", "0183_v2_2_part_6"),
    ]

    operations = [
        migrations.AlterField(
            model_name="transformrecord",
            name="feature",
            field=lamindb.base.fields.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transformrecord", to="lamindb.feature"),
        ),
    ]


================================================
FILE: lamindb/migrations/0185_alter_runrecord_feature.py
================================================
# Generated by Django 5.2 on 2026-04-05 14:32

import django.db.models.deletion
from django.db import migrations

import lamindb.base.fields


class Migration(migrations.Migration):
    dependencies = [
        ("lamindb", "0184_alter_transformrecord_feature"),
    ]

    operations = [
        migrations.AlterField(
            model_name="runrecord",
            name="feature",
            field=lamindb.base.fields.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_runrecord", to="lamindb.feature"),
        ),
    ]


================================================
FILE:
lamindb/migrations/0186_v2_4.py ================================================ # Generated by Django 5.2 on 2026-04-12 18:49 import django.db.models.deletion from django.db import migrations, models import lamindb.base.fields class Migration(migrations.Migration): dependencies = [ ("lamindb", "0185_alter_runrecord_feature"), ] operations = [ migrations.AddField( model_name="artifactblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="artifactblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="branchblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="collectionblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="collectionblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="featureblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="featureblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="projectblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="projectblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="recordblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="recordblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="runblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="runblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="schemablock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="schemablock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="spaceblock", name="branch", field=models.ForeignKey( db_default=1, default=1, 
on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="spaceblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="transformblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="transformblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="ulabelblock", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="ulabelblock", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="block", name="branch", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="block", name="created_on", field=models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="branch", name="_aux", field=lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ] ================================================ FILE: lamindb/migrations/0187_squashed.py ================================================ # Generated by Django 5.2 on 2026-04-16 06:44 import django.core.validators import django.db.models.deletion import django.db.models.functions.datetime import django.db.models.functions.text import pgtrigger.compiler import pgtrigger.migrations from django.db import connection, migrations, models import lamindb.base.fields import lamindb.base.uids import lamindb.base.users import lamindb.models.can_curate import lamindb.models.has_parents import lamindb.models.run import lamindb.models.sqlrecord CREATE_IS_VALID_RECORD_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_record_type(record_type_id INTEGER, record_is_type BOOLEAN) RETURNS BOOLEAN AS $$ BEGIN -- Record with no type is valid IF record_type_id IS NULL THEN RETURN TRUE; END IF; -- If current record is a type, it can only reference schema-less types IF record_is_type THEN RETURN EXISTS ( SELECT 1 FROM lamindb_record r WHERE r.id = record_type_id AND r.is_type AND r.schema_id IS NULL ); END IF; -- Regular records can reference any type RETURN EXISTS ( SELECT 1 FROM lamindb_record r WHERE r.id = record_type_id AND r.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_RECORD_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_record ADD CONSTRAINT record_type_is_valid_fk CHECK (is_valid_record_type(type_id, is_type)); """ CREATE_IS_VALID_FEATURE_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_feature_type(feature_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN -- Feature with no type is valid IF feature_type_id IS NULL THEN RETURN TRUE; END IF; -- Type must have is_type = TRUE RETURN EXISTS ( SELECT 1 FROM lamindb_feature f WHERE f.id = feature_type_id AND f.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_FEATURE_TYPE_CONSTRAINT = """ ALTER TABLE 
lamindb_feature ADD CONSTRAINT feature_type_is_valid_fk CHECK (is_valid_feature_type(type_id)); """ CREATE_IS_VALID_SCHEMA_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_schema_type(schema_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF schema_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_schema s WHERE s.id = schema_type_id AND s.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_SCHEMA_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_schema ADD CONSTRAINT schema_type_is_valid_fk CHECK (is_valid_schema_type(type_id)); """ CREATE_IS_VALID_PROJECT_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_project_type(project_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF project_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_project p WHERE p.id = project_type_id AND p.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_PROJECT_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_project ADD CONSTRAINT project_type_is_valid_fk CHECK (is_valid_project_type(type_id)); """ CREATE_IS_VALID_REFERENCE_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_reference_type(reference_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF reference_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_reference r WHERE r.id = reference_type_id AND r.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_REFERENCE_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_reference ADD CONSTRAINT reference_type_is_valid_fk CHECK (is_valid_reference_type(type_id)); """ CREATE_IS_VALID_ULABEL_TYPE_FUNCTION = """ CREATE OR REPLACE FUNCTION is_valid_ulabel_type(ulabel_type_id INTEGER) RETURNS BOOLEAN AS $$ BEGIN IF ulabel_type_id IS NULL THEN RETURN TRUE; END IF; RETURN EXISTS ( SELECT 1 FROM lamindb_ulabel u WHERE u.id = ulabel_type_id AND u.is_type ); END; $$ LANGUAGE plpgsql; """ CREATE_IS_VALID_ULABEL_TYPE_CONSTRAINT = """ ALTER TABLE lamindb_ulabel ADD CONSTRAINT ulabel_type_is_valid_fk CHECK (is_valid_ulabel_type(type_id)); """ def apply_constraints(apps, schema_editor): if schema_editor.connection.vendor == "postgresql": schema_editor.execute(CREATE_IS_VALID_RECORD_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_RECORD_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_FEATURE_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_FEATURE_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_SCHEMA_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_SCHEMA_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_PROJECT_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_PROJECT_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_REFERENCE_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_REFERENCE_TYPE_CONSTRAINT) schema_editor.execute(CREATE_IS_VALID_ULABEL_TYPE_FUNCTION) schema_editor.execute(CREATE_IS_VALID_ULABEL_TYPE_CONSTRAINT) class Migration(migrations.Migration): replaces = [ ("lamindb", "0177_squashed"), ("lamindb", "0177_alter_artifactblock_artifact_and_more"), ("lamindb", "0178_v2_2"), ("lamindb", "0179_v2_2_part_2"), ("lamindb", "0180_v2_2_part_3"), ("lamindb", "0181_v2_2_part_4"), ("lamindb", "0182_v2_2_part_5"), ("lamindb", "0183_v2_2_part_6"), ("lamindb", "0184_alter_transformrecord_feature"), ("lamindb", "0185_alter_runrecord_feature"), ("lamindb", "0186_v2_4"), ("lamindb", "0187_v2_4_part_2"), ] dependencies = [] # type: ignore operations = [ migrations.CreateModel( name="Migration", fields=[ ( "id", models.BigAutoField( auto_created=True, primary_key=True, serialize=False, verbose_name="ID", ), 
), ( "app", lamindb.base.fields.CharField( blank=True, default=None, max_length=255 ), ), ( "name", lamindb.base.fields.CharField( blank=True, default=None, max_length=255 ), ), ("applied", lamindb.base.fields.DateTimeField(blank=True)), ], options={ "db_table": "django_migrations", "managed": False, }, ), migrations.CreateModel( name="Branch", fields=[ ("id", models.AutoField(primary_key=True, serialize=False)), ("name", models.CharField(db_index=True, max_length=100)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ], ), migrations.CreateModel( name="Space", fields=[ ("id", models.SmallAutoField(primary_key=True, serialize=False)), ("name", models.CharField(db_index=True, max_length=100)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ], ), migrations.CreateModel( name="Artifact", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=20, unique=True, ), ), ( "key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=1024, null=True, ), ), ( "_real_key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=1024, null=True, ), ), ( "description", lamindb.base.fields.TextField( blank=True, db_index=True, default=None, null=True ), ), ( "suffix", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=30, ), ), ( "kind", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=20, null=True, ), ), ( "otype", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=64, null=True, ), ), ( "size", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, editable=False, null=True, ), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=22, 
null=True, ), ), ( "n_files", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, editable=False, null=True, ), ), ( "n_observations", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, editable=False, null=True, ), ), ( "_hash_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=30, null=True, ), ), ( "_key_is_virtual", lamindb.base.fields.BooleanField(blank=True, default=None), ), ( "_overwrite_versions", lamindb.base.fields.BooleanField(blank=True, default=None), ), ( "_actions", models.ManyToManyField( related_name="_action_targets", to="lamindb.artifact" ), ), ], options={ "abstract": False, }, ), migrations.CreateModel( name="ArtifactArtifact", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.artifact", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_value", to="lamindb.artifact", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="artifacts", field=models.ManyToManyField( related_name="linked_by_artifacts", through="lamindb.ArtifactArtifact", to="lamindb.artifact", ), ), migrations.CreateModel( name="Block", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ("key", models.CharField(db_index=True, max_length=1024, null=True)), ( "anchor", models.ForeignKey( null=True, on_delete=django.db.models.deletion.PROTECT, related_name="children", to="lamindb.block", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ], ), migrations.CreateModel( name="BlockProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), 
("id", models.BigAutoField(primary_key=True, serialize=False)), ( "block", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.block", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AddField( model_name="artifact", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.CreateModel( name="Collection", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_20, editable=False, max_length=20, unique=True, ), ), ( "key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=22, null=True, ), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "_actions", models.ManyToManyField(related_name="+", to="lamindb.artifact"), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "meta_artifact", lamindb.base.fields.OneToOneField( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_meta_of_collection", to="lamindb.artifact", ), ), ], options={ "abstract": False, }, ), migrations.CreateModel( name="CollectionArtifact", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.artifact", ), ), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.collection", ), ), ], 
bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="collection", name="artifacts", field=models.ManyToManyField( related_name="collections", through="lamindb.CollectionArtifact", to="lamindb.artifact", ), ), migrations.CreateModel( name="CollectionProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.collection", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="CollectionReference", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.collection", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="Feature", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150 ), ), ( "_dtype_str", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "unit", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ("array_rank", models.SmallIntegerField(db_index=True, default=0)), ("array_size", models.IntegerField(db_index=True, default=0)), ( "array_shape", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "synonyms", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "default_value", lamindb.base.fields.JSONField(blank=True, default=None, null=True), ), ( "nullable", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "coerce", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "type", lamindb.base.fields.ForeignKey( 
blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="features", to="lamindb.feature", ), ), ], options={ "abstract": False, }, bases=(lamindb.models.can_curate.CanCurate, models.Model), ), migrations.CreateModel( name="CollectionRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.collection", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collectionrecord", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactRun", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_run", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactrun", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactReference", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactreference", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactrecord", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactproject", 
to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifactartifact", name="feature", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactartifact", to="lamindb.feature", ), ), migrations.CreateModel( name="FeatureProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.feature", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="JsonValue", fields=[ ( "id", models.BigAutoField( auto_created=True, primary_key=True, serialize=False, verbose_name="ID", ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("value", models.JSONField()), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=22, null=True, ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="values", to="lamindb.feature", ), ), ], options={ "abstract": False, "base_manager_name": "objects", }, ), migrations.CreateModel( name="ArtifactJsonValue", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_jsonvalue", to="lamindb.artifact", ), ), ( "jsonvalue", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.jsonvalue", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="json_values", field=models.ManyToManyField( related_name="artifacts", through="lamindb.ArtifactJsonValue", to="lamindb.jsonvalue", ), ), migrations.CreateModel( name="Project", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, 
db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "abbr", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32, null=True, ), ), ( "url", lamindb.base.fields.URLField( blank=True, default=None, max_length=255, null=True ), ), ( "start_date", lamindb.base.fields.DateField(blank=True, default=None, null=True), ), ( "end_date", lamindb.base.fields.DateField(blank=True, default=None, null=True), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ( "artifacts", models.ManyToManyField( related_name="projects", through="lamindb.ArtifactProject", to="lamindb.artifact", ), ), ( "blocks", models.ManyToManyField( related_name="projects", through="lamindb.BlockProject", to="lamindb.block", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collections", models.ManyToManyField( related_name="projects", through="lamindb.CollectionProject", to="lamindb.collection", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "features", models.ManyToManyField( related_name="projects", through="lamindb.FeatureProject", to="lamindb.feature", ), ), ( "parents", models.ManyToManyField( related_name="children", to="lamindb.project" ), ), ( "predecessors", models.ManyToManyField( related_name="successors", to="lamindb.project" ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="projects", to="lamindb.project", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.can_curate.CanCurate, models.Model, lamindb.models.sqlrecord.ValidateFields, ), ), migrations.AddField( model_name="featureproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_feature", to="lamindb.project", ), ), migrations.AddField( model_name="collectionproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.project", ), ), migrations.CreateModel( name="BranchProject", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.branch", ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.project", ), ), ], options={ "unique_together": {("branch", "project")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="projects", field=models.ManyToManyField( related_name="branches", through="lamindb.BranchProject", to="lamindb.project", ), ), migrations.AddField( model_name="blockproject", name="project", 
field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_block", to="lamindb.project", ), ), migrations.AddField( model_name="artifactproject", name="project", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.project", ), ), migrations.CreateModel( name="Record", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=16, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ("extra_data", models.JSONField(null=True)), ( "artifacts", models.ManyToManyField( related_name="records", through="lamindb.ArtifactRecord", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collections", models.ManyToManyField( related_name="records", through="lamindb.CollectionRecord", to="lamindb.collection", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "parents", models.ManyToManyField( related_name="children", to="lamindb.record" ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="records", to="lamindb.record", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.has_parents.HasParents, lamindb.models.can_curate.CanCurate, models.Model, ), ), migrations.CreateModel( name="ProjectRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_projectrecord", to="lamindb.feature", ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.project", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.record", ), ), ], 
bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="records", field=models.ManyToManyField( related_name="projects", through="lamindb.ProjectRecord", to="lamindb.record", ), ), migrations.AddField( model_name="collectionrecord", name="record", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.record", ), ), migrations.AddField( model_name="artifactrecord", name="record", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.record", ), ), migrations.CreateModel( name="RecordArtifact", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordartifact", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_artifact", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.artifact", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="artifact", name="linked_in_records", field=models.ManyToManyField( related_name="linked_artifacts", through="lamindb.RecordArtifact", to="lamindb.record", ), ), migrations.CreateModel( name="RecordCollection", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordcollection", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_collection", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.collection", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="collection", name="linked_in_records", field=models.ManyToManyField( related_name="linked_collections", through="lamindb.RecordCollection", to="lamindb.record", ), ), migrations.CreateModel( name="RecordProject", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordproject", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_project", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.project", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="project", name="linked_in_records", field=models.ManyToManyField( related_name="linked_projects", through="lamindb.RecordProject", to="lamindb.record", ), ), migrations.CreateModel( name="RecordRecord", fields=[ ("id", 
models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordrecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_record", to="lamindb.record", ), ), ( "value", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.record", ), ), ], options={ "unique_together": {("record", "feature", "value")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="record", name="linked_records", field=models.ManyToManyField( related_name="linked_in_records", through="lamindb.RecordRecord", to="lamindb.record", ), ), migrations.CreateModel( name="RecordReference", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordreference", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_reference", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordRun", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordrun", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_run", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordTransform", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordtransform", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_transform", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordULabel", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordulabel", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_ulabel", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="RecordUser", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recorduser", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_user", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="Reference", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", 
lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "abbr", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32, null=True, ), ), ( "url", lamindb.base.fields.URLField(blank=True, db_index=True, null=True), ), ( "pubmed_id", lamindb.base.fields.BigIntegerField( blank=True, db_index=True, default=None, null=True ), ), ( "doi", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, validators=[ django.core.validators.RegexValidator( message="Must be a DOI (e.g., 10.1000/xyz123 or https://doi.org/10.1000/xyz123)", regex="^(?:https?://(?:dx\\.)?doi\\.org/|doi:|DOI:)?10\\.\\d+/.*$", ) ], ), ), ( "text", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "date", lamindb.base.fields.DateField(blank=True, default=None, null=True), ), ( "artifacts", models.ManyToManyField( related_name="references", through="lamindb.ArtifactReference", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collections", models.ManyToManyField( related_name="references", through="lamindb.CollectionReference", to="lamindb.collection", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_references", through="lamindb.RecordReference", to="lamindb.record", ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="references", to="lamindb.reference", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.can_curate.CanCurate, models.Model, lamindb.models.sqlrecord.ValidateFields, ), ), migrations.AddField( model_name="recordreference", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.reference", ), ), migrations.AddField( model_name="project", name="references", field=models.ManyToManyField( related_name="projects", to="lamindb.reference" ), ), migrations.AddField( model_name="collectionreference", name="reference", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.reference", ), ), migrations.AddField( model_name="artifactreference", name="reference", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", 
to="lamindb.reference", ), ), migrations.CreateModel( name="ReferenceRecord", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_referencerecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.record", ), ), ( "reference", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.reference", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="reference", name="records", field=models.ManyToManyField( related_name="references", through="lamindb.ReferenceRecord", to="lamindb.record", ), ), migrations.CreateModel( name="Run", fields=[ ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "entrypoint", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "started_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "finished_at", lamindb.base.fields.DateTimeField( blank=True, db_index=True, default=None, null=True ), ), ("params", models.JSONField(null=True)), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "cli_args", lamindb.base.fields.CharField( blank=True, default=None, max_length=1024, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_is_consecutive", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "_status_code", models.SmallIntegerField(db_default=-3, db_index=True, default=-3), ), ( "artifacts", models.ManyToManyField( related_name="runs", through="lamindb.ArtifactRun", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "environment", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, 
on_delete=django.db.models.deletion.PROTECT, related_name="_environment_of", to="lamindb.artifact", ), ), ( "initiated_by_run", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="initiated_runs", to="lamindb.run", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_runs", through="lamindb.RecordRun", to="lamindb.record", ), ), ( "plan", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_plan_for_runs", to="lamindb.artifact", ), ), ( "report", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="_report_of", to="lamindb.artifact", ), ), ], ), migrations.AddField( model_name="referencerecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="reference", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="recordrun", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.run", ), ), migrations.AddField( model_name="record", name="input_of_runs", field=models.ManyToManyField( related_name="input_records", to="lamindb.run" ), ), migrations.AddField( model_name="record", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, editable=False, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="output_records", to="lamindb.run", ), ), migrations.AddField( model_name="projectrecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="project", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="jsonvalue", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="featureproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="feature", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionreference", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionrecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, 
default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collectionartifact", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="collection", name="input_of_runs", field=models.ManyToManyField( related_name="input_collections", to="lamindb.run" ), ), migrations.AddField( model_name="collection", name="recreating_runs", field=models.ManyToManyField( related_name="recreated_collections", to="lamindb.run" ), ), migrations.AddField( model_name="collection", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="output_collections", to="lamindb.run", ), ), migrations.AddField( model_name="blockproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.CreateModel( name="ArtifactUser", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_user", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactuser", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.CreateModel( name="ArtifactULabel", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifactulabel", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifactrun", name="run", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_artifact", to="lamindb.run", ), ), migrations.AddField( model_name="artifactreference", name="run", field=lamindb.base.fields.ForeignKey( blank=True, 
default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactrecord", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactproject", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactjsonvalue", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifactartifact", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), migrations.AddField( model_name="artifact", name="input_of_runs", field=models.ManyToManyField( related_name="input_artifacts", to="lamindb.run" ), ), migrations.AddField( model_name="artifact", name="recreating_runs", field=models.ManyToManyField( related_name="recreated_artifacts", to="lamindb.run" ), ), migrations.AddField( model_name="artifact", name="run", field=lamindb.base.fields.ForeignKey( blank=True, default=None, editable=False, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="output_artifacts", to="lamindb.run", ), ), migrations.CreateModel( name="RunArtifact", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_run", to="lamindb.artifact", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_runartifact", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_artifact", to="lamindb.run", ), ), ], options={ "unique_together": {("run", "artifact", "feature")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="run", name="linked_artifacts", field=models.ManyToManyField( related_name="linked_by_runs", through="lamindb.RunArtifact", to="lamindb.artifact", ), ), migrations.CreateModel( name="RunJsonValue", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "jsonvalue", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.jsonvalue", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_jsonvalue", to="lamindb.run", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="run", name="json_values", field=models.ManyToManyField( related_name="runs", through="lamindb.RunJsonValue", to="lamindb.jsonvalue", ), ), migrations.CreateModel( name="RunProject", fields=[ ("id", models.BigAutoField(primary_key=True, 
serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.run", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="project", name="runs", field=models.ManyToManyField( related_name="projects", through="lamindb.RunProject", to="lamindb.run" ), ), migrations.CreateModel( name="RunRecord", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_runrecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.record", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.run", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="record", name="runs", field=models.ManyToManyField( related_name="records", through="lamindb.RunRecord", to="lamindb.run" ), ), migrations.CreateModel( name="Schema", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=16, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "n_members", lamindb.base.fields.IntegerField( blank=True, default=None, null=True ), ), ( "coerce", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "flexible", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "itype", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=120, null=True, ), ), ( "otype", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=64, null=True, ), ), ( "_dtype_str", lamindb.base.fields.CharField( blank=True, default=None, editable=False, max_length=64, null=True, ), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=22, null=True, ), ), ( "minimal_set", lamindb.base.fields.BooleanField( 
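# minimal_set / ordered_set / maximal_set flag the set semantics of a Schema's
# members (presumably: members are required, member order matters, and no members
# beyond the set are allowed, respectively).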
blank=True, db_index=True, default=True, editable=False ), ), ( "ordered_set", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=False, editable=False ), ), ( "maximal_set", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=False, editable=False ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="schemas", to="lamindb.schema", ), ), ], options={ "abstract": False, }, bases=(lamindb.models.can_curate.CanCurate, models.Model), ), migrations.AddField( model_name="record", name="schema", field=lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="records", to="lamindb.schema", ), ), migrations.CreateModel( name="ArtifactSchema", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "slot", lamindb.base.fields.CharField( blank=True, default=None, max_length=255, null=True ), ), ( "feature_ref_is_semantic", lamindb.base.fields.BooleanField( blank=True, default=None, null=True ), ), ( "artifact", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="_links_schema", to="lamindb.artifact", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "schema", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="_links_artifact", to="lamindb.schema", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="artifact", name="schema", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="validated_artifacts", to="lamindb.schema", ), ), migrations.AddField( model_name="artifact", name="schemas", field=models.ManyToManyField( related_name="artifacts", through="lamindb.ArtifactSchema", to="lamindb.schema", ), ), migrations.CreateModel( name="SchemaComponent", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "slot", lamindb.base.fields.CharField( blank=True, default=None, max_length=255, null=True ), ), ( "component", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_composite", to="lamindb.schema", ), ), ( "composite", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_component", to="lamindb.schema", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, 
default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="schema", name="components", field=models.ManyToManyField( related_name="composites", through="lamindb.SchemaComponent", to="lamindb.schema", ), ), migrations.CreateModel( name="SchemaFeature", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_schema", to="lamindb.feature", ), ), ( "schema", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_feature", to="lamindb.schema", ), ), ], options={ "unique_together": {("schema", "feature")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="feature", name="schemas", field=models.ManyToManyField( related_name="features", through="lamindb.SchemaFeature", to="lamindb.schema", ), ), migrations.CreateModel( name="SchemaProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_schema", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "schema", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.schema", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="schemas", field=models.ManyToManyField( related_name="projects", through="lamindb.SchemaProject", to="lamindb.schema", ), ), migrations.AddField( model_name="schema", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="run", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="reference", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="record", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="project", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="jsonvalue", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="feature", name="space", field=lamindb.base.fields.ForeignKey( blank=True, 
db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="collection", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="branch", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.AddField( model_name="artifact", name="space", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), migrations.CreateModel( name="Storage", fields=[ ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_12, editable=False, max_length=12, unique=True, ), ), ( "root", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, unique=True, ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30 ), ), ( "region", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=64, null=True, ), ), ( "instance_uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=12, null=True, ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ], options={ "abstract": False, }, ), migrations.AddField( model_name="artifact", name="storage", field=lamindb.base.fields.ForeignKey( blank=True, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="artifacts", to="lamindb.storage", ), ), migrations.CreateModel( name="Transform", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ( "is_locked", lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ("id", 
models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=16, unique=True, ), ), ( "key", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=1024 ), ), ( "description", lamindb.base.fields.TextField( blank=True, db_index=True, default=None, null=True ), ), ( "kind", lamindb.base.fields.CharField( blank=True, db_index=True, default="pipeline", max_length=20 ), ), ( "source_code", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "hash", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=22, null=True, ), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "environment", models.ForeignKey( null=True, on_delete=django.db.models.deletion.CASCADE, related_name="_environment_of_transforms", to="lamindb.artifact", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_transforms", through="lamindb.RecordTransform", to="lamindb.record", ), ), ( "plan", models.ForeignKey( default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="_plan_for_transforms", to="lamindb.artifact", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ], options={ "abstract": False, }, ), migrations.AddField( model_name="run", name="transform", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="runs", to="lamindb.transform", ), ), migrations.AddField( model_name="recordtransform", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_in_record", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.transform", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, 
models.Model), ), migrations.AddField( model_name="project", name="transforms", field=models.ManyToManyField( related_name="projects", through="lamindb.TransformProject", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformRecord", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), editable=False, ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transformrecord", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.record", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_record", to="lamindb.transform", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="record", name="transforms", field=models.ManyToManyField( related_name="records", through="lamindb.TransformRecord", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformReference", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "reference", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.reference", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_reference", to="lamindb.transform", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="reference", name="transforms", field=models.ManyToManyField( related_name="references", through="lamindb.TransformReference", to="lamindb.transform", ), ), migrations.CreateModel( name="TransformTransform", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ("config", models.JSONField(default=None, null=True)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), editable=False, ), ), ( "predecessor", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_successor", to="lamindb.transform", ), ), ( "successor", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_predecessor", to="lamindb.transform", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="transform", name="predecessors", field=models.ManyToManyField( related_name="successors", through="lamindb.TransformTransform", to="lamindb.transform", ), ), migrations.CreateModel( name="ULabel", fields=[ ( "is_type", lamindb.base.fields.BooleanField( blank=True, db_default=False, db_index=True, default=False ), ), ( "is_locked", 
lamindb.base.fields.BooleanField( blank=True, db_default=False, default=False ), ), ( "_aux", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None, null=True ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=lamindb.base.uids.base62_8, editable=False, max_length=8, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150 ), ), ( "description", lamindb.base.fields.TextField(blank=True, default=None, null=True), ), ( "reference", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=255, null=True, ), ), ( "reference_type", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=25, null=True, ), ), ( "artifacts", models.ManyToManyField( related_name="ulabels", through="lamindb.ArtifactULabel", to="lamindb.artifact", ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_ulabels", through="lamindb.RecordULabel", to="lamindb.record", ), ), ( "parents", models.ManyToManyField( related_name="children", to="lamindb.ulabel" ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "space", lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.space", ), ), ( "type", lamindb.base.fields.ForeignKey( blank=True, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="ulabels", to="lamindb.ulabel", ), ), ], options={ "abstract": False, }, bases=( lamindb.models.has_parents.HasParents, lamindb.models.can_curate.CanCurate, models.Model, ), ), migrations.CreateModel( name="TransformULabel", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "transform", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.transform", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_transform", to="lamindb.ulabel", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="transform", name="ulabels", field=models.ManyToManyField( related_name="transforms", through="lamindb.TransformULabel", to="lamindb.ulabel", ), ), migrations.CreateModel( 
name="RunULabel", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.run", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_run", to="lamindb.ulabel", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="run", name="ulabels", field=models.ManyToManyField( related_name="runs", through="lamindb.RunULabel", to="lamindb.ulabel" ), ), migrations.AddField( model_name="recordulabel", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.ulabel", ), ), migrations.CreateModel( name="CollectionULabel", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "collection", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.collection", ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collectionulabel", to="lamindb.feature", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_collection", to="lamindb.ulabel", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="collection", name="ulabels", field=models.ManyToManyField( related_name="collections", through="lamindb.CollectionULabel", to="lamindb.ulabel", ), ), migrations.CreateModel( name="BranchULabel", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_ulabel", to="lamindb.branch", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.ulabel", ), ), ], options={ "unique_together": {("branch", "ulabel")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="branch", name="ulabels", field=models.ManyToManyField( related_name="branches", through="lamindb.BranchULabel", to="lamindb.ulabel", ), ), migrations.AddField( model_name="artifactulabel", name="ulabel", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.ulabel", ), ), migrations.CreateModel( name="ULabelProject", fields=[ ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, 
related_name="links_ulabel", to="lamindb.project", ), ), ( "run", lamindb.base.fields.ForeignKey( blank=True, default=lamindb.models.run.current_run, null=True, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.run", ), ), ( "ulabel", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_project", to="lamindb.ulabel", ), ), ], bases=(lamindb.models.sqlrecord.IsLink, models.Model), ), migrations.AddField( model_name="project", name="ulabels", field=models.ManyToManyField( related_name="projects", through="lamindb.ULabelProject", to="lamindb.ulabel", ), ), migrations.CreateModel( name="User", fields=[ ("id", models.AutoField(primary_key=True, serialize=False)), ( "uid", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, editable=False, max_length=8, unique=True, ), ), ( "handle", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, unique=True, ), ), ( "name", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=150, null=True, ), ), ( "created_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "updated_at", lamindb.base.fields.DateTimeField( blank=True, db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "artifacts", models.ManyToManyField( related_name="users", through="lamindb.ArtifactUser", to="lamindb.artifact", through_fields=("user", "artifact"), ), ), ( "linked_in_records", models.ManyToManyField( related_name="linked_users", through="lamindb.RecordUser", to="lamindb.record", ), ), ], bases=(models.Model, lamindb.models.can_curate.CanCurate), ), migrations.AddField( model_name="ulabelproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ULabelBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "ulabel", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.ulabel", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, 
related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="ulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformtransform", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformreference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="transformproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="TransformBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ("line_number", models.IntegerField(null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "transform", models.ForeignKey( null=True, on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.transform", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="transform", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="created_transforms", to="lamindb.user", ), ), migrations.AddField( model_name="storage", 
name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="SpaceBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "space", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.space", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="space", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="schemaproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="schemacomponent", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="SchemaBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, 
on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "schema", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.schema", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="schema", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="runjsonvalue", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="RunBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "run", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.run", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="run", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.CASCADE, 
related_name="created_runs", to="lamindb.user", ), ), migrations.AddField( model_name="referencerecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="reference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="recorduser", name="value", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_record", to="lamindb.user", ), ), migrations.CreateModel( name="RecordBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "record", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.record", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="record", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ProjectUser", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "role", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32 ), ), ( "project", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_user", to="lamindb.project", ), ), ( "user", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_project", to="lamindb.user", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AddField( model_name="projectrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ProjectBlock", fields=[ ( "version_tag", 
lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "project", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.project", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="project", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="project", name="users", field=models.ManyToManyField( related_name="projects", through="lamindb.ProjectUser", to="lamindb.user", ), ), migrations.AddField( model_name="jsonvalue", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="featureproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="FeatureBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", 
models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "feature", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.feature", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="feature", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionreference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collectionproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="CollectionBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "collection", models.ForeignKey( null=True, on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.collection", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="collectionartifact", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, 
on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="collection", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="BranchUser", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "role", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=32 ), ), ( "branch", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="links_user", to="lamindb.branch", ), ), ( "user", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_branch", to="lamindb.user", ), ), ], bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.CreateModel( name="BranchBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "branch", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.branch", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="branch", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="branch", name="users", field=models.ManyToManyField( related_name="branches", through="lamindb.BranchUser", to="lamindb.user" ), ), migrations.AddField( model_name="blockproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="block", name="created_by", field=models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactuser", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactuser", name="user", field=lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_artifact", to="lamindb.user", ), 
), migrations.AddField( model_name="artifactulabel", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactschema", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactrun", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactreference", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactrecord", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactproject", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifactjsonvalue", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.CreateModel( name="ArtifactBlock", fields=[ ( "version_tag", lamindb.base.fields.CharField( blank=True, db_index=True, default=None, max_length=30, null=True, ), ), ( "is_latest", lamindb.base.fields.BooleanField( blank=True, db_index=True, default=True ), ), ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "uid", models.CharField( db_index=True, default=lamindb.base.uids.base62_16, editable=False, max_length=20, unique=True, ), ), ("content", models.TextField()), ("hash", models.CharField(db_index=True, max_length=22, null=True)), ( "kind", models.CharField( db_default="readme", db_index=True, default="readme", max_length=22, ), ), ( "created_at", models.DateTimeField( db_default=django.db.models.functions.datetime.Now(), db_index=True, editable=False, ), ), ( "_status_code", models.SmallIntegerField(db_default=0, db_index=True, default=0), ), ("_aux", models.JSONField(db_default=None, default=None, null=True)), ( "artifact", models.ForeignKey( on_delete=django.db.models.deletion.CASCADE, related_name="ablocks", to="lamindb.artifact", ), ), ( "branch", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_on", models.ForeignKey( db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ( "created_by", models.ForeignKey( default=lamindb.base.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), ], ), migrations.AddField( model_name="artifactartifact", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, 
default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.user", ), ), migrations.AddField( model_name="artifact", name="created_by", field=lamindb.base.fields.ForeignKey( blank=True, default=lamindb.base.users.current_user_id, editable=False, on_delete=django.db.models.deletion.PROTECT, related_name="created_artifacts", to="lamindb.user", ), ), migrations.CreateModel( name="RecordJson", fields=[ ("id", models.BigAutoField(primary_key=True, serialize=False)), ( "value", lamindb.base.fields.JSONField( blank=True, db_default=None, default=None ), ), ( "feature", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.PROTECT, related_name="links_recordjson", to="lamindb.feature", ), ), ( "record", lamindb.base.fields.ForeignKey( blank=True, on_delete=django.db.models.deletion.CASCADE, related_name="values_json", to="lamindb.record", ), ), ], options={ "unique_together": {("record", "feature")}, }, bases=(models.Model, lamindb.models.sqlrecord.IsLink), ), migrations.AlterUniqueTogether( name="recordreference", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="recordrun", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="recordtransform", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="recordulabel", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="ulabelproject", unique_together={("ulabel", "project")}, ), migrations.AlterUniqueTogether( name="transformulabel", unique_together={("transform", "ulabel")}, ), migrations.AlterUniqueTogether( name="transformtransform", unique_together={("successor", "predecessor")}, ), migrations.AlterUniqueTogether( name="transformreference", unique_together={("transform", "reference")}, ), migrations.AlterUniqueTogether( name="transformrecord", unique_together={("transform", "record", "feature")}, ), migrations.AlterUniqueTogether( name="transformproject", unique_together={("transform", "project")}, ), migrations.AlterUniqueTogether( name="transform", unique_together={("key", "hash")}, ), migrations.AddConstraint( model_name="space", constraint=models.UniqueConstraint( django.db.models.functions.text.Lower("name"), name="unique_space_name_lower", ), ), migrations.AlterUniqueTogether( name="schemaproject", unique_together={("schema", "project")}, ), migrations.AlterUniqueTogether( name="schemacomponent", unique_together={("composite", "slot"), ("composite", "slot", "component")}, ), migrations.AlterUniqueTogether( name="runulabel", unique_together={("run", "ulabel")}, ), migrations.AlterUniqueTogether( name="runrecord", unique_together={("run", "record", "feature")}, ), migrations.AlterUniqueTogether( name="runproject", unique_together={("run", "project")}, ), migrations.AlterUniqueTogether( name="runjsonvalue", unique_together={("run", "jsonvalue")}, ), migrations.AlterUniqueTogether( name="referencerecord", unique_together={("reference", "feature", "record")}, ), migrations.AlterUniqueTogether( name="recorduser", unique_together={("record", "feature", "value")}, ), migrations.AlterUniqueTogether( name="projectuser", unique_together={("project", "user", "role")}, ), migrations.AlterUniqueTogether( name="projectrecord", unique_together={("project", "feature", "record")}, ), migrations.AlterUniqueTogether( name="jsonvalue", unique_together={("feature", "hash")}, ), migrations.AlterUniqueTogether( 
name="featureproject", unique_together={("feature", "project")}, ), migrations.AddConstraint( model_name="feature", constraint=models.CheckConstraint( condition=models.Q( ("is_type", True), ("_dtype_str__isnull", False), _connector="OR" ), name="feature_dtype_str_not_null_when_is_type_false", ), ), migrations.AlterUniqueTogether( name="collectionulabel", unique_together={("collection", "ulabel")}, ), migrations.AlterUniqueTogether( name="collectionreference", unique_together={("collection", "reference")}, ), migrations.AlterUniqueTogether( name="collectionrecord", unique_together={("collection", "record", "feature")}, ), migrations.AlterUniqueTogether( name="collectionproject", unique_together={("collection", "project")}, ), migrations.AlterUniqueTogether( name="collectionartifact", unique_together={("collection", "artifact")}, ), migrations.AddConstraint( model_name="collection", constraint=models.UniqueConstraint( fields=("key", "hash"), name="unique_collection_key_hash_not_null" ), ), migrations.AlterUniqueTogether( name="branchuser", unique_together={("branch", "user", "role")}, ), migrations.AddConstraint( model_name="branch", constraint=models.UniqueConstraint( django.db.models.functions.text.Lower("name"), name="unique_branch_name_lower", ), ), migrations.AlterUniqueTogether( name="blockproject", unique_together={("block", "project")}, ), migrations.AlterUniqueTogether( name="artifactuser", unique_together={("artifact", "user", "feature")}, ), migrations.AlterUniqueTogether( name="artifactulabel", unique_together={("artifact", "ulabel", "feature")}, ), migrations.AlterUniqueTogether( name="artifactschema", unique_together={("artifact", "schema"), ("artifact", "slot")}, ), migrations.AlterUniqueTogether( name="artifactrun", unique_together={("artifact", "run", "feature")}, ), migrations.AlterUniqueTogether( name="artifactreference", unique_together={("artifact", "reference", "feature")}, ), migrations.AlterUniqueTogether( name="artifactrecord", unique_together={("artifact", "record", "feature")}, ), migrations.AlterUniqueTogether( name="artifactproject", unique_together={("artifact", "project", "feature")}, ), migrations.AlterUniqueTogether( name="artifactjsonvalue", unique_together={("artifact", "jsonvalue")}, ), migrations.AlterUniqueTogether( name="artifactartifact", unique_together={("artifact", "value", "feature")}, ), migrations.AddConstraint( model_name="artifact", constraint=models.UniqueConstraint( condition=models.Q(("key__isnull", False)), fields=("storage", "key", "hash"), name="unique_artifact_storage_key_hash_not_null", ), ), migrations.AddConstraint( model_name="artifact", constraint=models.UniqueConstraint( condition=models.Q(("key__isnull", True)), fields=("storage", "hash"), name="unique_artifact_storage_hash_null_key", ), ), migrations.RunPython(apply_constraints), ] if connection.vendor == "postgresql": Migration.operations += [ pgtrigger.migrations.AddTrigger( model_name="ulabel", trigger=pgtrigger.compiler.Trigger( name="prevent_ulabel_type_cycle", sql=pgtrigger.compiler.UpsertTriggerSql( condition="WHEN (NEW.type_id IS NOT NULL)", func="\n -- Check for direct self-reference\n IF NEW.type_id = NEW.id THEN\n RAISE EXCEPTION 'Cannot set type: ulabel cannot be its own type';\n END IF;\n\n -- Check for cycles in the type chain\n IF EXISTS (\n WITH RECURSIVE type_chain AS (\n SELECT type_id, 1 as depth\n FROM lamindb_ulabel\n WHERE id = NEW.type_id\n\n UNION ALL\n\n SELECT r.type_id, tc.depth + 1\n FROM lamindb_ulabel r\n INNER JOIN type_chain tc ON r.id = 
tc.type_id\n WHERE tc.depth < 100\n )\n SELECT 1 FROM type_chain WHERE type_id = NEW.id\n ) THEN\n RAISE EXCEPTION 'Cannot set type: would create a cycle';\n END IF;\n\n RETURN NEW;\n ", hash="53487a8e36a64748418457f7229de6d5cf31e6bd", operation="UPDATE OR INSERT", pgid="pgtrigger_prevent_ulabel_type_cycle_863ae", table="lamindb_ulabel", when="BEFORE", ), ), ), pgtrigger.migrations.AddTrigger( model_name="record", trigger=pgtrigger.compiler.Trigger( name="prevent_record_type_cycle", sql=pgtrigger.compiler.UpsertTriggerSql( condition="WHEN (NEW.type_id IS NOT NULL)", func="\n -- Check for direct self-reference\n IF NEW.type_id = NEW.id THEN\n RAISE EXCEPTION 'Cannot set type: record cannot be its own type';\n END IF;\n\n -- Check for cycles in the type chain\n IF EXISTS (\n WITH RECURSIVE type_chain AS (\n SELECT type_id, 1 as depth\n FROM lamindb_record\n WHERE id = NEW.type_id\n\n UNION ALL\n\n SELECT r.type_id, tc.depth + 1\n FROM lamindb_record r\n INNER JOIN type_chain tc ON r.id = tc.type_id\n WHERE tc.depth < 100\n )\n SELECT 1 FROM type_chain WHERE type_id = NEW.id\n ) THEN\n RAISE EXCEPTION 'Cannot set type: would create a cycle';\n END IF;\n\n RETURN NEW;\n ", hash="deaab832a066dfec76228f5b7a62a08f334876a9", operation="UPDATE OR INSERT", pgid="pgtrigger_prevent_record_type_cycle_56c18", table="lamindb_record", when="BEFORE", ), ), ), pgtrigger.migrations.AddTrigger( model_name="feature", trigger=pgtrigger.compiler.Trigger( name="update_feature_on_name_change", sql=pgtrigger.compiler.UpsertTriggerSql( condition="WHEN (OLD.name IS DISTINCT FROM NEW.name)", func="DECLARE\n old_renamed JSONB;\n new_renamed JSONB;\n ts TEXT;\nBEGIN\n -- Only proceed if name actually changed\n IF OLD.name IS DISTINCT FROM NEW.name THEN\n -- Update synonyms\n IF NEW.synonyms IS NULL OR NEW.synonyms = '' THEN\n NEW.synonyms := OLD.name;\n ELSIF position(OLD.name in NEW.synonyms) = 0 THEN\n NEW.synonyms := NEW.synonyms || '|' || OLD.name;\n END IF;\n\n -- Update _aux with rename history\n ts := TO_CHAR(NOW() AT TIME ZONE 'UTC', 'YYYY-MM-DD\"T\"HH24:MI:SS\"Z\"');\n\n -- Get existing renamed history or initialize empty object\n old_renamed := COALESCE((OLD._aux->>'renamed')::JSONB, '{}'::JSONB);\n\n -- Add old name with timestamp\n new_renamed := old_renamed || jsonb_build_object(ts, OLD.name);\n\n -- Update _aux with new renamed history\n IF NEW._aux IS NULL THEN\n NEW._aux := jsonb_build_object('renamed', new_renamed);\n ELSE\n NEW._aux := NEW._aux || jsonb_build_object('renamed', new_renamed);\n END IF;\n END IF;\n\n RETURN NEW;\nEND;\n", hash="5f2e7a65e42c34b0455f0840def52f078726e401", operation="UPDATE", pgid="pgtrigger_update_feature_on_name_change_6c32d", table="lamindb_feature", when="BEFORE", ), ), ), ] ================================================ FILE: lamindb/migrations/0187_v2_4_part_2.py ================================================ # Generated by Django 5.2 on 2026-04-16 06:38 import django.db.models.deletion from django.db import migrations import lamindb.base.fields class Migration(migrations.Migration): dependencies = [ ("lamindb", "0186_v2_4"), ] operations = [ migrations.RemoveField( model_name="branchblock", name="created_on", ), migrations.AlterField( model_name="block", name="branch", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), migrations.AlterField( model_name="block", name="created_on", field=lamindb.base.fields.ForeignKey( blank=True, db_default=1, 
default=1, on_delete=django.db.models.deletion.PROTECT, related_name="+", to="lamindb.branch", ), ), ] ================================================ FILE: lamindb/migrations/README.md ================================================ # Attention Remember that lamindb schema changes that do not work on old databases (like adding columns or tables) cannot be deployed to cloud functions unless these instances are migrated. ================================================ FILE: lamindb/migrations/__init__.py ================================================ ================================================ FILE: lamindb/models/__init__.py ================================================ """Auxiliary models & database library. Registry basics --------------- .. autoclass:: BaseSQLRecord .. autoclass:: SQLRecord .. autoclass:: Registry .. autoclass:: BasicQuerySet .. autoclass:: QuerySet Mixins for registries --------------------- .. autoclass:: IsVersioned .. autoclass:: HasType .. autoclass:: HasParents .. autoclass:: CanCurate .. autoclass:: TracksRun .. autoclass:: TracksUpdates Managers -------- .. autoclass:: FeatureManager .. autoclass:: LabelManager .. autoclass:: QueryManager .. autoclass:: RelatedManager Annotations of objects ---------------------- Artifact, run, collection, annotations can be conditioned on features. Besides linking categorical data, you can also link simple data types by virtue of the `JsonValue` model. .. autoclass:: JsonValue Annotating artifacts. .. autoclass:: ArtifactArtifact .. autoclass:: ArtifactJsonValue .. autoclass:: ArtifactProject .. autoclass:: ArtifactRecord .. autoclass:: ArtifactReference .. autoclass:: ArtifactRun .. autoclass:: ArtifactSchema .. autoclass:: ArtifactULabel .. autoclass:: ArtifactUser Annotating collections. .. autoclass:: CollectionArtifact .. autoclass:: CollectionProject .. autoclass:: CollectionReference .. autoclass:: CollectionULabel .. autoclass:: CollectionRecord Annotating runs. .. autoclass:: RunJsonValue .. autoclass:: RunProject .. autoclass:: RunULabel .. autoclass:: RunRecord Annotating transforms. .. autoclass:: TransformProject .. autoclass:: TransformReference .. autoclass:: TransformULabel Building relationships among transforms. .. autoclass:: TransformTransform Annotating features, blocks, and ulabels with projects. .. autoclass:: FeatureProject .. autoclass:: BlockProject .. autoclass:: ULabelProject .. autoclass:: SchemaProject .. autoclass:: ProjectRecord Building schemas. .. autoclass:: SchemaComponent .. autoclass:: SchemaFeature Annotating references with records. .. autoclass:: ReferenceRecord Record values ------------- Record values work almost exactly like artifact and run annotations, with the exception that JSON values are stored in `RecordJson` on a per-record basis and not in `JsonValue`. .. autoclass:: RecordArtifact .. autoclass:: RecordCollection .. autoclass:: RecordJson .. autoclass:: RecordProject .. autoclass:: RecordRecord .. autoclass:: RecordReference .. autoclass:: RecordRun .. autoclass:: RecordTransform .. autoclass:: RecordULabel .. autoclass:: RecordUser .. autoclass:: TransformRecord Blocks ------ .. autoclass:: BaseBlock .. autoclass:: Block .. autoclass:: ArtifactBlock .. autoclass:: BranchBlock .. autoclass:: CollectionBlock .. autoclass:: FeatureBlock .. autoclass:: ProjectBlock .. autoclass:: RecordBlock .. autoclass:: RunBlock .. autoclass:: SchemaBlock .. autoclass:: SpaceBlock .. autoclass:: TransformBlock .. autoclass:: ULabelBlock Utils ----- .. autoclass:: LazyArtifact .. 
autoclass:: InspectResult .. autoclass:: ValidateFields .. autoclass:: SchemaOptionals .. autoclass:: lamindb.models.query_set.BiontyDB .. autoclass:: lamindb.models.query_set.PertdbDB """ # ruff: noqa: I001 from lamin_utils._inspect import InspectResult from ._is_versioned import IsVersioned from .can_curate import CanCurate from .sqlrecord import ( BaseSQLRecord, SQLRecord, Registry, Space, Branch, Migration, ValidateFields, format_field_value, IsLink, HasType, ) from .storage import Storage from .transform import Transform, TransformTransform from .run import Run, TracksRun, TracksUpdates, current_run, User from .feature import Feature, JsonValue from .schema import Schema from .ulabel import ULabel # should come last as it needs everything else from .artifact import Artifact, LazyArtifact from ._feature_manager import FeatureManager from ._label_manager import LabelManager from .collection import Collection, CollectionArtifact from .project import Project, Reference from .query_manager import RelatedManager, QueryManager from .query_set import BasicQuerySet, QuerySet, DB, SQLRecordList from .artifact_set import ArtifactSet from .has_parents import HasParents from datetime import datetime as _datetime # link models from .artifact import ArtifactJsonValue, ArtifactArtifact, ArtifactUser, ArtifactRun from .project import ( ArtifactProject, ArtifactReference, BlockProject, CollectionProject, CollectionReference, FeatureProject, ProjectRecord, RecordProject, RecordReference, ReferenceRecord, RunProject, SchemaProject, TransformProject, TransformReference, ULabelProject, ) from .run import RunJsonValue from .schema import ( SchemaFeature, ArtifactSchema, SchemaComponent, SchemaOptionals, ) from .ulabel import ArtifactULabel, TransformULabel, RunULabel, CollectionULabel from .record import ( Record, ArtifactRecord, CollectionRecord, RecordArtifact, RecordCollection, RecordJson, RecordRecord, RecordRun, RecordTransform, RecordULabel, RecordUser, RunRecord, TransformRecord, ) from .block import ( BaseBlock, Block, ArtifactBlock, BranchBlock, CollectionBlock, FeatureBlock, ProjectBlock, RecordBlock, RunBlock, SchemaBlock, SpaceBlock, TransformBlock, ULabelBlock, ) FeatureValue = JsonValue # backward compatibility ================================================ FILE: lamindb/models/_describe.py ================================================ from __future__ import annotations import re from types import SimpleNamespace from typing import TYPE_CHECKING, Literal from django.db import connections from django.db.models import Q from lamin_utils import colors, logger from rich.table import Column, Table from rich.text import Text from rich.tree import Tree from lamindb.models import BaseSQLRecord, Branch, Run from ._is_versioned import IsVersioned from .sqlrecord import SQLRecord, format_field_value if TYPE_CHECKING: from lamindb.models import Artifact, Collection, Record, Schema, Transform from .run import TracksRun # Define consistent column widths for use in other modules NAME_WIDTH = 30 TYPE_WIDTH = 35 # types can get long, e.g. 
cat[Record[Treatment]] VALUES_WIDTH = 40 def strip_ansi_from_string(text: str) -> str: """Remove ANSI escape sequences from a string.""" ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") return ansi_escape.sub("", text) def format_rich_tree( tree: Tree, return_str: bool = False, strip_ansi: bool = True ) -> str | None: from rich.console import Console from ..core._context import is_run_from_ipython console = Console(force_terminal=True) printed = False if return_str: from io import StringIO string_io = StringIO() str_console = Console(file=string_io, force_terminal=True) str_console.print(tree) result = string_io.getvalue() if strip_ansi: result = strip_ansi_from_string(result) # rstrip trailing whitespace on every line result = "\n".join(line.rstrip() for line in result.splitlines()) return result try: if not is_run_from_ipython: from IPython import get_ipython from IPython.core.interactiveshell import InteractiveShell from IPython.display import display shell = get_ipython() if isinstance(shell, InteractiveShell): display(tree) printed = True return None except (NameError, ImportError): pass if not printed: # be careful to test this on a terminal console = Console(force_terminal=True) console.print(tree) return None def format_run_title( record: Run | SimpleNamespace | None, transform_key: str | None = None, dim: bool = False, ) -> Text: if record is None: return Text("") display_name = ( Text(record.name, style="cyan3") if record.name is not None else Text(record.uid[:7], style="cyan3") ) if transform_key is None: transform_key = record.transform.key title = Text.assemble( display_name, (" (", "dim"), (transform_key, "cyan3"), (")", "dim"), ) return title def format_title_with_version( record: IsVersioned | SimpleNamespace, ) -> Text: title_str = record.key if record.key is not None else "" title = Text.assemble( (title_str, "cyan3"), (f" ({record.version})", "dim"), Text.assemble(("\n| description: ", "dim"), record.description) if record.description else Text(""), ) return title def describe_header(record: BaseSQLRecord) -> Tree: if isinstance(record, IsVersioned) and not record.is_latest: logger.warning( f"This is not the latest version of the {record.__class__.__name__}." 
) if isinstance(record, SQLRecord): if record.branch_id == 0: logger.warning("This artifact is archived.") elif record.branch_id == -1: logger.warning("This artifact is in the trash.") if isinstance(record, Run): title = format_run_title(record, dim=True) # dim makes the uid grey elif isinstance(record, IsVersioned) or isinstance(record, SimpleNamespace): title = format_title_with_version(record) else: display_field = ( record._name_field if hasattr(record, "_name_field") else "name" if hasattr(record, "name") else "" ) display_value = getattr(record, display_field, None) if display_field else None if display_value in (None, ""): display_value = record.uid[:7] if hasattr(record, "uid") else "" title = Text.assemble( ( str(display_value), "cyan3", ) ) tree = Tree( Text.assemble( (f"{record.__class__.__name__}: ", "bold"), title, ), guide_style="dim", # dim the connecting lines ) return tree def format_bytes(bytes_value): """Convert bytes to human readable format.""" if bytes_value < 1024: return f"{bytes_value} B" elif bytes_value < 1024**2: return f"{bytes_value / 1024:.1f} KB" elif bytes_value < 1024**3: return f"{bytes_value / (1024**2):.1f} MB" elif bytes_value < 1024**4: return f"{bytes_value / (1024**3):.1f} GB" else: return f"{bytes_value / (1024**4):.1f} TB" def append_uid_run(record: TracksRun, two_column_items: list, fk_data=None) -> None: if fk_data and "run" in fk_data and fk_data["run"] and fk_data["run"]["id"]: run, transform_key = ( SimpleNamespace(**fk_data["run"]), fk_data["run"]["transform_key"], ) elif record.run is not None: run, transform_key = record.run, record.run.transform.key else: run, transform_key = None, None text_uid = Text.assemble(("uid: ", "dim"), f"{record.uid}") text_run = Text.assemble( ("run: ", "dim"), format_run_title(run, transform_key=transform_key) ) two_column_items.append(text_uid) two_column_items.append(text_run) def append_branch_space_created_at_created_by( record: SQLRecord, two_column_items, fk_data=None ): # branch branch_name = fk_data["branch"]["name"] if fk_data else record.branch.name two_column_items.append(Text.assemble(("branch: ", "dim"), branch_name)) # space space_name = fk_data["space"]["name"] if fk_data else record.space.name two_column_items.append(Text.assemble(("space: ", "dim"), space_name)) # created_at two_column_items.append( Text.assemble(("created_at: ", "dim"), format_field_value(record.created_at)) ) # created_by / "name" in fk_data holds handle, is display name created_by_handle = ( fk_data["created_by"]["name"] if fk_data else record.created_by.handle ) two_column_items.append(Text.assemble(("created_by: ", "dim"), created_by_handle)) def add_two_column_items_to_tree(tree: Tree, two_column_items: list) -> None: table = Table( Column("", no_wrap=True), Column("", no_wrap=True), show_header=False, box=None, pad_edge=False, ) for i in range(0, len(two_column_items), 2): if i + 1 < len(two_column_items): left_item = two_column_items[i] right_item = two_column_items[i + 1] table.add_row(left_item, right_item) else: table.add_row(two_column_items[i], "") tree.add(table) def describe_artifact( record: Artifact, related_data: dict | None = None, ) -> Tree: from ._feature_manager import describe_features from ._label_manager import describe_labels if related_data is not None: fk_data = related_data.get("fk", {}) else: fk_data = {} tree = describe_header(record) dataset_features_tree, external_features_tree = describe_features( record, related_data=related_data, ) labels_tree = describe_labels(record, 
related_data=related_data) two_column_items = [] # type: ignore append_uid_run(record, two_column_items, fk_data) if record.kind or record.otype: two_column_items.append(Text.assemble(("kind: ", "dim"), f"{record.kind}")) two_column_items.append(Text.assemble(("otype: ", "dim"), f"{record.otype}")) two_column_items.append(Text.assemble(("hash: ", "dim"), f"{record.hash}")) two_column_items.append( Text.assemble(("size: ", "dim"), f"{format_bytes(record.size)}") ) append_branch_space_created_at_created_by(record, two_column_items, fk_data) if record.n_observations: two_column_items.append( Text.assemble(("n_observations: ", "dim"), f"{record.n_observations}") ) if record.n_files: two_column_items.append( Text.assemble(("n_files: ", "dim"), f"{record.n_files}") ) schema_name = None if fk_data and "schema" in fk_data and fk_data["schema"]: schema_name = fk_data["schema"]["name"] elif record.schema_id is not None and record.schema is not None: schema_name = ( record.schema.name if record.schema.name is not None else record.schema.uid[:7] ) if schema_name is not None: two_column_items.append(Text.assemble(("schema: ", "dim"), schema_name)) add_two_column_items_to_tree(tree, two_column_items) storage_root = fk_data["storage"]["name"] if fk_data else record.storage.root storage_key = ( record.key if not record._key_is_virtual else record._real_key if record._real_key else f".lamindb/{record.uid}" ) if record.uid in storage_key: if record.overwrite_versions: storage_key = storage_key[:-4] storage_key = f"{storage_key}{record.suffix}" tree.add( Text.assemble( ("storage/path: ", "dim"), (storage_root, "cyan3"), ("/", "dim"), storage_key, ) ) if dataset_features_tree: tree.add(dataset_features_tree) if external_features_tree: tree.add(external_features_tree) if labels_tree: tree.add(labels_tree) return tree def describe_collection( record: Collection, related_data: dict | None = None, ) -> Tree: tree = describe_header(record) if related_data is not None: fk_data = related_data.get("fk", {}) else: fk_data = {} two_column_items = [] # type: ignore append_uid_run(record, two_column_items, fk_data) append_branch_space_created_at_created_by(record, two_column_items, fk_data) add_two_column_items_to_tree(tree, two_column_items) return tree def display_text( text: str, title: str, tree: Tree, max_lines: int = 30, uid: str = "" ) -> None: # Split the code into lines and add dim vertical bars lines = text.split("\n") end_parts = [("\n│ …", "grey30")] if len(lines) > max_lines else [] parts = [(title + ": ", "purple")] parts.append((uid, "")) max_length = 80 for line in lines[:max_lines]: parts.append(("\n│ ", "dim")) parts.append((line[:max_length], "grey30")) if len(line) > max_length: parts.append((" …", "grey30")) parts.extend(end_parts) tree.add(Text.assemble(*parts)) def describe_run( record: Run, related_data: dict | None = None, ) -> Tree: from ._feature_manager import describe_features tree = describe_header(record) if related_data is not None: fk_data = related_data.get("fk", {}) else: fk_data = {} _, features_tree = describe_features( record, related_data=related_data, ) two_column_items = [] # type: ignore two_column_items.append(Text.assemble(("uid: ", "dim"), f"{record.uid}")) if fk_data and "transform" in fk_data: transform = SimpleNamespace(**fk_data["transform"], description="") else: transform = record.transform transform_key = transform.key if transform and transform.key is not None else "" transform_version = ( f" ({transform.version})" if transform and transform.version is not None else "" 
) two_column_items.append( Text.assemble( ("transform: ", "dim"), (transform_key, "cyan3"), (transform_version, "dim"), ) ) two_column_items.append( Text.assemble( ("started_at: ", "dim"), format_field_value(record.started_at, none="") ) ) two_column_items.append( Text.assemble( ("finished_at: ", "dim"), format_field_value(record.finished_at, none="") ) ) two_column_items.append(Text.assemble(("status: ", "dim"), record.status)) two_column_items.append( Text.assemble(("reference: ", "dim"), record.reference) if record.reference else Text("") ) append_branch_space_created_at_created_by(record, two_column_items, fk_data) add_two_column_items_to_tree(tree, two_column_items) if record.cli_args: display_text( record.cli_args.strip(), "cli_args", tree, max_lines=4, ) if record.report_id: report = record.report.load(is_run_input=False) if report: report_str = report if isinstance(report, str) else str(report) display_text( strip_ansi_from_string(report_str.strip()), "report", tree, max_lines=4, uid=record.report.uid[:7], ) if record.environment_id: env_result = record.environment.load(is_run_input=False) env_str = env_result if isinstance(env_result, str) else str(env_result) display_text( env_str.strip(), "environment", tree, max_lines=4, uid=record.environment.uid[:7], ) if record.params: params = tree.add(Text("Params", style="bold dark_orange")) for key, value in record.params.items(): params.add(f"{key}: {value}") if features_tree: tree.add(features_tree) return tree def describe_record( record: Record, related_data: dict | None = None, ) -> Tree: from ._feature_manager import describe_features tree = describe_header(record) if related_data is not None: fk_data = related_data.get("fk", {}) else: fk_data = {} _, features_tree = describe_features( record, related_data=related_data, ) two_column_items = [] # type: ignore append_uid_run(record, two_column_items, fk_data) type_name = ( fk_data["type"]["name"] if fk_data and "type" in fk_data and fk_data["type"] else record.type.name if record.type_id is not None else "" ) if type_name is None: type_name = "" two_column_items.append(Text.assemble(("type: ", "dim"), type_name)) two_column_items.append(Text.assemble(("is_type: ", "dim"), f"{record.is_type}")) schema_name = ( fk_data["schema"]["name"] if fk_data and "schema" in fk_data and fk_data["schema"] else record.schema.name if record.schema_id is not None else "" ) if schema_name is None: schema_name = "" two_column_items.append(Text.assemble(("schema: ", "dim"), schema_name)) reference = record.reference if record.reference is not None else "" two_column_items.append(Text.assemble(("reference: ", "dim"), reference)) append_branch_space_created_at_created_by(record, two_column_items, fk_data) add_two_column_items_to_tree(tree, two_column_items) if features_tree: tree.add(features_tree) return tree def describe_transform( record: Transform, related_data: dict | None = None, ) -> Tree: tree = describe_header(record) if related_data is not None: fk_data = related_data.get("fk", {}) else: fk_data = {} two_column_items = [] # type: ignore two_column_items.append(Text.assemble(("uid: ", "dim"), f"{record.uid}")) two_column_items.append( Text.assemble(("reference: ", "dim"), record.reference) if record.reference else Text("") ) two_column_items.append(Text.assemble(("hash: ", "dim"), f"{record.hash}")) two_column_items.append(Text.assemble(("type: ", "dim"), f"{record.type}")) append_branch_space_created_at_created_by(record, two_column_items, fk_data) add_two_column_items_to_tree(tree, 
two_column_items) if record.source_code: display_text(record.source_code.strip(), "source_code", tree) return tree def describe_branch(record: Branch) -> Tree: tree = describe_header(record) two_column_items = [] # type: ignore two_column_items.append(Text.assemble(("status: ", "dim"), record.status)) two_column_items.append(Text.assemble(("space: ", "dim"), record.space.name)) two_column_items.append( Text.assemble(("created_at: ", "dim"), format_field_value(record.created_at)) ) two_column_items.append( Text.assemble(("created_by: ", "dim"), record.created_by.handle) ) add_two_column_items_to_tree(tree, two_column_items) return tree def describe_schema(record: Schema, slot: str | None = None) -> Tree: from ._feature_manager import format_dtype_for_display, strip_cat if record.type: prefix = f" {record.type.name} · " else: prefix = " " if record.name: name = record.name else: name = "unnamed" header = "Schema:" if slot is None else f"{slot}:" description = ( Text.assemble(("\n| description: ", "dim"), record.description) if record.description else Text("") ) tree = Tree( Text.assemble( (header, "bold"), (f"{prefix}", "dim"), (f"{name}", "cyan3"), description ), guide_style="dim", ) two_column_items = [] # type: ignore append_uid_run(record, two_column_items) two_column_items.append(Text.assemble(("itype: ", "dim"), f"{record.itype}")) two_column_items.append(Text.assemble(("otype: ", "dim"), f"{record.otype}")) two_column_items.append(Text.assemble(("hash: ", "dim"), f"{record.hash}")) two_column_items.append( Text.assemble(("ordered_set: ", "dim"), f"{record.ordered_set}") ) two_column_items.append( Text.assemble(("maximal_set: ", "dim"), f"{record.maximal_set}") ) two_column_items.append( Text.assemble(("minimal_set: ", "dim"), f"{record.minimal_set}") ) append_branch_space_created_at_created_by(record, two_column_items) add_two_column_items_to_tree(tree, two_column_items) # Add features section n_members = record.n_members members_count_display = f" ({n_members})" if n_members else "" if n_members or (record.dtype and record.itype is not None): features = tree.add( Text.assemble( ( "Features" if record.itype == "Feature" else record.itype, "bold bright_magenta", ), (members_count_display, "bold dim"), ) ) if n_members is not None: feature_table = Table( show_header=True, header_style="dim", box=None, pad_edge=False ) feature_table.add_column("name", style="", no_wrap=True) feature_table.add_column("dtype", style="", no_wrap=True) feature_table.add_column("optional", style="", no_wrap=True) feature_table.add_column("nullable", style="", no_wrap=True) feature_table.add_column("coerce", style="", no_wrap=True) feature_table.add_column("default_value", style="", no_wrap=True) optionals = record.optionals.get() for member in record.members: feature_table.add_row( Text(member.name), Text(strip_cat(format_dtype_for_display(member._dtype_str))), "✓" if optionals.filter(uid=member.uid).exists() else "✗", "✓" if member.nullable else "✗", "✓" if record.coerce or member.coerce else "✗", str(member.default_value) if member.default_value else "unset", ) features.add(feature_table) elif record.dtype: features.add(Text.assemble(("dtype: ", "dim"), f"{record.dtype}")) return tree def describe_postgres(record): from ._django import get_artifact_or_run_with_related, get_collection_with_related model_name = record.__class__.__name__ msg = f"{colors.green(model_name)}{record.__repr__(include_foreign_keys=False).lstrip(model_name)}\n" if record._state.db is not None and record._state.db != "default": msg += 
f" {colors.italic('Database instance')}\n" msg += f" slug: {record._state.db}\n" if model_name in {"Artifact", "Run"}: result = get_artifact_or_run_with_related( record, include_feature_link=True, include_fk=True, include_m2m=True, include_schema=True, ) related_data = result.get("related_data", {}) if model_name == "Artifact": tree = describe_artifact(record, related_data=related_data) else: tree = describe_run(record, related_data=related_data) elif model_name == "Record": result = get_artifact_or_run_with_related( record, include_feature_link=True, include_fk=True, ) related_data = result.get("related_data", {}) tree = describe_record(record, related_data=related_data) elif model_name == "Collection": result = get_collection_with_related(record, include_fk=True) related_data = result.get("related_data", {}) tree = describe_collection(record, related_data=related_data) elif model_name == "Transform": tree = describe_transform(record) elif model_name == "Branch": tree = describe_branch(record) else: tree = describe_header(record) return tree def describe_sqlite(record): model_name = record.__class__.__name__ msg = f"{colors.green(model_name)}{record.__repr__(include_foreign_keys=False).lstrip(model_name)}\n" if record._state.db is not None and record._state.db != "default": msg += f" {colors.italic('Database instance')}\n" msg += f" slug: {record._state.db}\n" fields = record._meta.fields direct_fields = [] foreign_key_fields = [] for f in fields: if f.is_relation: foreign_key_fields.append(f.name) else: direct_fields.append(f.name) if not record._state.adding: # prefetch foreign key relationships record = ( record.__class__.objects.using(record._state.db) .select_related(*foreign_key_fields) .get(id=record.id) ) # prefetch m-2-m relationships many_to_many_fields = [] if model_name in {"Artifact", "Collection"}: many_to_many_fields.append("input_of_runs") if model_name == "Artifact": many_to_many_fields.append("schemas") record = ( record.__class__.objects.using(record._state.db) .prefetch_related(*many_to_many_fields) .get(id=record.id) ) if model_name in {"Artifact", "Run", "Record"}: if model_name == "Artifact": tree = describe_artifact(record) elif model_name == "Run": tree = describe_run(record) else: tree = describe_record(record) elif model_name == "Collection": tree = describe_collection(record) elif model_name == "Transform": tree = describe_transform(record) elif model_name == "Branch": tree = describe_branch(record) else: tree = describe_header(record) return tree def append_readme_blocks_to_tree( record, tree: Tree, include: None | Literal["comments"] = None ) -> None: """Append readme (and optionally comment) block content to the describe tree.""" if record._state.adding: return if not hasattr(record, "ablocks"): return if include == "comments": blocks_qs = record.ablocks.filter( Q(kind="readme", is_latest=True) | Q(kind="comment") ).select_related("created_by") else: blocks_qs = record.ablocks.filter(kind="readme", is_latest=True) blocks = list(blocks_qs.order_by("created_at")) # README first, then comments; each group sorted chronologically readme_blocks = [b for b in blocks if b.kind == "readme"] comment_blocks = [b for b in blocks if b.kind == "comment"] for block in readme_blocks + comment_blocks: if block.kind == "readme": title = "README" else: handle = block.created_by.handle if block.created_by else "?" 
created_at_str = format_field_value(block.created_at) title = f"comment by {handle} at {created_at_str}" display_text( block.content, title, tree, max_lines=30, uid="", ) def describe_postgres_sqlite( record, return_str: bool = False, include: None | Literal["comments"] = None, ) -> str | None: from ._describe import format_rich_tree if ( not record._state.adding and connections[record._state.db].vendor == "postgresql" ): tree = describe_postgres(record) else: tree = describe_sqlite(record) append_readme_blocks_to_tree(record, tree, include=include) return format_rich_tree(tree, return_str=return_str) ================================================ FILE: lamindb/models/_django.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Any from django.contrib.postgres.aggregates import ArrayAgg from django.db import connection from django.db.models import CharField, F, OuterRef, Q, Subquery from django.db.models.fields.related import ForeignKey, ManyToManyField from django.db.models.fields.reverse_related import ManyToManyRel, ManyToOneRel from django.db.models.functions import JSONObject from ._relations import dict_related_model_to_related_name, get_schema_modules from .schema import Schema if TYPE_CHECKING: from .artifact import Artifact, Collection from .record import Record from .run import Run def patch_many_to_many_descriptor() -> None: """Patches Django's `ManyToManyDescriptor.__get__` method to suggest better errors when saving relationships of an unsaved model. Before this patch: Cryptic errors are raised when relationships of an unsaved record are attempted to be modified. After this patch: Attempts to access M2M relationships on unsaved objects will raise ValueError, suggesting explicit .save() of the record to be modified before relationship creation. """ from django.db.models.fields.related_descriptors import ManyToManyDescriptor original_get = ManyToManyDescriptor.__get__ def patched_get(self, instance, cls=None): if instance is not None and instance.pk is None: raise ValueError( f"You are trying to access the many-to-many relationships of an unsaved {instance.__class__.__name__} object. " f"Please save it first using '.save()'." ) manager = original_get(self, instance, cls) if manager is None or not hasattr(manager, "add"): return manager original_manager_add = manager.add def patched_manager_add(*objs, **kwargs): try: return original_manager_add(*objs, **kwargs) except ValueError as e: if "Cannot add" in str(e) and "database" in str(e): source_db = manager.instance._state.db raise ValueError( f"Cannot label a record from instance '{source_db}'. " f"Please save the record first to your instance using '.save()'." 
) from None raise manager.add = patched_manager_add return manager ManyToManyDescriptor.__get__ = patched_get def get_related_model(model, field_name): try: field = model._meta.get_field(field_name) if isinstance(field, (ForeignKey, ManyToManyField)): # Forward ForeignKey or ManyToManyField return field.remote_field.model elif isinstance(field, (ManyToOneRel, ManyToManyRel)): # Reverse ForeignKey or ManyToManyField return field.related_model else: return f"Unexpected field type: {type(field)}" except Exception as e: return f"Error: {str(e)}" def get_artifact_or_run_with_related( record: Artifact | Run | Record, include_fk: bool = False, include_m2m: bool = False, include_feature_link: bool = False, include_schema: bool = False, ) -> dict[str, Any]: """Fetch an artifact with its related data.""" from ._label_manager import EXCLUDE_LABELS from .can_curate import get_name_field from .query_set import get_default_branch_ids model = record.__class__ is_record = record.__class__.__name__ == "Record" is_artifact = record.__class__.__name__ == "Artifact" entity_field_name = record.__class__.__name__.lower() if entity_field_name in {"run", "record"} and include_schema: include_schema = False # runs do not have feature sets schema_modules = get_schema_modules(record._state.db) foreign_key_fields = [ f.name for f in model._meta.fields if f.is_relation and f.related_model.__get_module_name__() in schema_modules ] # Create the map that the conversion function will need. # It maps the target model class to the m2m field name, e.g., # {'Ulabel': 'ulabels', 'CellType': 'cell_types'} m2m_model_to_field_map = {} if include_m2m: full_map = dict_related_model_to_related_name(model, instance=record._state.db) m2m_model_to_field_map = { model_cls: field_name for model_cls, field_name in full_map.items() if not field_name.startswith("_") and field_name not in EXCLUDE_LABELS } if is_record: m2m_model_to_field_map["Run"] = "linked_runs" else: m2m_model_to_field_map["Run"] = "runs" link_tables = ( [] if not include_feature_link else list( dict_related_model_to_related_name( model, links=True, instance=record._state.db ).values() ) ) # Clear previous queries connection.queries_log.clear() annotations = {} if include_fk: for fk in foreign_key_fields: name_field = get_name_field(get_related_model(model, fk)) if fk == "run": annotations[f"fkfield_{fk}"] = JSONObject( id=F(f"{fk}__id"), name=F(f"{fk}__name"), uid=F(f"{fk}__uid"), transform_key=F(f"{fk}__transform__key"), ) elif fk == "transform": annotations[f"fkfield_{fk}"] = JSONObject( id=F(f"{fk}__id"), key=F(f"{fk}__key"), uid=F(f"{fk}__uid"), version=F(f"{fk}__version_tag"), ) elif fk == "created_by": annotations[f"fkfield_{fk}"] = JSONObject( id=F(f"{fk}__id"), name=F(f"{fk}__{name_field}") ) else: annotations[f"fkfield_{fk}"] = JSONObject( id=F(f"{fk}__id"), name=F(f"{fk}__{name_field}") ) for link in link_tables: link_model = getattr(model, link).rel.related_model if not hasattr(link_model, "feature"): continue if not is_record and link_model.__name__ in { "RecordArtifact", "RecordRun", }: continue if is_record and ( not link_model.__name__.startswith("Record") or link_model.__name__ in { "RecordJson", } ): continue if not is_record and not link_model.__name__ == "ArtifactArtifact": if link_model.__name__ == "RunArtifact": if is_artifact: continue else: label_field = "artifact" else: label_field = link.removeprefix("links_").replace("_", "") else: label_field = "value" related_model = link_model._meta.get_field(label_field).related_model # manually include 
"name" as pertdb.Compound.name is a TextField due to no length limitation char_field_names = [ field.name for field in related_model._meta.concrete_fields if isinstance(field, CharField) or field.name == "name" ] name_field = get_name_field(related_model) label_field_name = f"{label_field}__{name_field}" filter_kwargs = {entity_field_name: OuterRef("pk")} if link_model.__name__ not in { "RecordUser", "ArtifactUser", }: # user does not have branch filter_kwargs[f"{label_field}__branch_id__in"] = get_default_branch_ids() annotations[f"linkfield_{link}"] = Subquery( link_model.objects.filter(**filter_kwargs) .annotate( data=JSONObject( id=F("id"), feature=F("feature"), **{label_field: F(label_field)}, **{ label_field + "_display": F(label_field_name) }, # display field is the name field **{uf: F(f"{label_field}__{uf}") for uf in char_field_names}, ) ) .values(entity_field_name) .annotate(json_agg=ArrayAgg("data")) .values("json_agg") ) if include_schema: annotations["m2m_schemas"] = Subquery( model.schemas.through.objects.filter(artifact=OuterRef("pk")) .annotate( data=JSONObject( id=F("id"), slot=F("slot"), schema=F("schema"), ) ) .values(entity_field_name) .annotate(json_agg=ArrayAgg("data")) .values("json_agg") ) record_meta = ( model.objects.using(record._state.db) .filter(uid=record.uid) .annotate(**annotations) .values(*["id", "uid"], *annotations.keys()) .first() ) if not record_meta: return None related_data: dict = {"m2m": {}, "fk": {}, "link": {}, "m2m_schemas": {}} for k, v in record_meta.items(): if k.startswith("fkfield_") and v is not None: related_data["fk"][k[8:]] = v elif k.startswith("linkfield_") and v is not None: related_data["link"][k[10:]] = v elif k == "m2m_schemas": if v: related_data["m2m_schemas"] = get_schema_m2m_relations( record, {i["schema"]: i["slot"] for i in v} ) def convert_link_data_to_m2m( link_data: dict, model, # The main artifact model class is still needed for introspection m2m_model_map: dict, # The pre-computed map from Step 1 ) -> dict: """Converts link data to M2M-style data using a pre-computed model-to-field-name map.""" # link_data: {'links_tissue': [{'id': 1, 'uid': '1fIFAQJY', 'abbr': None, 'name': 'brain', 'tissue': 1, 'feature': 1, 'ontology_id': 'UBERON:0000955', 'tissue_display': 'brain'}, {'id': 2, 'uid': '7Tt4iEKc', 'abbr': None, 'name': 'lung', 'tissue': 10, 'feature': 1, 'ontology_id': 'UBERON:0002048', 'tissue_display': 'lung'}], 'links_cell_type': [{'id': 1, 'uid': '3QnZfoBk', 'abbr': None, 'name': 'neuron', 'feature': 2, 'celltype': 1, 'ontology_id': 'CL:0000540', 'celltype_display': 'neuron'}]} m2m_data = {} for link_name, records in link_data.items(): if not records: continue link_model = getattr(model, link_name).rel.related_model if not is_record: id_field_name = link_name.removeprefix("links_").replace("_", "") else: id_field_name = "value" final_target_model = link_model._meta.get_field(id_field_name).related_model m2m_field_name = m2m_model_map.get( final_target_model.__get_name_with_module__() ) m2m_data[m2m_field_name] = { record[id_field_name]: record for record in records } return m2m_data related_data["m2m"] = convert_link_data_to_m2m( related_data["link"], model=model, m2m_model_map=m2m_model_to_field_map ) return { **{name: record_meta[name] for name in ["id", "uid"]}, "related_data": related_data, } def get_collection_with_related( collection: Collection, include_fk: bool = False, ) -> dict[str, Any]: """Fetch a collection with its related data.""" from .can_curate import get_name_field model = collection.__class__ 
schema_modules = get_schema_modules(collection._state.db) foreign_key_fields = [ f.name for f in model._meta.fields if f.is_relation and f.related_model.__get_module_name__() in schema_modules ] # Clear previous queries connection.queries_log.clear() annotations = {} if include_fk: for fk in foreign_key_fields: name_field = get_name_field(get_related_model(model, fk)) if fk == "run": annotations[f"fkfield_{fk}"] = JSONObject( id=F(f"{fk}__id"), name=F(f"{fk}__{name_field}"), transform_key=F(f"{fk}__transform__key"), ) else: annotations[f"fkfield_{fk}"] = JSONObject( id=F(f"{fk}__id"), name=F(f"{fk}__{name_field}") ) collection_meta = ( model.objects.using(collection._state.db) .filter(uid=collection.uid) .annotate(**annotations) .values(*["id", "uid"], *annotations.keys()) .first() ) if not collection_meta: return None related_data: dict = {"fk": {}} for k, v in collection_meta.items(): if k.startswith("fkfield_") and v is not None: related_data["fk"][k[8:]] = v return { **{name: collection_meta[name] for name in ["id", "uid"]}, "related_data": related_data, } def get_schema_m2m_relations(artifact: Artifact, slot_schema: dict, limit: int = 20): """Fetch all many-to-many relationships for given feature sets.""" from .can_curate import get_name_field m2m_relations = [ v for v in dict_related_model_to_related_name(Schema).values() if v is not None and not v.startswith("_") and v != "artifacts" ] annotations = {} related_names = {} for name in m2m_relations: related_model = get_related_model(Schema, name) if related_model is Schema: # this is for the `type` field continue name_field = get_name_field(related_model) # Get the correct field names for the through table if not hasattr(getattr(Schema, name), "through"): continue through_model = getattr(Schema, name).through # Subquery to get limited related records limited_related = Subquery( through_model.objects.filter(schema=OuterRef("pk")).values( related_model.__name__.lower() )[:limit] ) annotations[f"m2mfield_{name}"] = ArrayAgg( JSONObject(id=F(f"{name}__id"), name=F(f"{name}__{name_field}")), filter=Q( **{ f"{name}__id__in": limited_related, } ), distinct=True, ) related_names[name] = related_model.__get_name_with_module__() schema_m2m = ( Schema.connect(artifact._state.db) .filter(id__in=slot_schema.keys()) .annotate(**annotations) .values("id", *annotations.keys()) ) result = {} for fs in schema_m2m: slot = slot_schema.get(fs["id"]) result[fs["id"]] = ( slot, { related_names.get(k[9:]): [item["name"] for item in v] for k, v in fs.items() if k.startswith("m2mfield_") and v }, ) return result patch_many_to_many_descriptor() ================================================ FILE: lamindb/models/_feature_manager.py ================================================ # ruff: noqa: TC004 from __future__ import annotations from collections import defaultdict from collections.abc import Iterable from datetime import date, datetime from itertools import compress from pathlib import Path from typing import TYPE_CHECKING, Any import numpy as np from django.contrib.postgres.aggregates import ArrayAgg from django.db import connections from django.db.models import Aggregate, Subquery from django.db.models.expressions import RawSQL from django.db.utils import IntegrityError from lamin_utils import logger from lamindb_setup.core.upath import UPath from lamindb_setup.errors import ModuleWasntConfigured from rich.table import Column, Table from rich.text import Text from rich.tree import Tree from lamindb.errors import DoesNotExist, InvalidArgument, 
ValidationError from lamindb.models._from_values import _format_values from lamindb.models.feature import ( serialize_pandas_dtype, suggest_categorical_for_str_iterable, ) from lamindb.models.has_parents import keep_topmost_matches from lamindb.models.save import save from lamindb.models.schema import DICT_KEYS_TYPE, Schema from lamindb.models.sqlrecord import ( REGISTRY_UNIQUE_FIELD, get_name_field, transfer_fk_to_default_db_bulk, transfer_to_default_db, ) from ._describe import ( NAME_WIDTH, TYPE_WIDTH, VALUES_WIDTH, describe_header, format_rich_tree, ) from ._django import get_artifact_or_run_with_related from ._label_manager import _get_labels from ._relations import ( dict_related_model_to_related_name, ) from .feature import Feature, FeaturePredicate, JsonValue, parse_dtype from .sqlrecord import SQLRecord from .ulabel import ULabel if TYPE_CHECKING: from rich.tree import Tree from lamindb.base.types import FieldAttr from lamindb.models import ( Artifact, Collection, IsLink, ) from lamindb.models.query_set import BasicQuerySet, SQLRecordList from ..base.types import DtypeObject from .record import Record from .run import Run def get_accessor_by_registry_(host: Artifact | Collection) -> dict: dictionary = { field.related_model.__get_name_with_module__(): field.name for field in host._meta.related_objects } dictionary["Feature"] = "features" dictionary["ULabel"] = "ulabels" dictionary["Record"] = "records" return dictionary def get_schema_by_slot_(host: Artifact) -> dict[str, Schema]: # if the host is not yet saved if host._state.adding: if hasattr(host, "_staged_schemas"): return host._staged_schemas else: return {} host_db = host._state.db kwargs = {"artifact_id": host.id} # otherwise, we need a query links_schema = ( host.schemas.through.objects.using(host_db) .filter(**kwargs) .select_related("schema") ) return {fsl.slot: fsl.schema for fsl in links_schema} def get_label_links( host: Artifact | Collection, registry: str, feature: Feature ) -> BasicQuerySet: kwargs = {"artifact_id": host.id, "feature_id": feature.id} link_records = ( getattr(host, host.features._accessor_by_registry[registry]) # type: ignore .through.objects.using(host._state.db) .filter(**kwargs) ) return link_records def get_schema_links(host: Artifact | Collection) -> BasicQuerySet: kwargs = {"artifact_id": host.id} links_schema = host.schemas.through.objects.filter(**kwargs) return links_schema def get_link_attr( link: IsLink | type[IsLink], data: Artifact | Collection | Run | type, ) -> str: link_model_name = link.__class__.__name__ if link_model_name in {"Registry", "ModelBase"}: # we passed the type of the link link_model_name = link.__name__ # type: ignore if link_model_name.startswith("Record") or link_model_name == "ArtifactArtifact": return "value" host_name = data.__name__ if isinstance(data, type) else data.__class__.__name__ return link_model_name.replace(host_name, "").lower() def get_categorical_link_info( host_class: type[SQLRecord], label_registry: type[SQLRecord], instance: str | None = None, ) -> tuple[type[SQLRecord], str, str]: """Resolve (link_model, value_field_name, filter_accessor_name) for (host_class, label_registry). Used by filter_base (categorical path) and _add_label_feature_links. 
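Illustrative example (hedged, assuming a bionty-enabled instance with the usual link
models): for (Artifact, bionty.Tissue) this resolves to roughly
(ArtifactTissue, "tissue", "links_tissue").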
""" host_name = host_class.__name__.lower() if host_name == "record": d = dict_related_model_to_related_name( host_class, links=True, instance=instance ) for rel in host_class._meta.related_objects: link_model = rel.related_model key = link_model.__get_name_with_module__() if key not in d: continue if not hasattr(link_model, "feature_id") or not hasattr( link_model, "value" ): continue value_fk = link_model._meta.get_field("value") if ( value_fk.remote_field is None or value_fk.remote_field.model != label_registry ): continue accessor = d[key] return (link_model, "value", accessor) raise ValueError( f"No categorical link model for Record + {label_registry.__name__}. " "Ensure the label registry has a Record* link model (e.g. RecordRecord, RecordULabel) " "or a bionty link model (e.g. RecordCellLine) in loaded schema modules." ) # Artifact, Run, or Collection attr_map = { "artifact": "artifacts", "run": "runs", "collection": "collections", } attr = attr_map.get(host_name) if not attr or not hasattr(label_registry, attr): raise ValueError( f"{label_registry.__name__} has no {attr or host_name!r} relation; " "cannot resolve categorical link for this host." ) through = getattr(label_registry, attr).through link_model = through host_fk = host_name # "artifact", "run", "collection" value_field = get_link_attr(link_model, host_class) filter_accessor = getattr(link_model, host_fk).field._related_name return (link_model, value_field, filter_accessor) def strip_cat(feature_dtype: str) -> str: if "cat[" in feature_dtype: parts = feature_dtype.split("cat[") dtype_stripped_cat = "".join( part[:-1] if i != 0 else part for i, part in enumerate(parts) ) else: dtype_stripped_cat = feature_dtype return dtype_stripped_cat def format_dtype_for_display(dtype_str: str) -> str: """Format dtype string for display, replacing Record[uid] or ULabel[uid] with Record[TypeName] or ULabel[TypeName].""" from .feature import parse_dtype from .record import Record from .ulabel import ULabel # Check if this is a Record[uid] or ULabel[uid] format if ("Record[" in dtype_str or "ULabel[" in dtype_str) and "]" in dtype_str: try: parsed = parse_dtype(dtype_str) if parsed and parsed[0].get("record_uid"): record_uid = parsed[0]["record_uid"] registry_str = parsed[0].get("registry_str", "") try: # Determine which registry to use if registry_str == "Record": record_type = Record.get(uid=record_uid) # Replace Record[uid] with Record[TypeName] dtype_str = dtype_str.replace( f"Record[{record_uid}]", f"Record[{record_type.name}]" ) elif registry_str == "ULabel": record_type = ULabel.get(uid=record_uid) # Replace ULabel[uid] with ULabel[TypeName] dtype_str = dtype_str.replace( f"ULabel[{record_uid}]", f"ULabel[{record_type.name}]" ) except Exception as e: # If we can't find the record, just return the original logger.debug( f"Could not find {registry_str} with uid '{record_uid}' for display formatting: {e}" ) except Exception as e: # If parsing fails, return the original logger.debug( f"Could not parse dtype string '{dtype_str}' for display formatting: {e}" ) return dtype_str # Custom aggregation for SQLite class GroupConcat(Aggregate): function = "GROUP_CONCAT" template = '%(function)s(%(expressions)s, ", ")' def custom_aggregate(field, using: str): if connections[using].vendor == "postgresql": return ArrayAgg(field) else: return GroupConcat(field) def get_categoricals_postgres( self: Artifact | Collection | Run, related_data: dict | None = None, ) -> dict[tuple[str, str], set[str]]: """Get categorical features and their values using 
PostgreSQL-specific optimizations.""" if related_data is None: if self.__class__.__name__ in {"Artifact", "Run", "Record"}: artifact_meta = get_artifact_or_run_with_related( self, include_feature_link=True, include_m2m=True ) related_data = artifact_meta.get("related_data", {}) else: related_data = {} # Process m2m data m2m_data = related_data.get("m2m", {}) if related_data else {} # e.g. m2m_data = {'tissues': {1: {'id': 1, 'uid': '1fIFAQJY', 'abbr': None, 'name': 'brain', 'tissue': 1, 'feature': 1, 'ontology_id': 'UBERON:0000955', 'tissue_display': 'brain'}, 10: {'id': 2, 'uid': '7Tt4iEKc', 'abbr': None, 'name': 'lung', 'tissue': 10, 'feature': 1, 'ontology_id': 'UBERON:0002048', 'tissue_display': 'lung'}}, 'cell_types': {1: {'id': 1, 'uid': '3QnZfoBk', 'abbr': None, 'name': 'neuron', 'feature': 2, 'celltype': 1, 'ontology_id': 'CL:0000540', 'celltype_display': 'neuron'}}} # e.g. {'tissue': {1: {'id': 1, 'uid': '1fIFAQJY', 'abbr': None, 'name': 'brain', 'tissue': 1, 'feature': 1, 'ontology_id': 'UBERON:0000955', 'tissue_display': 'brain'}, 10: {'id': 2, 'uid': '7Tt4iEKc', 'abbr': None, 'name': 'lung', 'tissue': 10, 'feature': 1, 'ontology_id': 'UBERON:0002048', 'tissue_display': 'lung'}}, 'celltype': {1: {'id': 1, 'uid': '3QnZfoBk', 'abbr': None, 'name': 'neuron', 'feature': 2, 'celltype': 1, 'ontology_id': 'CL:0000540', 'celltype_display': 'neuron'}}} # integers are the ids of the related labels m2m_name = {} if not self.__class__.__name__ == "Record": for related_name, values in m2m_data.items(): link_model = getattr(self.__class__, related_name).through related_model_name = link_model.__name__.replace( self.__class__.__name__, "", 1 ).lower() if related_model_name == "artifact": related_model_name = "value" m2m_name[related_model_name] = values else: m2m_name = related_data.get("m2m", {}) # Get feature information links_data = related_data.get("link", {}) if related_data else {} # e.g. 
feature_dict = {1: ('tissue', 'cat[bionty.Tissue.ontology_id]'), 2: ('cell_type', 'cat[bionty.CellType]')} feature_dict = { id: (name, dtype) for id, name, dtype in Feature.connect(self._state.db).values_list( "id", "name", "_dtype_str" ) } # Build result dictionary result = {} # type: ignore for link_name, link_values in links_data.items(): related_name = link_name.removeprefix("links_").replace("_", "") if not link_values: continue # sort by the order on the link table, important for list dtypes for link_value in sorted(link_values, key=lambda x: x.get("id")): feature_id = link_value.get("feature") if feature_id is None: continue feature_name, feature_dtype = feature_dict.get(feature_id) feature_field = parse_dtype(feature_dtype)[0]["field_str"] if not self.__class__.__name__ == "Record": label_id = link_value.get(related_name) label_name = ( m2m_name.get(related_name, {}).get(label_id, {}).get(feature_field) ) else: label_name = link_value.get(feature_field) if label_name: dict_key = (feature_name, feature_dtype) if dict_key not in result: result[dict_key] = ( set() if not feature_dtype.startswith("list[cat") else [] ) if feature_dtype.startswith("list[cat"): result[dict_key].append(label_name) else: result[dict_key].add(label_name) return dict(result) def get_categoricals_sqlite( self: Artifact | Collection, ) -> dict[tuple[str, str], set[str]]: """Get categorical features and their values using the default approach.""" from .query_set import get_default_branch_ids result = {} # type: ignore for _, links in _get_labels(self, links=True, instance=self._state.db).items(): for link in links: if link.__class__.__name__ == "RecordJson": continue if hasattr(link, "feature_id") and link.feature_id is not None: feature = Feature.objects.using(self._state.db).get(id=link.feature_id) dtype_str = feature._dtype_str feature_field = parse_dtype(dtype_str)[0]["field_str"] link_attr = get_link_attr(link, self) label = getattr(link, link_attr) if hasattr(label, "branch_id"): if label.branch_id not in get_default_branch_ids(): continue label_name = getattr(label, feature_field) dict_key = (feature.name, dtype_str) if dict_key not in result: result[dict_key] = ( set() if not dtype_str.startswith("list[cat") else [] ) if dtype_str.startswith("list[cat"): result[dict_key].append(label_name) else: result[dict_key].add(label_name) return dict(result) def get_non_categoricals( self, ) -> dict[tuple[str, str], set[Any]]: """Get non-categorical features and their values.""" import pandas as pd from .artifact import Artifact from .record import Record from .run import Run non_categoricals = {} if self.id is not None and isinstance(self, (Artifact, Run, Record)): if isinstance(self, Record): json_values = self.values_json.values( "feature__name", "feature___dtype_str", "value" ).order_by("feature__name") else: json_values = ( self.json_values.values("feature__name", "feature___dtype_str") .annotate(values=custom_aggregate("value", self._state.db)) .order_by("feature__name") ) for fv in json_values: feature_name = fv["feature__name"] feature_dtype = fv["feature___dtype_str"] if isinstance(self, Record): values = fv["value"] else: values = fv["values"] if connections[self._state.db].vendor == "sqlite": # undo GROUP_CONCAT if isinstance(values, str): values = {value.strip('"') for value in values.split(", ")} # Convert single values to sets if not isinstance(values, (list, dict, set)): values = {values} elif ( isinstance(values, list) and feature_dtype != "dict" and not feature_dtype.startswith("list") ): try: 
values = set(values) except TypeError: # TypeError: unhashable type: 'list' if values is list[list] pass # Handle special datetime types if feature_dtype == "datetime": values = {datetime.fromisoformat(value) for value in values} if feature_dtype == "date": # date.fromisoformat() cannot handle cases like 2025-01-17T00:00:00.000Z values = { pd.to_datetime(value, format="ISO8601").date() for value in values } if connections[self._state.db].vendor == "sqlite": # undo GROUP_CONCAT if feature_dtype == "int": values = {int(value) for value in values} if feature_dtype == "float": values = {float(value) for value in values} if feature_dtype == "num": values = {float(value) for value in values} non_categoricals[(feature_name, feature_dtype)] = values return non_categoricals def create_feature_table( name: str, registry_str: str, data: list, show_header: bool = False ) -> Table: """Create a Rich table for a feature group.""" table = Table( Column(name, style="", no_wrap=True, width=NAME_WIDTH), Column(registry_str, style="dim", no_wrap=True, width=TYPE_WIDTH), Column("", width=VALUES_WIDTH, no_wrap=True), show_header=show_header, box=None, pad_edge=False, ) for row in data: table.add_row(*row) return table def get_features_data( self: Artifact | Run | Record, related_data: dict | None = None, to_dict: bool = False, external_only: bool = False, ): from .artifact import Artifact dictionary: dict[str, Any] = {} if self._state.adding: if to_dict: return dictionary else: raise NotImplementedError # feature sets schema_data: dict[str, tuple[str, list[str]]] = {} feature_data: dict[str, tuple[str, list[str]]] = {} if not to_dict and isinstance(self, Artifact): if self.id is not None and connections[self._state.db].vendor == "postgresql": if not related_data: artifact_meta = get_artifact_or_run_with_related( self, include_schema=True, include_m2m=True, include_feature_link=True, ) related_data = artifact_meta.get("related_data", {}) fs_data = related_data.get("m2m_schemas", {}) if related_data else {} for fs_id, (slot, data) in fs_data.items(): for registry_str, feature_names in data.items(): # prevent projects show up as features if registry_str == "Project": continue schema = Schema.objects.using(self._state.db).get(id=fs_id) schema_data[slot] = (schema, feature_names) for feature_name in feature_names: feature_data[feature_name] = (slot, registry_str) schema_data.update( { slot: (schema, schema.n_members) # type: ignore for slot, schema in get_schema_by_slot_(self).items() if slot not in schema_data } ) else: for slot, schema in get_schema_by_slot_(self).items(): features = schema.members if features.exists(): # features.first() is a lot slower than features[0] here name_field = get_name_field(features[0]) feature_names = list( features.values_list(name_field, flat=True)[:20] ) schema_data[slot] = (schema, feature_names) for feature_name in feature_names: feature_data[feature_name] = (slot, schema.itype) else: schema_data[slot] = (schema, schema.n_members) internal_feature_names = {} if isinstance(self, Artifact): inferred_schemas = self.schemas.filter(itype="Feature") if len(inferred_schemas) > 0: for schema in inferred_schemas: # Use _dtype_str instead of dtype, and format for display feature_dtypes = dict(schema.members.values_list("name", "_dtype_str")) # Format Record[uid] to Record[TypeName] for display formatted_dtypes = { name: format_dtype_for_display(dtype_str) if dtype_str else "" for name, dtype_str in feature_dtypes.items() } internal_feature_names.update(formatted_dtypes) # categorical 
feature values # Get the categorical data using the appropriate method # e.g. categoricals = {('tissue', 'cat[bionty.Tissue.ontology_id]'): {'brain'}, ('cell_type', 'cat[bionty.CellType]'): {'neuron'}} if not self._state.adding and connections[self._state.db].vendor == "postgresql": categoricals = get_categoricals_postgres( self, related_data=related_data, ) else: categoricals = get_categoricals_sqlite( self, ) # Get non-categorical features non_categoricals = get_non_categoricals( self, ) internal_feature_labels = {} external_data = [] for features, is_categoricals in [(categoricals, True), (non_categoricals, False)]: for (feature_name, feature_dtype), values in sorted(features.items()): # Handle dictionary conversion if feature_dtype.startswith("list[cat"): converted_values = values # is already a list else: converted_values = values if len(values) > 1 else next(iter(values)) if to_dict: dictionary[feature_name] = converted_values continue # Format message if is_categoricals and isinstance(converted_values, set): printed_values = _format_values( sorted(converted_values), n=10, quotes=False ) elif ( not is_categoricals and not feature_dtype.startswith(("list", "dict")) and isinstance(converted_values, set) ): printed_values = _format_values( sorted(converted_values), n=10, quotes=False ) else: printed_values = str(converted_values) # Format dtype for display (replace Record[uid] with Record[TypeName]) display_dtype = format_dtype_for_display(feature_dtype) # Sort into internal/external feature_info = ( feature_name, Text(strip_cat(display_dtype), style="dim"), printed_values, ) if feature_name in internal_feature_names: internal_feature_labels[feature_name] = feature_info else: external_data.append(feature_info) if to_dict: if external_only: return { k: v for k, v in dictionary.items() if k not in internal_feature_names } else: return dictionary else: return ( internal_feature_labels, feature_data, schema_data, internal_feature_names, external_data, ) def describe_features( self: Artifact | Run | Record, related_data: dict | None = None, ) -> tuple[Tree | None, Tree | None]: """Describe features of an artifact or collection.""" if self._state.adding: return None, None ( internal_feature_labels, feature_data, schema_data, internal_feature_names, external_data, ) = get_features_data( self, related_data=related_data, ) # Dataset features section # internal features that contain labels (only `Feature` features contain labels) internal_feature_labels_slot: dict[str, list] = {} for feature_name, feature_row in internal_feature_labels.items(): slot, _ = feature_data.get(feature_name) internal_feature_labels_slot.setdefault(slot, []).append(feature_row) dataset_features_tree_children = [] for slot, (schema, feature_names_or_n) in schema_data.items(): if feature_names_or_n is None or isinstance(feature_names_or_n, int): feature_rows = [] else: feature_names = feature_names_or_n if slot in internal_feature_labels_slot: # add internal Feature features with labels feature_rows = internal_feature_labels_slot[slot] # add internal Feature features without labels feature_rows += [ ( feature_name, Text( strip_cat(internal_feature_names.get(feature_name)), style="dim", ), "", ) for feature_name in feature_names if feature_name and feature_name not in internal_feature_labels ] else: # add internal non-Feature features without labels feature_rows = [ ( feature_name, Text( strip_cat( internal_feature_names.get(feature_name) if feature_name in internal_feature_names else schema.dtype ), style="dim", ), "", ) 
for feature_name in feature_names if feature_name ] feature_rows.sort(key=lambda x: x[0]) schema_itype = f" {schema.itype}" if schema.itype != "Feature" else "" dataset_features_tree_children.append( create_feature_table( Text.assemble( (slot, "violet"), (f" ({schema.n_members}{schema_itype})", "dim"), ), "", feature_rows, show_header=True, ) ) # external features external_features_tree_children = [] if external_data: external_features_tree_children.append( create_feature_table( "", "", external_data, ) ) # trees dataset_features_tree = None if dataset_features_tree_children: dataset_features_tree = Tree( Text("Dataset features", style="bold bright_magenta") ) for child in dataset_features_tree_children: dataset_features_tree.add(child) external_features_tree = None if external_features_tree_children: external_features_text = ( "External features" if ( self.__class__.__name__ == "Artifact" and dataset_features_tree_children ) else "Features" ) external_features_tree = Tree( Text(external_features_text, style="bold dark_orange") ) for child in external_features_tree_children: external_features_tree.add(child) return dataset_features_tree, external_features_tree def infer_convert_dtype_key_value( key: str, value: Any, mute: bool = False, dtype_str: str | None = None ) -> tuple[str, Any, str]: import pandas as pd from lamindb.base.dtypes import is_valid_datetime_str message = "" if isinstance(value, bool): return "bool", value, message elif isinstance(value, int): return "int", value, message elif isinstance(value, float): return "float", value, message elif isinstance(value, datetime): return "datetime", value.isoformat(), message elif isinstance(value, date): return "date", value.isoformat(), message elif isinstance(value, str): if dtype_str in {None, "datetime", "date"} and ( datetime_str := is_valid_datetime_str(value) ): dt_type = ( "date" if len(value) == 10 else "datetime" ) # YYYY-MM-DD is exactly 10 characters sanitized_value = datetime_str[:10] if dt_type == "date" else datetime_str # type: ignore return dt_type, sanitized_value, message # type: ignore else: return "cat ? str", value, message elif isinstance(value, SQLRecord): # SQLRecord is not converted to JSON return (f"cat[{value.__class__.__get_name_with_module__()}]", value, message) elif isinstance(value, (Path, UPath)): return "path", value.as_posix().rstrip("/"), message elif isinstance(value, Iterable) and not isinstance(value, (str, bytes)): if isinstance(value, (pd.Series, np.ndarray, pd.Categorical)): dtype = serialize_pandas_dtype(value.dtype) if dtype == "str": # ndarray doesn't know categorical, so there was no conscious choice # offer both options if isinstance(value, np.ndarray): dtype = "cat ? str" else: # suggest to create a categorical if there are few unique values message = suggest_categorical_for_str_iterable(value, key) if message: message = f" # {message}" return dtype, list(value), message if isinstance(value, dict): return "dict", value, message if len(value) > 0: # type: ignore first_element = next(iter(value)) first_element_type = type(first_element) # check that all elements are of the same type if all(isinstance(elem, first_element_type) for elem in value): if first_element_type is bool: return "list[bool]", value, message elif first_element_type is int: return "list[int]", value, message elif first_element_type is float: return "list[float]", value, message elif first_element_type is str: return ("list[cat ? 
str]", value, message) elif isinstance(first_element, SQLRecord): return ( f"list[cat[{first_element_type.__get_name_with_module__()}]]", value, message, ) if not mute: logger.warning(f"cannot infer feature type of: {value}, returning '?'") return "?", value, message def _filter_one_feature_clause( queryset: BasicQuerySet, feature: Feature, comparator: str, value: Any, ) -> BasicQuerySet: from lamindb.models import Artifact from lamindb.models.record import Record, RecordJson from lamindb.models.run import Run dtype_str = feature._dtype_str # non-categorical features if not dtype_str.startswith("cat") and not dtype_str.startswith("list[cat"): if comparator == "__isnull": if queryset.model is Artifact: from .artifact import ArtifactJsonValue value_subquery = ArtifactJsonValue.objects.filter( jsonvalue__feature=feature ).values("artifact_id") return queryset.exclude(id__in=Subquery(value_subquery)) if comparator in {"__startswith", "__contains"}: logger.important( f"currently not supporting `{comparator}`, using `__icontains` instead" ) comparator = "__icontains" use_numeric_sqlite = ( connections[feature._state.db].vendor == "sqlite" and comparator in {"__gt", "__lt", "__gte", "__lte"} and dtype_str in ("int", "float", "num") ) if use_numeric_sqlite: # Numeric comparison via json_extract + CAST (avoids lexicographic comparison) num_val_raw = RawSQL("CAST(json_extract(value, '$') AS REAL)", ()) if queryset.model is Record: value_qs = ( RecordJson.objects.using(queryset.db) .filter(feature=feature) .annotate(num_val=num_val_raw) .filter(**{f"num_val{comparator}": value}) ) return queryset.filter(values_json__id__in=value_qs) else: json_values = ( JsonValue.objects.using(queryset.db) .filter(feature=feature) .annotate(num_val=num_val_raw) .filter(**{f"num_val{comparator}": value}) ) accessor = ( "json_values" if queryset.model in {Artifact, Run} else "values_json" ) return queryset.filter(**{f"{accessor}__id__in": json_values}) else: if connections[feature._state.db].vendor == "sqlite" and comparator in { "__gt", "__lt", "__gte", "__lte", }: # SQLite: lexicographic comparison for non-numeric dtypes (date, datetime, str) value = str(value) filter_expr = {"feature": feature, f"value{comparator}": value} if queryset.model is Record: value_qs = RecordJson.objects.using(queryset.db).filter(**filter_expr) return queryset.filter(values_json__id__in=value_qs) else: json_values = JsonValue.objects.using(queryset.db).filter(**filter_expr) accessor = ( "json_values" if queryset.model in {Artifact, Run} else "values_json" ) return queryset.filter(**{f"{accessor}__id__in": json_values}) # categorical features elif isinstance(value, (str, SQLRecord, bool)): result = parse_dtype(dtype_str)[0] label_registry = result["registry"] _, value_field_name, filter_accessor_name = get_categorical_link_info( queryset.model, label_registry, instance=queryset.db ) if comparator == "__isnull": kwargs = {f"{filter_accessor_name}__feature": feature} if value: # True return queryset.exclude(**kwargs) else: return queryset.filter(**kwargs) # because SQL is sensitive to whether querying with __in or not # and might return multiple equivalent records for the latter # we distinguish cases in which we have multiple label matches vs. 
one label = None labels = None if isinstance(value, str): field_name = result["field"].field.name # users might query like so: # ln.Artifact.filter(experiment__contains="Experi") expression = {f"{field_name}{comparator}": value} labels = result["registry"].connect(queryset.db).filter(**expression) if len(labels) == 0: raise DoesNotExist( f"Did not find a {label_registry.__name__} matching `{field_name}{comparator}={value}`" ) elif len(labels) == 1: label = labels[0] elif isinstance(value, SQLRecord): label = value new_expression = {f"{filter_accessor_name}__feature": feature} if label is not None: new_expression[f"{filter_accessor_name}__{value_field_name}"] = label else: new_expression[f"{filter_accessor_name}__{value_field_name}__in"] = labels return queryset.filter(**new_expression) raise NotImplementedError def filter_with_feature_predicates( queryset: BasicQuerySet, predicates: list[FeaturePredicate], ) -> BasicQuerySet: qs = queryset pk_name = qs.model._meta.pk.name for predicate in predicates: feature = predicate.feature if qs.db is not None and feature._state.db != qs.db: feature = Feature.connect(qs.db).get(uid=feature.uid) if predicate.comparator == "__ne": subset = _filter_one_feature_clause( qs, feature=feature, comparator="", value=predicate.value ) qs = qs.exclude(**{f"{pk_name}__in": Subquery(subset.values(pk_name))}) else: qs = _filter_one_feature_clause( qs, feature=feature, comparator=predicate.comparator, value=predicate.value, ) return qs def filter_base( queryset: BasicQuerySet, _skip_validation: bool = True, **expression, ) -> BasicQuerySet: from lamindb.models import BasicQuerySet, QuerySet assert isinstance(queryset, BasicQuerySet) and not isinstance(queryset, QuerySet) # noqa: S101 keys_normalized = [key.split("__")[0] for key in expression] if not _skip_validation: validated = Feature.connect(queryset.db).validate( keys_normalized, field="name", mute=True ) if sum(validated) != len(keys_normalized): raise ValidationError( f"Some keys in the filter expression are not registered as features: {np.array(keys_normalized)[~validated]}" ) features = Feature.connect(queryset.db).filter(name__in=keys_normalized).distinct() qs = queryset for key, value in expression.items(): split_key = key.split("__") normalized_key = split_key[0] comparator = "" if len(split_key) == 2: comparator = f"__{split_key[1]}" feature = features.get(name=normalized_key) qs = _filter_one_feature_clause( qs, feature=feature, comparator=comparator, value=value ) if qs is queryset: raise NotImplementedError return qs def filter_with_features( queryset: BasicQuerySet, *queries, **expressions ) -> BasicQuerySet: from lamindb.models import BasicQuerySet, QuerySet feature_predicates = [q for q in queries if isinstance(q, FeaturePredicate)] non_feature_queries = [q for q in queries if not isinstance(q, FeaturePredicate)] if isinstance(queryset, QuerySet): # need to avoid infinite recursion because # filter_with_features is called in queryset.filter otherwise filter_kwargs = {"_skip_filter_with_features": True} else: filter_kwargs = {} registry = queryset.model qs = queryset if expressions: keys_normalized = [key.split("__")[0] for key in expressions] field_or_feature = keys_normalized[0] if field_or_feature in registry.__get_available_fields__(): qs = queryset.filter(*non_feature_queries, **expressions, **filter_kwargs) elif all( features_validated := Feature.objects.using(queryset.db).validate( keys_normalized, field="name", mute=True ) ): # filter_base requires qs to be BasicQuerySet qs = filter_base( 
queryset._to_class(BasicQuerySet, copy=True), _skip_validation=True, **expressions, )._to_class(type(queryset), copy=False) qs = qs.filter(*non_feature_queries, **filter_kwargs) else: features = ", ".join(sorted(np.array(keys_normalized)[~features_validated])) message = f"feature names: {features}" avail_fields = registry.__get_available_fields__() fields = ", ".join(sorted(avail_fields)) raise InvalidArgument( f"You can query either by available fields: {fields}\n" f"Or fix invalid {message}" ) else: # Always route through `.filter()` here (even when empty) so the # standard QuerySet path can inject default branch constraints. qs = queryset.filter(*non_feature_queries, **filter_kwargs) if feature_predicates: qs = filter_with_feature_predicates( qs._to_class(BasicQuerySet, copy=True), feature_predicates, )._to_class(type(qs), copy=False) return qs class FeatureManager: """Feature manager.""" def __init__(self, sqlrecord: Artifact | Run | Record): # host is the sqlrecord that the label manager is attached to # we might rename _host to _sqlrecord in the future self._host = sqlrecord self._slots: dict[str, Schema] | None = None self._accessor_by_registry_ = None def __repr__(self) -> str: return self.describe(return_str=True) # type: ignore def describe(self, return_str: bool = False) -> str | None: """Pretty print features. This is what `artifact.describe()` calls under the hood. """ dataset_features_tree, external_features_tree = describe_features(self._host) # type: ignore tree = describe_header(self._host) if dataset_features_tree: tree.add(dataset_features_tree) if external_features_tree: tree.add(external_features_tree) return format_rich_tree(tree, return_str=return_str) def get_values(self, external_only: bool = False) -> dict[str, Any]: """Get features as a dictionary. Includes annotation with internal and external feature values. Args: external_only: If `True`, only return external feature annotations. """ return get_features_data(self._host, to_dict=True, external_only=external_only) # type: ignore def __getitem__( self, feature: str ) -> ( DtypeObject | BasicQuerySet | SQLRecord | SQLRecordList | dict[str, DtypeObject | BasicQuerySet | SQLRecord | SQLRecordList] ): """Get values by feature name. Args: feature: Feature name. Returns: - For categorical features, return value records. - For non-categorical features, return values. Example:: artifact.features['tissue'] #> Tissue(id=1, name='brain', ...) 
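# for a non-categorical feature, the value itself is returned
# (illustrative, assuming a hypothetical 'temperature' feature)
artifact.features['temperature']
#> 27.6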
""" from collections import defaultdict import pandas as pd from .query_set import SQLRecordList host_name = self._host.__class__.__name__ host_id = self._host.id host_db = self._host._state.db feature_records = list(Feature.objects.using(host_db).filter(name=feature)) if not feature_records: raise ValidationError(f"Feature with name {feature} not found") # group cat feature_records by their registry registry_to_features = defaultdict(list) for feature_record in feature_records: parsed_dtype = parse_dtype(feature_record._dtype_str) if len(parsed_dtype) > 0: # categorical features registry = parsed_dtype[0]["registry"] registry_name = registry.__get_name_with_module__() registry_to_features[(registry, registry_name)].append( feature_record.id ) else: # non-categorical features registry_to_features[(JsonValue, "JsonValue")].append(feature_record.id) value_records = {} # query once per registry with all feature_ids for (registry, registry_name), feature_ids in registry_to_features.items(): if registry_name == "JsonValue": # for non-categorical features filters = { "feature_id__in": feature_ids, f"links_{host_name.lower()}__{host_name.lower()}_id": host_id, } dtype_values = ( registry.objects.using(host_db) .filter(**filters) .distinct() .values_list("feature___dtype_str", "value") ) feature_values_qs = [] for dtype, value in dtype_values: if dtype == "date": value = pd.to_datetime(value, format="ISO8601").date() elif dtype == "datetime": value = datetime.fromisoformat(value) feature_values_qs.append(value) else: # determine links name once per registry links_value_name = ( "links_value" if registry_name == host_name else f"links_{host_name.lower()}" ) filters = { f"{links_value_name}__feature_id__in": feature_ids, f"{links_value_name}__{host_name.lower()}_id": host_id, } feature_values_qs = ( registry.objects.using(host_db).filter(**filters).distinct() ) if len(feature_values_qs) == 1: value_records[registry_name] = feature_values_qs[0] elif len(feature_values_qs) > 1: if feature_record.dtype_as_str.startswith("list["): value_records[registry_name] = SQLRecordList(feature_values_qs) else: value_records[registry_name] = feature_values_qs return ( next(iter(value_records.values())) if len(value_records) == 1 else value_records ) @property def slots(self) -> dict[str, Schema]: """Features by schema slot. 
Example:: artifact.features.slots #> {'var': , 'obs': } """ if self._slots is None: self._slots = get_schema_by_slot_(self._host) return self._slots @property def _accessor_by_registry(self): """Accessor by registry.""" if self._accessor_by_registry_ is None: self._accessor_by_registry_ = get_accessor_by_registry_(self._host) return self._accessor_by_registry_ def _add_label_feature_links( self, features_labels, ): host_name = self._host.__class__.__name__.lower() host_is_record = host_name == "record" instance = getattr(self._host._state, "db", None) for class_name, registry_features_labels in features_labels.items(): if not host_is_record and class_name == "Collection": continue registry_features_labels[0][0] label_registry = registry_features_labels[0][1].__class__ link_model, value_field_name, _ = get_categorical_link_info( self._host.__class__, label_registry, instance=instance ) field_name = f"{value_field_name}_id" host_fk = f"{host_name}_id" links = [ link_model( **{ host_fk: self._host.id, "feature_id": ftr.id, field_name: label.id, } ) for (ftr, label) in registry_features_labels ] try: save(links, ignore_conflicts=False) except Exception: save(links, ignore_conflicts=True) def _get_feature_objects(self, dictionary, feature_field): from ..core._functions import get_current_tracked_run registry = feature_field.field.model keys = list(dictionary.keys()) feature_objects = registry.from_values(keys, field=feature_field, mute=True) feature_objects = keep_topmost_matches(feature_objects) if len(feature_objects) != len(keys): not_validated_keys = [ key for key in keys if key not in feature_objects.to_list("name") ] not_validated_keys_dtype_message = [ (key, infer_convert_dtype_key_value(key, dictionary[key])) for key in not_validated_keys ] run = get_current_tracked_run() if run is not None: name = f"{run.transform.kind}[{run.transform.key}]" type_hint = f""" feature_type = ln.Feature(name='{name}', is_type=True).save()""" elements = [type_hint] type_kwarg = ", type=feature_type" else: elements = [] type_kwarg = "" elements += [ f" ln.Feature(name='{key}', dtype='{dtype}'{type_kwarg}).save(){message}" for key, (dtype, _, message) in not_validated_keys_dtype_message ] hint = "\n".join(elements) msg = ( f"These keys could not be validated: {not_validated_keys}\n" f"Here is how to create a feature:\n\n{hint}" ) raise ValidationError(msg) return feature_objects def _resolve_feature_value_dictionary( self, values: dict[str | Feature, Any], ) -> tuple[dict[str, Any], dict[str, Any], list[Feature], dict[str, Any]]: """Normalize a feature-value dictionary to support `str` and `Feature` keys. Returns: normalized_values: Values keyed by feature name (used by schema validators). string_key_values: Subset of values that came from string keys only. explicit_features: Resolved Feature objects passed explicitly as keys. values_by_feature_uid: Values keyed by feature uid (used for exact lookup). """ host_db = self._host._state.db normalized_values: dict[str, Any] = {} string_key_values: dict[str, Any] = {} explicit_features: list[Feature] = [] values_by_feature_uid: dict[str, Any] = {} seen_explicit_uids: set[str] = set() for key, value in values.items(): if isinstance(key, Feature): if key._state.adding: raise ValidationError( f"Please save feature '{key.name}' before annotation." ) feature = key # Mirror feature predicate resolution: resolve Feature objects on active DB. 
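# (Illustrative, hedged: if the host lives on a non-default instance, a Feature record
# resolved against another database is re-fetched via Feature.connect(host_db).get(uid=...)
# so that the ids used for linking below are valid on the host's database.)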
if host_db is not None and feature._state.db != host_db: feature = Feature.connect(host_db).get(uid=feature.uid) if feature.uid in values_by_feature_uid and ( values_by_feature_uid[feature.uid] != value ): raise ValidationError( f"Conflicting values for feature '{feature.name}'." ) values_by_feature_uid[feature.uid] = value if feature.uid not in seen_explicit_uids: explicit_features.append(feature) seen_explicit_uids.add(feature.uid) if ( feature.name in normalized_values and normalized_values[feature.name] != value ): raise ValidationError( f"Conflicting values for feature name '{feature.name}'." ) normalized_values[feature.name] = value elif isinstance(key, str): if key in normalized_values and normalized_values[key] != value: raise ValidationError( f"Conflicting values for feature name '{key}'." ) normalized_values[key] = value string_key_values[key] = value else: raise TypeError( "Feature-value dictionary keys must be `str` or `Feature`, " f"got {type(key)}" ) return ( normalized_values, string_key_values, explicit_features, values_by_feature_uid, ) @staticmethod def _merge_feature_objects( explicit_features: list[Feature], looked_up_features, ) -> list[Feature]: merged: list[Feature] = [] seen_uids: set[str] = set() for feature in explicit_features: if feature.uid not in seen_uids: merged.append(feature) seen_uids.add(feature.uid) for feature in looked_up_features: if feature.uid not in seen_uids: merged.append(feature) seen_uids.add(feature.uid) return merged @staticmethod def _raise_not_validated_values( not_validated_values: dict[str, tuple[str, list[str]]], ) -> None: if not not_validated_values: return None hint = "" for key, (field, values_list) in not_validated_values.items(): key_str = "ln.Record" if key == "Record" else key create_true = ", create=True" if "bionty." 
not in key else "" hint += f" records = {key_str}.from_values({values_list}, field='{field}'{create_true}).save()\n" msg = ( f"These values could not be validated: {dict(not_validated_values)}\n" f"Here is how to create records for them:\n\n{hint}" ) raise ValidationError(msg) def _collect_record_feature_writes( self, *, record, feature_objects: list[Feature], dictionary: dict[str, Any], values_by_feature_uid: dict[str, Any] | None, feature_json_values: list, links_by_model: dict, not_validated_values: dict[str, tuple[str, list[str]]], resolved_records_by_feature_id: dict[int, dict[Any, list[SQLRecord]]] | None = None, ) -> None: from ..base.dtypes import is_iterable_of_sqlrecord from .can_curate import CanCurate from .record import RecordJson for feature in feature_objects: if ( values_by_feature_uid is not None and feature.uid in values_by_feature_uid ): value = values_by_feature_uid[feature.uid] else: value = dictionary[feature.name] if value is None: continue if not ( feature.dtype_as_str.startswith("cat") or feature.dtype_as_str.startswith("list[cat") ): _, converted_value, _ = infer_convert_dtype_key_value( key=feature.name, value=value, dtype_str=feature.dtype_as_str ) feature_json_values.append( RecordJson(record=record, feature=feature, value=converted_value) ) continue if isinstance(value, SQLRecord) or is_iterable_of_sqlrecord(value): if isinstance(value, SQLRecord): label_records = [value] else: label_records = value # type: ignore else: if isinstance(value, str): values = [value] # type: ignore else: values = value # type: ignore if feature._dtype_str == "cat": feature._dtype_str = "cat[ULabel]" feature.save() result = { "registry_str": "ULabel", "registry": ULabel, "field": ULabel.name, } else: result = parse_dtype(feature._dtype_str)[0] # Fast path for dataframe-originated record batches: # `bulk_set_features_in_records()` now runs a single `DataFrameCurator` # pass and pre-resolves categorical values to label records. # # The cache key is feature.id and the nested key is the normalized # raw value found in the dataframe. Using this cache here avoids # running per-row `validate()` + `from_values()` calls, which used # to duplicate work already done by the curator. cached_records = None if ( resolved_records_by_feature_id is not None and feature.id in resolved_records_by_feature_id ): cached_records = resolved_records_by_feature_id[feature.id] if cached_records is not None: if isinstance(value, str): values_for_lookup = [value] else: values_for_lookup = value # type: ignore if isinstance(values_for_lookup, (list, tuple, np.ndarray, set)): values_for_lookup = list(values_for_lookup) else: values_for_lookup = [values_for_lookup] label_records = [] not_validated_for_feature = [] for lookup_value in values_for_lookup: normalized_lookup = ( lookup_value.item() if isinstance(lookup_value, np.generic) else lookup_value ) mapped_records = cached_records.get(normalized_lookup) if mapped_records is None: # Keep the same error aggregation behavior as before: # unresolved categorical values are collected and raised # in one ValidationError after all records are processed. not_validated_for_feature.append(normalized_lookup) else: label_records.extend(mapped_records) if not_validated_for_feature: not_validated_values[result["registry_str"]] = ( # type: ignore result["field_str"], not_validated_for_feature, ) elif issubclass(result["registry"], CanCurate): # type: ignore # Fallback path for non-batch callers (e.g. direct # `record.features.add_values()` on an individual record). 
# # Those flows do not build dataframe-level caches, so we keep # the original registry-backed validation and resolution logic. # This branch should not be hot for the dataframe batch import # path because that path provides `resolved_records_by_feature_id`. validated = result["registry"].validate( # type: ignore values, field=result["field"], mute=True ) values_array = np.array(values) validated_values = values_array[validated] if validated.sum() != len(values): not_validated_values[result["registry_str"]] = ( # type: ignore result["field_str"], values_array[~validated].tolist(), ) label_records = result["registry"].from_values( # type: ignore validated_values, field=result["field"], mute=True ) else: label_records = result["registry"].filter( # type: ignore **{f"{result['field_str']}__in": values} ) if len(label_records) != len(values): raise ValidationError( f"Some of these values for {result['registry_str']} do not exist: {values}" ) for label_record in label_records: if label_record._state.adding: raise ValidationError( f"Please save {label_record} before annotation." ) link_model, value_field_name, _ = get_categorical_link_info( record.__class__, label_record.__class__, instance=getattr(record._state, "db", None), ) links_by_model[link_model].append( link_model( record_id=record.id, feature_id=feature.id, **{f"{value_field_name}_id": label_record.id}, ) ) return None def add_values( self, values: dict[str | Feature, Any], feature_field: FieldAttr = Feature.name, schema: Schema = None, ) -> None: """Add values for features. Like `set_values()`, but slightly more performant because it does not remove previously-existing feature annotations at the danger of violating multiplicity of categorical dtypes (see warning below). Args: values: A dictionary of keys (features) & values (labels, strings, numbers, booleans, datetimes, etc.). Keys can be feature names (`str`) or `Feature` objects. If a value is `None`, it will be skipped. feature_field: The field of a registry to map the keys of the `values` dictionary in case strings are passed. schema: Schema to validate against. .. warning:: If you run:: obj.features.add_values({"my_categorical": "my_category1"}) obj.features.add_values({"my_categorical": "my_category2"}) you will annotate the object with two different values for the same feature even if its dtype is not a `list`. That is, `add_values()` does **not** validate the `dtype` of a categorical feature across multiple calls. To avoid this, please use `set_values()`. .. dropdown:: Why is multiplicity of categorical dtypes not validated? For simple data types like `int`, `date`, `dict`, etc., `add_values()` ensures that there is only one value for a given `Record` and feature. But for categorical/relational features or for simple dtypes in the context of annotating an `Artifact`, the underlying link table allows linking multiple values to the same object and feature, so that both `list` dtypes and `set`-like aggregations on an object can be represented with relational integrity. 
Examples:: # the following needs to be allowed even if `cell_type` has dtype `CellType`, and not `list[CellType]` # this is because the artifact might be a `DataFrame` with a column `cell_type` that has dtype `CellType` # and the annotations on the artifact-level represent the aggregation of all values in that column artifact.features.add_values({"cell_type": "B cell"}) artifact.features.add_values({"cell_type": "T cell"}) artifact.features.add_values({"cell_type": "NK cell"}) # now an example for Record # while a record will never represent an aggregation, we still want to express # lists of values with relational integrity, for instance, this record.features.add_values({"cell_types": ["B cell", "T cell", "NK cell"]}) """ from lamindb.curators.core import ExperimentalDictCurator host_is_record = self._host.__class__.__name__ == "Record" host_is_artifact = self._host.__class__.__name__ == "Artifact" # rename to distinguish from the values inside the dict ( dictionary, string_key_values, explicit_features, values_by_feature_uid, ) = self._resolve_feature_value_dictionary(values) keys = dictionary.keys() if isinstance(keys, DICT_KEYS_TYPE): keys = list(keys) # type: ignore if ( host_is_record and self._host.type is not None and self._host.type.schema is not None # type: ignore ): assert schema is None, "Cannot pass schema if record.type has schema." schema = self._host.type.schema # type: ignore if host_is_artifact: if self._get_external_schema(): raise ValueError("Cannot add values if artifact has external schema.") if schema is not None: member_ids = set(schema.members.values_list("id", flat=True)) features_not_in_schema = [ feature.name for feature in explicit_features if feature.id not in member_ids ] if features_not_in_schema: raise ValidationError( "These feature keys are not in the provided schema: " f"{features_not_in_schema}" ) looked_up_features = schema.members.filter(name__in=keys) feature_objects = self._merge_feature_objects( explicit_features, looked_up_features ) else: if string_key_values: looked_up_features = self._get_feature_objects( string_key_values, feature_field ) else: looked_up_features = Feature.objects.none() feature_objects = self._merge_feature_objects( explicit_features, looked_up_features ) schema = Schema(feature_objects) ExperimentalDictCurator( dictionary, schema, require_saved_schema=False ).validate() return self._add_values( feature_objects, dictionary, values_by_feature_uid=values_by_feature_uid, ) def _add_values( self, feature_objects, dictionary, *, values_by_feature_uid: dict[str, Any] | None = None, ): from ..base.dtypes import is_iterable_of_sqlrecord from .can_curate import CanCurate host_is_record = self._host.__class__.__name__ == "Record" if host_is_record: feature_json_values: list[SQLRecord] = [] links_by_model: dict[type[SQLRecord], list[SQLRecord]] = defaultdict(list) record_not_validated_values: dict[str, tuple[str, list[str]]] = {} self._collect_record_feature_writes( record=self._host, feature_objects=feature_objects, dictionary=dictionary, values_by_feature_uid=values_by_feature_uid, feature_json_values=feature_json_values, links_by_model=links_by_model, not_validated_values=record_not_validated_values, ) self._raise_not_validated_values(record_not_validated_values) if feature_json_values: save(feature_json_values) for links in links_by_model.values(): try: save(links, ignore_conflicts=False) except Exception: save(links, ignore_conflicts=True) return None features_labels = defaultdict(list) feature_json_values = [] 
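# Descriptive note (hedged): `features_labels` accumulates (feature, label) pairs keyed by
# registry string, e.g. {"ULabel": [(<Feature experiment>, <ULabel 'Experiment 1'>)],
# "bionty.CellType": [(<Feature cell_type>, <CellType 'neuron'>)]} (example data is
# hypothetical); `feature_json_values` collects JsonValue rows for non-categorical values.
# Both are persisted after the loop below.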
not_validated_values: dict[str, tuple[str, list[str]]] = {} for feature in feature_objects: if ( values_by_feature_uid is not None and feature.uid in values_by_feature_uid ): value = values_by_feature_uid[feature.uid] else: value = dictionary[feature.name] if value is None: continue if not ( feature.dtype_as_str.startswith("cat") or feature.dtype_as_str.startswith("list[cat") ): _, converted_value, _ = infer_convert_dtype_key_value( key=feature.name, value=value, dtype_str=feature.dtype_as_str ) filter_kwargs = {"feature": feature, "value": converted_value} feature_value, _ = JsonValue.get_or_create(**filter_kwargs) feature_json_values.append(feature_value) else: if isinstance(value, SQLRecord) or is_iterable_of_sqlrecord(value): if isinstance(value, SQLRecord): label_records = [value] else: label_records = value # type: ignore for record in label_records: if record._state.adding: raise ValidationError( f"Please save {record} before annotation." ) features_labels[ record.__class__.__get_name_with_module__() ].append((feature, record)) else: if isinstance(value, str): values = [value] # type: ignore else: values = value # type: ignore if feature._dtype_str == "cat": new_dtype_str = feature._dtype_str + "[ULabel]" feature._dtype_str = new_dtype_str feature.save() result = { "registry_str": "ULabel", "registry": ULabel, "field": ULabel.name, } else: result = parse_dtype(feature._dtype_str)[0] if issubclass(result["registry"], CanCurate): # type: ignore validated = result["registry"].validate( # type: ignore values, field=result["field"], mute=True ) values_array = np.array(values) validated_values = values_array[validated] if validated.sum() != len(values): not_validated_values[result["registry_str"]] = ( # type: ignore result["field_str"], values_array[~validated].tolist(), ) label_records = result["registry"].from_values( # type: ignore validated_values, field=result["field"], mute=True ) else: label_records = result["registry"].filter( # type: ignore **{f"{result['field_str']}__in": values} ) if len(label_records) != len(values): raise ValidationError( f"Some of these values for {result['registry_str']} do not exist: {values}" ) features_labels[result["registry_str"]] += [ # type: ignore (feature, label_record) for label_record in label_records ] # TODO: given we had already validated prior to calling _add_values, this block below should never be reached # refactor this out if possible self._raise_not_validated_values(not_validated_values) if features_labels: self._add_label_feature_links(features_labels) if feature_json_values: to_insertjson_values = [ record for record in feature_json_values if record._state.adding ] if to_insertjson_values: save(to_insertjson_values) links = [ self._host.json_values.through( **{ f"{self._host.__class__.__name__.lower()}_id": self._host.id, "jsonvalue_id": json_value.id, } ) for json_value in feature_json_values ] # a link might already exist, hence ignore_conflicts is needed save(links, ignore_conflicts=True) def set_values( self, values: dict[str | Feature, Any], feature_field: FieldAttr = Feature.name, schema: Schema = None, ) -> None: """Set values for features. Note that, in the context of annotating an `Artifact`, this does **not** affect the annotations derived from the artifact's dataset features. It only sets the artifact's external feature annotations. Args: values: A dictionary of keys (features) & values (labels, strings, numbers, booleans, datetimes, etc.). Keys can be feature names (`str`) or `Feature` objects. 
If a value is `None`, it will be skipped. feature_field: The field of a registry to map the keys of the `values` dictionary in case strings are passed. schema: Schema to validate against. Examples: Here is how to annotate an artifact ad hoc:: artifact.features.set_values({ "species": "human", "scientist": ['Barbara McClintock', 'Edgar Anderson'], "temperature": 27.6, "experiment": "Experiment 1" }) Query artifacts by features:: ln.Artifact.filter(scientist="Barbara McClintock") If your feature names are ambiguous, you can use a `Feature` object to disambiguate:: temperature = ln.Feature.get(name="temperature", type__name="my_feature_type") # to set feature values artifact.features.set_values({temperature: 0.5}) # temperature is the feature object # to query by feature values ln.Artifact.filter(temperature == 0.5) # instead of temperature=0.5 You can pass a schema to validate the dictionary:: schema = ln.Schema([ln.Feature(name="species", dtype=str).save()]).save() artifact.features.set_values({"species": "bird"}, schema=schema) Also see :class:`lamindb.Artifact.features`, :class:`lamindb.Record.features`, and :class:`lamindb.Run.features`. """ from lamindb.curators.core import ExperimentalDictCurator host_is_record = self._host.__class__.__name__ == "Record" host_is_artifact = self._host.__class__.__name__ == "Artifact" # rename to distinguish from the values inside the dict ( dictionary, string_key_values, explicit_features, values_by_feature_uid, ) = self._resolve_feature_value_dictionary(values) keys = dictionary.keys() if isinstance(keys, DICT_KEYS_TYPE): keys = list(keys) # type: ignore if ( host_is_record and self._host.type is not None and self._host.type.schema is not None # type: ignore ): assert schema is None, "Cannot pass schema if record.type has schema." schema = self._host.type.schema # type: ignore if host_is_artifact: schema = self._get_external_schema() if schema is not None: ExperimentalDictCurator(dictionary, schema).validate() member_ids = set(schema.members.values_list("id", flat=True)) features_not_in_schema = [ feature.name for feature in explicit_features if feature.id not in member_ids ] if features_not_in_schema: raise ValidationError( "These feature keys are not in the provided schema: " f"{features_not_in_schema}" ) looked_up_features = schema.members.filter(name__in=keys) feature_objects = self._merge_feature_objects( explicit_features, looked_up_features ) else: if string_key_values: looked_up_features = self._get_feature_objects( string_key_values, feature_field ) else: looked_up_features = Feature.objects.none() feature_objects = self._merge_feature_objects( explicit_features, looked_up_features ) self._remove_values() self._add_values( feature_objects, dictionary=dictionary, values_by_feature_uid=values_by_feature_uid, ) def _get_external_schema(self) -> Schema | None: external_schema = None if self._host.otype is None: external_schema = self._host.schema elif self._host.schema is not None: external_schema = self._host.schema.slots.get("__external__", None) return external_schema def remove_values( self, feature: ( str | Feature | list[str | Feature] | dict[str | Feature, Any | None] | None ) = None, *, value: Any | None = None, ) -> None: """Remove values for features. Args: feature: Indicate one or several features for which to remove values. If `None`, values for all external features will be removed. Also supports a dictionary mapping feature keys to values to remove, e.g. `{feature: value}`. 
value: An optional value to restrict removal to a single value. """ host_name = self._host.__class__.__name__.lower() host_is_artifact = host_name == "artifact" if host_is_artifact: external_schema = self._get_external_schema() if external_schema is not None: raise ValueError( "Cannot remove values if artifact has external schema." ) return self._remove_values( feature, value=value, ) def _remove_values( self, feature: ( str | Feature | list[str | Feature] | dict[str | Feature, Any | None] | None ) = None, *, value: Any | None = None, ) -> None: from django.apps import apps host_name = self._host.__class__.__name__.lower() host_is_record = host_name == "record" host_is_artifact = host_name == "artifact" if isinstance(feature, dict): if value is not None: raise ValueError( "Pass either `value=` or per-feature values via a dictionary, not both." ) for one_feature, one_value in feature.items(): self._remove_values(one_feature, value=one_value) return if feature is None: features = get_features_data( self._host, to_dict=True, external_only=True ).keys() elif not isinstance(feature, list): features = [feature] else: features = feature for feature in features: if isinstance(feature, str): feature_record = Feature.get(name=feature) else: feature_record = feature if feature_record._state.adding: raise ValidationError( f"Please save feature '{feature_record.name}' before annotation." ) if ( self._host._state.db is not None and feature_record._state.db != self._host._state.db ): feature_record = Feature.connect(self._host._state.db).get( uid=feature_record.uid ) if host_is_artifact: for schema in self.slots.values(): if feature_record in schema.members: raise ValueError("Cannot remove values for dataset features.") filter_kwargs = {"feature": feature_record} none_message = f"with value {value!r} " if value is not None else "" if feature_record._dtype_str.startswith(("cat[", "list[cat")): # type: ignore feature_registry = parse_dtype(feature_record._dtype_str)[0][ "registry_str" ] if "." in feature_registry: parts = feature_registry.split(".") app_label = parts[0] entity_name = parts[-1] else: app_label = "lamindb" entity_name = feature_registry host_name = self._host.__class__.__name__ link_model_name = f"{host_name}{entity_name}" link_model = apps.get_model(app_label, link_model_name) filter_kwargs[host_name.lower()] = self._host if value is not None: if not isinstance(value, SQLRecord): raise TypeError( f"Expected a record for removing categorical feature value, " f"got {value} of type {type(value)}" ) assert not host_is_record, "Only artifacts support passing a value." filter_kwargs[entity_name.lower()] = value link_records = link_model.objects.filter(**filter_kwargs) if not link_records.exists(): value_msg = f"with value {value!r} " if value is not None else "" logger.warning( f"no feature '{feature_record.name}' {value_msg}found on " f"{host_name.lower()} '{self._host.uid}'!" ) return link_records.delete() else: if value is not None: filter_kwargs["value"] = value if host_is_record: feature_values = self._host.values_json.filter(**filter_kwargs) else: feature_values = self._host.json_values.filter(**filter_kwargs) if not feature_values.exists(): logger.warning( f"no feature '{feature_record.name}' {none_message}found on {self._host.__class__.__name__.lower()} '{self._host.uid}'!" 
) return if host_is_record: feature_values.delete(permanent=True) else: # the below might leave a dangling feature_value record # but we don't want to pay the price of making another query just to remove this annotation # we can clean the JsonValue registry periodically if we want to self._host.json_values.remove(*feature_values) def _add_schema(self, schema: Schema, slot: str) -> None: """Annotate artifact with a schema. Args: schema: `Schema` A schema record. slot: `str` The slot that marks where the schema is stored in the artifact. """ # TODO: deprecate as soon as we have the Schema-based curators if self._host._state.adding: raise ValueError( "Please save the artifact or collection before adding a feature set!" ) host_db = self._host._state.db schema.save(using=host_db) kwargs = { "artifact_id": self._host.id, "schema": schema, "slot": slot, } link_record = ( self._host.schemas.through.objects.using(host_db) .filter(**kwargs) .one_or_none() ) if link_record is None: self._host.schemas.through(**kwargs).save(using=host_db) if slot in self.slots: logger.debug(f"replaced existing {slot} feature set") self._slots[slot] = schema # type: ignore def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None): """Transfer features from a artifact or collection.""" # This only covers feature sets if transfer_logs is None: transfer_logs = {"mapped": [], "transferred": [], "run": None} from lamindb import settings using_key = settings._using_key for slot, schema in data.features.slots.items(): # type: ignore try: members = schema.members except ModuleWasntConfigured as err: logger.warning(f"skipping transfer of {slot} schema because {err}") continue if len(members) == 0: continue if len(members) > settings.annotation.n_max_records: logger.warning( f"skipping creating {len(members)} > {settings.annotation.n_max_records} new {members[0].__class__.__name__} records" ) schema_self = schema schema_exists = Schema.filter(hash=schema_self.hash).one_or_none() if schema_exists is not None: schema_self = schema_exists else: schema_self.save() else: registry = members[0].__class__ # note here the features are transferred based on an unique field field = REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), "uid") # this will be e.g. 
be a list of ontology_ids or uids member_uids = list(members.values_list(field, flat=True)) validated = registry.validate(member_uids, field=field, mute=True) new_members_uids = list(compress(member_uids, ~validated)) new_members = members.filter(**{f"{field}__in": new_members_uids}) n_new_members = len(new_members) if len(members) > settings.annotation.n_max_records: logger.warning( f"skipping creating {n_new_members} > {settings.annotation.n_max_records} new {registry.__name__} records" ) if n_new_members > 0: # transfer foreign keys needs to be run before transfer to default db transfer_fk_to_default_db_bulk( new_members, using_key, transfer_logs=transfer_logs ) for feature in new_members: # not calling save=True here as in labels, because want to # bulk save below # transfer_fk is set to False because they are already transferred # in the previous step transfer_fk_to_default_db_bulk transfer_to_default_db( feature, using_key, transfer_fk=False, transfer_logs=transfer_logs, ) save( new_members, ignore_conflicts=True ) # conflicts arising from existing records are ignored # create a new feature set from feature values using the same uid schema_self = Schema.from_values( member_uids, field=getattr(registry, field) ) if schema_self is None: if hasattr(registry, "organism_id"): logger.warning( f"Schema is not transferred, check if organism is set correctly: {schema}" ) continue # make sure the uid matches if schema is composed of same features if schema_self.hash == schema.hash: schema_self.uid = schema.uid logger.info(f"saving {slot} schema: {schema_self}") try: self._host.features._add_schema(schema_self, slot) except IntegrityError: logger.warning( f"updating annotation of artifact {self._host.uid} with feature set for slot: {slot}" ) self._host.schemas.through.objects.get( artifact_id=self._host.id, slot=slot ).delete() self._host.features._add_schema(schema_self, slot) def bulk_set_features_in_records(records: Iterable[Record]) -> None: """Bulk-set lazy feature dictionaries for records. Intended for records created via `Record(features=...)` and persisted with `ln.save([...])`. """ import pandas as pd from lamindb.curators.core import DataFrameCurator records_with_features = [ record for record in records if hasattr(record, "_features") and record._features is not None ] if len(records_with_features) == 0: return None batch_schema: Schema | None = None prepared_records: list[ tuple[Record, FeatureManager, dict[str, Any], list[Feature], dict[str, Any]] ] = [] prepared_rows: list[dict[str, Any]] = [] for record in records_with_features: schema = None if record.type is not None and record.type.schema is not None: schema = record.type.schema if schema is None: raise ValidationError( "Bulk setting features in records requires all records to have the same non-null type schema." ) if batch_schema is None: batch_schema = schema elif schema.id != batch_schema.id: raise ValidationError( "Bulk setting features in records requires all records to have the same type schema." 
) manager = record.features ( dictionary, _, explicit_features, values_by_feature_uid, ) = manager._resolve_feature_value_dictionary(record._features) prepared_rows.append(dictionary) prepared_records.append( (record, manager, dictionary, explicit_features, values_by_feature_uid) ) assert batch_schema is not None # noqa: S101 schema_features = list(batch_schema.members.all()) dataframe = pd.DataFrame(prepared_rows) for feature in schema_features: if ( feature.name in dataframe and feature.dtype_as_str.startswith("cat") and not feature.dtype_as_str.startswith("list[cat") ): dataframe[feature.name] = dataframe[feature.name].astype("category") # Single-pass dataframe curation: # validate schema and resolve categoricals once for the entire batch. # # The resolved label records are then reused below when creating per-record # link rows, avoiding repeated registry calls for each row. curator = DataFrameCurator(dataframe, batch_schema) curator.validate() members_by_name: dict[str, list[Feature]] = defaultdict(list) schema_member_ids: set[int] = set() resolved_records_by_feature_id: dict[int, dict[Any, list[SQLRecord]]] = {} for feature in schema_features: members_by_name[feature.name].append(feature) schema_member_ids.add(feature.id) if not ( feature.dtype_as_str.startswith("cat") or feature.dtype_as_str.startswith("list[cat") ): continue cat_vector = curator.cat._cat_vectors.get(feature.name) if cat_vector is None or cat_vector.records is None: continue # Build lookup cache: # feature.id -> raw value -> [resolved label records] # # We intentionally keep a list of records per value to support # list-categorical and potential multi-match cases consistently with # existing link creation semantics. cache_for_feature: dict[Any, list[SQLRecord]] = defaultdict(list) for label_record in cat_vector.records: key = getattr(label_record, cat_vector._field_name) normalized_key = key.item() if isinstance(key, np.generic) else key cache_for_feature[normalized_key].append(label_record) resolved_records_by_feature_id[feature.id] = dict(cache_for_feature) feature_json_values: list[SQLRecord] = [] links_by_model: dict[type[SQLRecord], list[SQLRecord]] = defaultdict(list) not_validated_values: dict[str, tuple[str, list[str]]] = {} for ( record, manager, dictionary, explicit_features, values_by_feature_uid, ) in prepared_records: keys = list(dictionary.keys()) features_not_in_schema = [ feature.name for feature in explicit_features if feature.id not in schema_member_ids ] if features_not_in_schema: raise ValidationError( "These feature keys are not in the provided schema: " f"{features_not_in_schema}" ) looked_up_features = [ feature for key in keys for feature in members_by_name.get(key, []) ] feature_objects = manager._merge_feature_objects( explicit_features, looked_up_features ) manager._collect_record_feature_writes( record=record, feature_objects=feature_objects, dictionary=dictionary, values_by_feature_uid=values_by_feature_uid, feature_json_values=feature_json_values, links_by_model=links_by_model, not_validated_values=not_validated_values, resolved_records_by_feature_id=resolved_records_by_feature_id, ) FeatureManager._raise_not_validated_values(not_validated_values) if feature_json_values: save(feature_json_values) for links in links_by_model.values(): try: save(links, ignore_conflicts=False) except Exception: save(links, ignore_conflicts=True) for record in records_with_features: del record._features return None ================================================ FILE: lamindb/models/_from_values.py 
================================================ from __future__ import annotations from typing import TYPE_CHECKING from lamin_utils import colors, logger if TYPE_CHECKING: from pandas import DataFrame, Index from lamindb.base.types import FieldAttr, ListLike from .query_set import SQLRecordList from .sqlrecord import SQLRecord # The base function for `from_values` def _from_values( iterable: ListLike, field: FieldAttr, *, create: bool = False, organism: SQLRecord | str | None = None, source: SQLRecord | None = None, standardize: bool = True, from_source: bool = True, mute: bool = False, **filter_kwargs, ) -> SQLRecordList: """Get or create records from iterables.""" from .query_set import SQLRecordList registry = field.field.model # type: ignore organism_record = get_organism_record_from_field(field, organism, values=iterable) # TODO: the create is problematic if field is not a name field if create: create_kwargs = {} if organism_record: create_kwargs["organism"] = organism_record return SQLRecordList( [ registry(**{field.field.name: value}, **create_kwargs) for value in iterable ] ) # type: ignore iterable_idx = index_iterable(iterable) # returns existing records & non-existing values records, nonexist_values, msg = get_existing_records( iterable_idx=iterable_idx, field=field, organism=organism_record, mute=mute, **filter_kwargs, ) # new records to be created based on new values if len(nonexist_values) > 0: if from_source and registry.__base__.__name__ == "BioRecord": # if can and needed, get organism record from the existing records if ( organism_record is None and len(records) > 0 and registry.require_organism() ): organism_record = records[0].organism records_public, unmapped_values = create_records_from_source( iterable_idx=nonexist_values, field=field, organism=organism_record, source=source, standardize=standardize, msg=msg, mute=mute, ) if len(records_public) > 0: msg = "" for record in records_public: record._from_source = True records += records_public else: unmapped_values = nonexist_values # unmapped new_ids will NOT create records if len(unmapped_values) > 0: # first log the success message if len(msg) > 0 and not mute: logger.success(msg) s = "" if len(unmapped_values) == 1 else "s" print_values = colors.yellow(_format_values(unmapped_values)) n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated") if not mute: logger.info( f"{colors.red('did not create')} {registry.__name__} record{s} for " f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}" # type: ignore ) return SQLRecordList(records) def get_existing_records( iterable_idx: Index, field: FieldAttr, organism: SQLRecord | None = None, standardize: bool = True, mute: bool = False, **filter_kwargs, ) -> tuple[list, Index, str]: """Get existing records from the database.""" import pandas as pd from .can_curate import _validate # NOTE: existing records matching is agnostic to the source registry = field.field.model # type: ignore queryset = registry.filter(**filter_kwargs) if standardize: # log synonyms mapped terms if hasattr(registry, "standardize"): syn_mapper = queryset.standardize( iterable_idx, field=field, organism=organism, mute=True, from_source=False, # standardize only based on the DB reference return_mapper=True, ) iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index else: syn_mapper = {} # now we have to sort the list of queried records # preserved = Case( # *[ # When(**{field.field.name: value}, then=pos) # for pos, value in enumerate(iterable_idx) # ] # ) # order 
by causes a factor 10 in runtime # records = query_set.order_by(preserved).to_list() # log validated terms is_validated = _validate( cls=queryset, values=iterable_idx, field=field, organism=organism, mute=True ) if len(is_validated) > 0: validated = iterable_idx[is_validated] else: validated = [] msg = "" syn_msg = "" if not mute: if len(validated) > 0: s = "" if len(validated) == 1 else "s" print_values = colors.green(_format_values(validated)) msg = ( "loaded" f" {colors.green(f'{len(validated)} {registry.__name__} record{s}')}" f" matching {colors.italic(f'{field.field.name}')}: {print_values}" ) if len(syn_mapper) > 0: s = "" if len(syn_mapper) == 1 else "s" names = list(syn_mapper.keys()) print_values = colors.green(_format_values(names)) syn_msg = ( "loaded" f" {colors.green(f'{len(syn_mapper)} {registry.__name__} record{s}')}" f" matching {colors.italic('synonyms')}: {print_values}" ) # no logging if all values are validated # logs if there are synonyms if len(syn_msg) > 0: if len(msg) > 0 and not mute: logger.success(msg) if not mute: logger.success(syn_msg) msg = "" # get all existing records in the db query = {f"{field.field.name}__in": iterable_idx.values} # type: ignore if organism is not None: query["organism"] = organism records = queryset.filter(**query).to_list() if len(validated) == len(iterable_idx): return records, pd.Index([]), msg else: nonval_values = iterable_idx.difference(validated) return records, nonval_values, msg def create_records_from_source( iterable_idx: Index, field: FieldAttr, organism: SQLRecord | None = None, source: SQLRecord | None = None, standardize: bool = True, msg: str = "", mute: bool = False, ) -> tuple[list, Index]: """Create records from source.""" registry = field.field.model # type: ignore records: list = [] # populate additional fields from public_df from bionty._organism import OrganismNotSet from bionty._source import filter_public_df_columns, get_source_record # get the default source if organism is None and registry.require_organism(field=field): raise OrganismNotSet( f"`organism` is required to create new {registry.__name__} records from source!" 
) try: source_record = get_source_record(registry, organism, source) except ValueError: # no source found return records, iterable_idx # create the corresponding PublicOntology object from registry try: public_ontology = registry.public(source=source_record) except Exception: # no public source return records, iterable_idx # filter the columns in public df based on fields public_df = filter_public_df_columns( registry=registry, public_ontology=public_ontology ) if public_df.empty: return records, iterable_idx # standardize in the public reference # do not inspect synonyms if the field is not name field result = public_ontology.inspect( iterable_idx, field=field.field.name, # type: ignore standardize=False if hasattr(registry, "_name_field") and field.field.name != registry._name_field else standardize, # type: ignore mute=True, ) syn_mapper = result.synonyms_mapper msg_syn: str = "" if len(syn_mapper) > 0: s = "" if len(syn_mapper) == 1 else "s" names = list(syn_mapper.keys()) print_values = colors.purple(_format_values(names)) msg_syn = ( "created" f" {colors.purple(f'{len(syn_mapper)} {registry.__name__} record{s} from Bionty')}" f" matching {colors.italic('synonyms')}: {print_values}" ) iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index # create records for values that are found in the public reference # matching either field or synonyms mapped_values = iterable_idx.intersection(public_df[field.field.name]) # type: ignore multi_msg = "" if len(mapped_values) > 0: public_kwargs, multi_msg = _bulk_create_dicts_from_df( keys=mapped_values, column_name=field.field.name, # type: ignore df=public_df, ) create_kwargs = ( {"organism": organism, "source": source_record} if organism is not None else {"source": source_record} ) for bk in public_kwargs: # skip validation to speed up bulk creation since the values don't validate in the registry DB yet records.append(registry(**bk, **create_kwargs, _skip_validation=True)) # number of records that matches field (not synonyms) validated = result.validated if len(validated) > 0: s = "" if len(validated) == 1 else "s" print_values = colors.purple(_format_values(validated)) # this is the success msg for existing records in the DB from get_existing_records if len(msg) > 0 and not mute: logger.success(msg) if not mute: logger.success( "created" f" {colors.purple(f'{len(validated)} {registry.__name__} record{s} from Bionty')}" f" matching {colors.italic(f'{field.field.name}')}: {print_values}" # type: ignore ) # make sure that synonyms logging appears after the field logging if len(msg_syn) > 0 and not mute: logger.success(msg_syn) # warning about multi matches if len(multi_msg) > 0 and not mute: logger.warning(multi_msg) # return the values that are not found in the public reference unmapped_values = iterable_idx.difference(mapped_values) return records, unmapped_values def index_iterable(iterable: ListLike) -> Index: """Get unique values from an iterable.""" import pandas as pd idx = pd.Index(iterable).unique() # No entries are made for NAs, '', None # returns an ordered unique not null list return idx[(idx != "") & (~idx.isnull())] def _format_values( names: ListLike, n: int = 20, quotes: bool = True, sep: str = "'" ) -> str: """Format values for printing.""" items = {str(name): None for name in names if name != "None"} unique_items = list(items.keys()) if quotes: unique_items = [f"{sep}{item}{sep}" for item in unique_items] print_values = ", ".join(unique_items[:n]) if len(unique_items) > n: print_values += ", ..." 
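# At this point `print_values` is a deduplicated, quoted, comma-separated preview
# truncated to `n` items; e.g. (illustrative) _format_values(["a", "b", "c"], n=2)
# returns "'a', 'b', ..."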
return print_values def _bulk_create_dicts_from_df( keys: set | list, column_name: str, df: DataFrame ) -> tuple[dict, str]: """Get fields from a DataFrame for many rows.""" multi_msg = "" if df.index.name != column_name: df = df.set_index(column_name).loc[list(keys)] if not df.index.is_unique: # return all records for multi-matches with a warning dup = df.index[df.index.duplicated()].unique().tolist() if len(dup) > 0: s = "" if len(dup) == 1 else "s" print_values = _format_values(dup) multi_msg = ( f"ambiguous validation in Bionty for {len(dup)} record{s}:" f" {print_values}" ) return df.reset_index().to_dict(orient="records"), multi_msg def get_organism_record_from_field( # type: ignore field: FieldAttr, organism: str | SQLRecord | None = None, values: ListLike = None, using_key: str | None = None, ) -> SQLRecord | None: """Get organism record based on which field is used in from_values. Args: field: the field of the registry for from_values organism: the organism to get the organism record for values: the values passed to from_values using_key: the db to get the organism record from Returns: The organism record if both conditions are met: The organism FK is required for the registry The field is not unique (e.g. Gene.symbol) or the organism is not None """ registry = field.field.model if registry.__base__.__name__ != "BioRecord": return None from bionty._organism import ( create_or_get_organism_record, infer_organism_from_ensembl_id, ) if values is None: values = [] # if the field is bionty.Gene.ensembl_gene_id, infer organism from ensembl id if ( registry.__get_name_with_module__() == "bionty.Gene" and field.field.name == "ensembl_gene_id" and len(values) > 0 and organism is None ): # Check if values contain bionty.Gene objects with organism field from collections.abc import Iterable # first check if we have Gene objects for v in values: # early return to not loop through all values to find a string if isinstance(v, str): break if isinstance(v, registry) and v.organism is not None: return v.organism # Handle iterables containing Gene objects (but not strings, which are also iterable) elif isinstance(v, Iterable) and not isinstance(v, str): for item in v: if isinstance(item, registry) and item.organism is not None: return item.organism # If no bionty.Gene with organism found, fall back to string-based inference # pass the first ensembl id that starts with ENS to infer organism first_ensembl = next( (v for v in values if isinstance(v, str) and v.startswith("ENS")), "" ) if first_ensembl: return infer_organism_from_ensembl_id(first_ensembl, using_key) return create_or_get_organism_record( organism=organism, registry=registry, field=field ) ================================================ FILE: lamindb/models/_is_versioned.py ================================================ from __future__ import annotations from pathlib import PurePosixPath from typing import TYPE_CHECKING, Any, Iterable, Literal from django.db import models from django.db.models import Q from lamin_utils import logger from lamin_utils._base62 import increment_base62 from lamindb.base import uids from lamindb.base.fields import ( BooleanField, CharField, ) if TYPE_CHECKING: # noqa from lamindb.models.query_set import QuerySet class IsVersioned(models.Model): """Base class for versioned models.""" class Meta: abstract = True _len_stem_uid: int version_tag: str | None = CharField(max_length=30, null=True, db_index=True) """Version tag (default `None`). Consider using `semantic versioning `__ with `Python versioning `__. 
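For example (an illustrative sketch, the value is hypothetical)::

    artifact.version = "1.2"  # stored in `version_tag` via the `version` property setter below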
""" is_latest: bool = BooleanField(default=True, db_index=True) """Boolean flag that indicates whether a record is the latest in its version family.""" def __init__( self, *args, **kwargs, ): self._revises = kwargs.pop("revises", None) super().__init__(*args, **kwargs) @property def stem_uid(self) -> str: """Universal id characterizing the version family. The full uid of a record is obtained via concatenating the stem uid and version information:: stem_uid = random_base62(n_char) # a random base62 sequence of length 12 (transform) or 16 (artifact, collection) version_uid = "0000" # an auto-incrementing 4-digit base62 number uid = f"{stem_uid}{version_uid}" # concatenate the stem_uid & version_uid """ return self.uid[: self._len_stem_uid] # type: ignore @property def version(self) -> str: """The version of an object. Defines version of an object within a family of objects characterized by the same `stem_uid`. Returns `.version_tag` if set, otherwise the last 4 characters of the `uid`. """ return self.version_tag if self.version_tag else self.uid[-4:] # type: ignore @version.setter def version(self, value: str | None) -> None: self.version_tag = value @property def versions(self) -> QuerySet: """Lists all records of the same version family. Example:: artifact.versions.to_dataframe() # all versions of the artifact in a dataframe artifact.versions.get(is_latest=True) # the latest version of the artifact """ return ( self.__class__.connect(self._state.db) .filter(uid__startswith=self.stem_uid) .order_by("-created_at") ) def _add_to_version_family( self, revises: IsVersioned, version_tag: str | None = None ): """Add current record to a version family. Args: revises: a record that belongs to the version family. version_tag: semantic version tag of the record. """ old_uid = self.uid # type: ignore new_uid, revises = create_uid(revises=revises, version_tag=version_tag) if ( self.__class__.__name__ == "Artifact" and self._real_key is None and (self._key_is_virtual or self.key is None) ): from lamindb.core.storage.paths import auto_storage_key_from_artifact_uid old_path = self.path new_storage_key = auto_storage_key_from_artifact_uid( new_uid, self.suffix, self._overwrite_versions ) new_path = old_path.rename( old_path.with_name(PurePosixPath(new_storage_key).name) ) logger.success(f"updated path from {old_path} to {new_path}!") self.uid = new_uid self.version_tag = version_tag self.save() logger.success(f"updated uid from {old_uid} to {new_uid}!") def bump_version( version: str, bump_type: str = "minor", behavior: Literal["prompt", "error", "ignore"] = "error", ) -> str: """Bumps the version number by major or minor depending on the bump_type flag. Args: version: The current version in "MAJOR" or "MAJOR.MINOR" format. bump_type: The type of version bump, either 'major' or 'minor'. Returns: The new version string. 
""" try: # Split the version into major and minor parts if possible parts = version.split(".") major = int(parts[0]) minor = int(parts[1]) if len(parts) > 1 else 0 if bump_type == "major": # Bump the major version and reset the minor version new_version = f"{major + 1}" elif bump_type == "minor": # Bump the minor version new_version = f"{major}.{minor + 1}" else: raise ValueError("bump_type must be 'major' or 'minor'") except (ValueError, IndexError): if behavior == "prompt": new_version = input( f"The current version is '{version}' - please type the new version: " ) elif behavior == "error": raise ValueError( "Cannot auto-increment non-integer castable version, please provide" " manually" ) from None else: logger.warning("could not auto-increment version, fix '?' manually") new_version = "?" return new_version def set_version(version: str | None = None, previous_version: str | None = None): """(Auto-) set version. If `version` is `None`, returns the stored version. Otherwise sets the version to the passed version. Args: version: Version string. previous_version: Previous version string. """ if version is None and previous_version is not None: version = bump_version(previous_version, bump_type="major") return version def create_uid( *, version_tag: str | None = None, n_full_id: int = 20, revises: IsVersioned | None = None, ) -> tuple[str, IsVersioned | None]: """This also updates revises in case it's not the latest version. This is why it returns revises. """ if revises is not None: latest_in_family = ( revises.__class__.objects.filter(uid__startswith=revises.stem_uid) .order_by("uid") .last() ) if latest_in_family is not None and latest_in_family.uid != revises.uid: revises = latest_in_family logger.warning( f"didn't pass the latest version in `revises`, retrieved it: {revises}" ) suid = revises.stem_uid vuid = increment_base62(revises.uid[-4:]) # type: ignore else: suid = uids.base62(n_full_id - 4) vuid = "0000" if version_tag is not None: if not isinstance(version_tag, str): raise ValueError( "`version` parameter must be `None` or `str`, e.g., '0.1', '1', '2', etc." ) if revises is not None: if version_tag == revises.version_tag: raise ValueError( f"Please change the version tag or leave it `None`, '{revises.version_tag}' is already taken" ) return suid + vuid, revises def process_revises( revises: IsVersioned | None, version_tag: str | None, key: str | None, description: str | None, type: type[IsVersioned], ) -> tuple[str, str, str, str, IsVersioned | None]: if revises is not None and not isinstance(revises, type): raise TypeError(f"`revises` has to be of type `{type.__name__}`") uid, revises = create_uid( revises=revises, version_tag=version_tag, n_full_id=type._len_full_uid ) if revises is not None: if description is None: description = getattr(revises, "description", None) if key is None: key = revises.key return uid, version_tag, key, description, revises def _adjust_is_latest_when_deleting_is_versioned( objects: IsVersioned | Iterable[IsVersioned], ) -> list[int]: """After deleting (soft or permanent) versioned records, promote new latest per version family. Accepts a single IsVersioned instance, a QuerySet, or a list of IsVersioned. Runs in 1 query (candidates + update) when objects are passed; no extra query for uids. Returns the list of pks that were promoted to is_latest (for testing). 
""" if isinstance(objects, IsVersioned): objects = [objects] else: objects = list(objects) if not objects: return [] id_list = [o.pk for o in objects] stem_uids = list({o.uid[: o._len_stem_uid] for o in objects if o.is_latest}) if not stem_uids: return [] registry = type(objects[0]) db = getattr(objects[0]._state, "db", None) or "default" len_stem = registry._len_stem_uid # All candidates: same family as any stem_uid, not in trash and not about to be deleted q = Q() for s in stem_uids: q |= Q(uid__startswith=s) qs = registry.objects.using(db).filter(q).exclude(pk__in=id_list) from .sqlrecord import SQLRecord if issubclass(registry, SQLRecord): qs = qs.exclude(branch_id=-1) candidates = list(qs.values("pk", "uid", "created_at")) # per stem_uid, pick candidate with max created_at by_stem: dict[str, dict[str, Any]] = {} for c in candidates: stem = c["uid"][:len_stem] if stem not in by_stem or c["created_at"] > by_stem[stem]["created_at"]: by_stem[stem] = c if not by_stem: return [] pks = [by_stem[s]["pk"] for s in by_stem] registry.objects.using(db).filter(pk__in=pks).update(is_latest=True) if pks: promoted_uids = [by_stem[s]["uid"] for s in by_stem] if len(promoted_uids) == 1: logger.important_hint( f"new latest {registry.__name__} version is: {promoted_uids[0]}" ) else: logger.important_hint( f"new latest {registry.__name__} versions: {promoted_uids}" ) return pks def reconcile_is_latest_within_branch( registry: type[IsVersioned], *, branch_id: int, db: str = "default", ) -> int: """Keep a single is_latest=True per version family in a branch. Winner selection is based on newest created_at, tie-broken by highest pk. Returns the number of records demoted from is_latest=True to False. """ len_stem = registry._len_stem_uid latest_records = list( registry.objects.using(db) .filter(branch_id=branch_id, is_latest=True) .values("pk", "uid", "created_at") .order_by("uid", "created_at", "pk") ) if not latest_records: return 0 winners_by_stem: dict[str, dict[str, Any]] = {} losers: list[int] = [] for record in latest_records: stem = record["uid"][:len_stem] winner = winners_by_stem.get(stem) if winner is None: winners_by_stem[stem] = record continue if (record["created_at"], record["pk"]) > (winner["created_at"], winner["pk"]): losers.append(winner["pk"]) winners_by_stem[stem] = record else: losers.append(record["pk"]) if not losers: return 0 return registry.objects.using(db).filter(pk__in=losers).update(is_latest=False) ================================================ FILE: lamindb/models/_label_manager.py ================================================ from __future__ import annotations from collections import defaultdict from typing import TYPE_CHECKING from django.db import connections from rich.table import Column, Table from rich.text import Text from rich.tree import Tree from lamindb.models import CanCurate, Feature from lamindb.models._from_values import _format_values from lamindb.models.save import save from lamindb.models.sqlrecord import ( REGISTRY_UNIQUE_FIELD, get_name_field, transfer_fk_to_default_db_bulk, transfer_to_default_db, ) from ._describe import ( NAME_WIDTH, TYPE_WIDTH, VALUES_WIDTH, format_rich_tree, ) from ._django import get_artifact_or_run_with_related, get_related_model from ._relations import dict_related_model_to_related_name if TYPE_CHECKING: from lamindb.models import Artifact, Collection, SQLRecord from lamindb.models.query_set import QuerySet EXCLUDE_LABELS = {"schemas"} def _get_labels( obj, links: bool = False, instance: str | None = None ) -> dict[str, 
QuerySet]: """Get all labels associated with an object as a dictionary. This is a generic approach that uses django orm. """ if obj.id is None: return {} labels = {} related_models = dict_related_model_to_related_name( obj.__class__, links=links, instance=instance ) if obj.__class__.__name__ == "Artifact" and links: related_models["ArtifactArtifact"] = "links_artifact" for _, related_name in related_models.items(): if ( related_name not in EXCLUDE_LABELS and not related_name.startswith("_") and not related_name == "json_values" ): labels[related_name] = getattr(obj, related_name).all() return labels def _get_labels_postgres( self: Artifact | Collection, m2m_data: dict | None = None ) -> dict[str, dict[int, str]]: """Get all labels associated with an artifact or collection as a dictionary. This is a postgres-specific approach that uses django Subquery. """ if m2m_data is None: artifact_meta = get_artifact_or_run_with_related(self, include_m2m=True) m2m_data = artifact_meta.get("related_data", {}).get("m2m", {}) return m2m_data def describe_labels( self: Artifact | Collection, related_data: dict | None = None, ) -> Tree | None: """Describe labels.""" labels_data = related_data.get("m2m") if related_data is not None else None if labels_data is None: if ( not self._state.adding and connections[self._state.db].vendor == "postgresql" ): labels_data = _get_labels_postgres(self, labels_data) if not labels_data: labels_data = _get_labels(self, instance=self._state.db) if not labels_data: return None labels_table = Table( Column("", style="", no_wrap=True, width=NAME_WIDTH), Column("", style="dim", no_wrap=True, width=TYPE_WIDTH), Column("", width=VALUES_WIDTH, no_wrap=True), show_header=False, box=None, pad_edge=False, ) for related_name, labels in labels_data.items(): if not labels or related_name == "schemas": continue if isinstance(labels, dict): displays = [ d[key] for d in labels.values() for key in d.keys() if key.endswith("_display") ] print_values = _format_values(displays, n=10, quotes=False) else: # labels are a QuerySet field = get_name_field(labels) print_values = _format_values( labels.values_list(field, flat=True), n=10, quotes=False ) if print_values: related_model = get_related_model(self, related_name) type_str = related_model.__get_name_with_module__() labels_table.add_row( f".{related_name}", Text(type_str, style="dim"), print_values ) tree = None if labels_table.rows: # we might not have rows even if labels_data was non-empty tree = Tree(Text("Labels", style="bold green_yellow"), guide_style="dim") tree.add(labels_table) return tree def _save_validated_records( labels: QuerySet | list | dict, ) -> list[str]: """Save validated records from public based on ontology_id_fields.""" if not labels: return [] registry = labels[0].__class__ field = ( REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), "uid") if not hasattr(registry, "_ontology_id_field") else registry._ontology_id_field ) # if the field value is None, use uid field label_uids = [getattr(label, field) for label in labels if label is not None] # save labels from ontology_ids if hasattr(registry, "_ontology_id_field") and label_uids: try: records = registry.from_values(label_uids, field=field, mute=True) save([r for r in records if r._state.adding]) except Exception: # noqa: S110 pass field = "uid" label_uids = [label.uid for label in labels if label is not None] if issubclass(registry, CanCurate): validated = registry.validate(label_uids, field=field, mute=True) new_labels = [ label for label, is_valid in zip(labels, 
validated) if not is_valid ] return new_labels return list(labels) def save_validated_records( records: QuerySet | list | dict, ) -> list[str] | dict[str, list[str]]: """Save validated records from public based on ontology_id_fields.""" if isinstance(records, dict): return { registry: _save_validated_records(registry_records) for registry, registry_records in records.items() } return _save_validated_records(records) class LabelManager: """Label manager. This allows to manage untyped labels :class:`~lamindb.ULabel` and arbitrary typed labels (e.g., :class:`~bionty.CellLine`) and associate labels with features. """ def __init__(self, sqlrecord: Artifact | Collection) -> None: # host is the sqlrecord that the label manager is attached to # we might rename _host to _sqlrecord in the future self._host = sqlrecord def __repr__(self) -> str: return self.describe(return_str=True) def describe(self, return_str=True) -> str: """Describe the labels.""" tree = describe_labels(self._host) return format_rich_tree(tree, return_str=return_str) def add( self, records: SQLRecord | list[SQLRecord] | QuerySet, feature: Feature | None = None, ) -> None: """Add one or several labels and associate them with a feature. Args: records: Label records to add. feature: Feature under which to group the labels. """ from .artifact import add_labels return add_labels(self._host, records=records, feature=feature) def get( self, feature: Feature, mute: bool = False, flat_names: bool = False, ) -> QuerySet | dict[str, QuerySet] | list: """Get labels given a feature. Args: feature: Feature under which labels are grouped. mute: Show no logging. flat_names: Flatten list to names rather than returning records. """ from .artifact import get_labels return get_labels(self._host, feature=feature, mute=mute, flat_names=flat_names) def add_from(self, data: Artifact | Collection, transfer_logs: dict = None) -> None: """Add labels from an artifact or collection to another artifact or collection. 
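Args:
    data: Source artifact or collection whose labels are transferred.
    transfer_logs: Dictionary collecting transfer logs under the keys
        `"mapped"`, `"transferred"`, and `"run"`; created automatically if `None`.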
Examples: :: artifact1 = ln.Artifact(pd.DataFrame(index=[0, 1])).save() artifact2 = ln.Artifact(pd.DataFrame(index=[2, 3])).save() records = ln.ULabel.from_values(["Label1", "Label2"], field="name").save() labels = ln.ULabel.filter(name__icontains = "label") artifact1.ulabels.set(labels) # using the ManyToMany relationship `.ulabels` artifact2.labels.add_from(artifact1) # using the `.labels` accessor that understands any label type """ if transfer_logs is None: transfer_logs = {"mapped": [], "transferred": [], "run": None} from lamindb import settings using_key = settings._using_key for related_name, labels in _get_labels(data, instance=data._state.db).items(): labels = labels.all() if not labels.exists(): continue # look for features data_name_lower = data.__class__.__name__.lower() labels_by_features: dict = defaultdict(list) features = set() new_labels = save_validated_records(labels) if len(new_labels) > 0: transfer_fk_to_default_db_bulk( new_labels, using_key, transfer_logs=transfer_logs ) for label in labels: keys: list = [] # if the link table doesn't follow this convention, we'll ignore it if not hasattr(label, f"links_{data_name_lower}"): key = None keys.append(key) else: links = getattr(label, f"links_{data_name_lower}").filter( **{f"{data_name_lower}_id": data.id} ) for link in links: if link.feature is not None: features.add(link.feature) key = link.feature.uid else: key = None keys.append(key) label_returned = transfer_to_default_db( label, using_key, transfer_logs=transfer_logs, transfer_fk=False, save=True, ) # TODO: refactor return value of transfer to default db if label_returned is not None: label = label_returned for key in keys: labels_by_features[key].append(label) # treat features new_features = save_validated_records(list(features)) if len(new_features) > 0: transfer_fk_to_default_db_bulk( new_features, using_key, transfer_logs=transfer_logs ) for feature in new_features: transfer_to_default_db( feature, # type: ignore using_key, transfer_logs=transfer_logs, transfer_fk=False, ) save(new_features) # type: ignore if hasattr(self._host, related_name): for feature_uid, feature_labels in labels_by_features.items(): if feature_uid is not None: feature_id = Feature.get(feature_uid).id else: feature_id = None getattr(self._host, related_name).add( *feature_labels, through_defaults={"feature_id": feature_id} ) ================================================ FILE: lamindb/models/_relations.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING import lamindb_setup as ln_setup from django.db.models import ManyToManyField from lamindb_setup._connect_instance import ( get_owner_name_from_identifier, load_instance_settings, ) from lamindb_setup.core._settings_store import instance_settings_file from lamindb.models.sqlrecord import IsLink if TYPE_CHECKING: from lamindb.models.sqlrecord import Registry, SQLRecord def get_schema_modules(instance: str | None) -> set[str]: if instance is None or instance == "default": schema_modules = set(ln_setup.settings.instance.modules) schema_modules.add("core") return schema_modules owner, name = get_owner_name_from_identifier(instance) settings_file = instance_settings_file(name, owner) if settings_file.exists(): modules = set(load_instance_settings(settings_file).modules) else: cache_filepath = ( ln_setup.settings.cache_dir / f"instance--{owner}--{name}--uid.txt" ) if cache_filepath.exists(): modules = set(cache_filepath.read_text().split("\n")[1].split(",")) else: raise 
ValueError(f"Instance {instance} not found") shared_schema_modules = set(ln_setup.settings.instance.modules).intersection( modules ) shared_schema_modules.add("core") return shared_schema_modules # this function here should likely be renamed # it maps the __get_name_with_module__() onto the actual model def dict_module_name_to_model_name( registry: Registry, instance: str | None = None ) -> dict[str, Registry]: schema_modules = get_schema_modules(instance) d: dict = { i.related_model.__get_name_with_module__(): i.related_model for i in registry._meta.related_objects if i.related_name is not None and i.related_model.__get_module_name__() in schema_modules } d.update( { i.related_model.__get_name_with_module__(): i.related_model for i in registry._meta.many_to_many if i.name is not None and i.related_model.__get_module_name__() in schema_modules } ) return d def dict_related_model_to_related_name( registry: type[SQLRecord], links: bool = False, instance: str | None = None ) -> dict[str, str]: def include(model: SQLRecord): return not links != issubclass(model, IsLink) schema_modules = get_schema_modules(instance) related_objects = registry._meta.related_objects + registry._meta.many_to_many d: dict = { record.related_model.__get_name_with_module__(): ( record.related_name if not isinstance(record, ManyToManyField) else record.name ) for record in related_objects if ( record.name is not None and include(record.related_model) and record.related_model.__get_module_name__() in schema_modules and not ( ( record.related_name if not isinstance(record, ManyToManyField) else record.name ).startswith("linked_in_") ) ) } if "RecordRecord" in d: d["RecordRecord"] = "values_record" return d def get_related_name(features_type: type[SQLRecord]) -> str: from lamindb.models.schema import Schema candidates = [ field.related_name for field in Schema._meta.related_objects if field.related_model == features_type ] if not candidates: raise ValueError( f"Can't create feature sets from {features_type.__name__} because it's not" " related to it!\nYou need to create a link model between Schema and" " your SQLRecord in your custom module.\nTo do so, add a" " line:\n_schemas = models.ManyToMany(Schema," " related_name='mythings')\n" ) return candidates[0] ================================================ FILE: lamindb/models/_run_cleanup.py ================================================ """Background cleanup of report/environment artifacts after Run bulk delete. 
Runnable as: python -m lamindb.models._run_cleanup --instance owner/name --ids 1,2,3 [--run-uid UID] """ import argparse import logging from lamin_utils import logger import lamindb as ln def main() -> None: parser = argparse.ArgumentParser(description="Clean up orphaned run artifacts.") parser.add_argument("--instance", required=True, help="Instance slug (owner/name).") parser.add_argument("--ids", required=True, help="Comma-separated artifact IDs.") parser.add_argument( "--run-uid", required=True, help="Run UID for log file name (run_cleanup_logs_{uid}.txt in cache dir).", ) args = parser.parse_args() ln.connect(args.instance) file_handler = None log_path = ln.setup.settings.cache_dir / f"run_cleanup_logs_{args.run_uid}.txt" file_handler = logging.FileHandler(log_path, mode="a") logger.addHandler(file_handler) for aid_str in args.ids.split(","): aid = int(aid_str.strip()) artifact = ln.Artifact.objects.filter(id=aid).first() if artifact is not None: assert artifact.kind == "__lamindb_run__", ( f"artifact {artifact.uid} is not of __lamindb_run__ kind, aborting cleanup of artifacts {args.ids}" ) try: artifact.delete(permanent=True) logger.important(f"deleted artifact {aid}") except Exception as e: logger.error(f"did not delete artifact {aid}: {e}") pass if __name__ == "__main__": main() ================================================ FILE: lamindb/models/artifact.py ================================================ # ruff: noqa: TC004 from __future__ import annotations import shutil import types import warnings from collections import defaultdict from pathlib import Path, PurePath, PurePosixPath from typing import TYPE_CHECKING, Any, Iterator, Literal, TypeVar, Union, overload import fsspec import lamindb_setup as ln_setup from django.db import ProgrammingError, models from django.db.models import CASCADE, PROTECT, Q from django.db.models.functions import Length from lamin_utils import colors, logger from lamindb_setup import settings as setup_settings from lamindb_setup.core._hub_core import select_storage_or_parent from lamindb_setup.core.hashing import HASH_LENGTH, hash_dir, hash_file from lamindb_setup.core.upath import ( LocalPathClasses, UPath, create_path, extract_suffix_from_path, fs_for_moving, get_stat_dir_cloud, get_stat_file_cloud, ) from ..base.fields import ( BigIntegerField, BooleanField, CharField, ForeignKey, TextField, ) from ..base.users import current_user_id from ..base.utils import deprecated, strict_classmethod from ..core._compat import with_package_obj from ..core._settings import settings from ..errors import ( FieldValidationError, InvalidArgument, NoStorageLocationForSpace, NoWriteAccess, UnknownStorageLocation, ValidationError, ) from ._feature_manager import ( FeatureManager, get_label_links, ) from ._is_versioned import ( IsVersioned, create_uid, ) from ._relations import ( dict_module_name_to_model_name, dict_related_model_to_related_name, ) from .feature import Feature, JsonValue from .has_parents import view_lineage from .query_set import QuerySet, SQLRecordList from .run import Run, TracksRun, TracksUpdates, User from .save import check_and_attempt_clearing, check_and_attempt_upload from .schema import Schema from .sqlrecord import ( BaseSQLRecord, Branch, IsLink, Space, SQLRecord, _get_record_kwargs, ) from .storage import Storage from .ulabel import ULabel def _lazy_load_storage_module(): """Lazy-import storage to avoid loading pandas/anndata at package import.""" from ..core.storage import ( delete_storage, infer_suffix, write_to_disk, ) from 
..core.storage.paths import ( AUTO_KEY_PREFIX, auto_storage_key_from_artifact, auto_storage_key_from_artifact_uid, check_path_is_child_of_root, filepath_cache_key_from_artifact, filepath_from_artifact, ) return types.SimpleNamespace( delete_storage=delete_storage, infer_suffix=infer_suffix, write_to_disk=write_to_disk, AUTO_KEY_PREFIX=AUTO_KEY_PREFIX, auto_storage_key_from_artifact=auto_storage_key_from_artifact, auto_storage_key_from_artifact_uid=auto_storage_key_from_artifact_uid, check_path_is_child_of_root=check_path_is_child_of_root, filepath_cache_key_from_artifact=filepath_cache_key_from_artifact, filepath_from_artifact=filepath_from_artifact, ) # Cache the storage utils on first use _storage_cache: object | None = None # refactor this module to group logic that needs storage access in a class # in the future; then we don't need _s() anymore def _s(): global _storage_cache if _storage_cache is None: _storage_cache = _lazy_load_storage_module() return _storage_cache WARNING_RUN_TRANSFORM = "no run & transform got linked, call `ln.track()` & re-run" WARNING_NO_INPUT = "run input wasn't tracked, call `ln.track()` and re-run" def _identify_zarr_type(storepath, *, check: bool = True): """Lazy-import to avoid loading storage at package import.""" try: from ..core.storage._zarr import identify_zarr_type return identify_zarr_type(storepath, check=check) except ImportError: raise ImportError("Please install zarr: pip install 'lamindb[zarr]'") from None if TYPE_CHECKING: from collections.abc import Iterable import pandas as pd from anndata import AnnData from fsspec import AbstractFileSystem from lamindb_setup.types import AnyPathStr from mudata import MuData # noqa: TC004 from polars import LazyFrame as PolarsLazyFrame from pyarrow.dataset import Dataset as PyArrowDataset from spatialdata import SpatialData # noqa: TC004 from tiledbsoma import Collection as SOMACollection from tiledbsoma import Experiment as SOMAExperiment from tiledbsoma import Measurement as SOMAMeasurement from ..base.types import ( ArtifactKind, StrField, ) from ..core.storage._backed_access import ( AnnDataAccessor, BackedAccessor, SpatialDataAccessor, ) from ..core.storage.types import ScverseDataStructures from ._label_manager import LabelManager from .block import ArtifactBlock from .collection import Collection from .project import Project, Reference from .query_manager import RelatedManager from .record import Record from .transform import Transform OUTDATED_ARTIFACT_FILES_OVERWRITTEN_MSG = ( "Cannot read this outdated artifact version: " "its files were overwritten and are no longer available.\n" "Read from the latest version: artifact.versions.get(is_latest=True)" ) def process_pathlike( filepath: UPath, storage: Storage, using_key: str | None, skip_existence_check: bool = False, ) -> tuple[Storage, bool]: """Determines the appropriate storage for a given path and whether to use an existing storage key.""" if not skip_existence_check: try: # check if file exists if not filepath.exists(): raise FileNotFoundError(filepath) except PermissionError: pass if _s().check_path_is_child_of_root(filepath, storage.root): use_existing_storage_key = True return storage, use_existing_storage_key else: # check whether the path is part of one of the existing # already-registered storage locations result = None # within the hub, we don't want to perform check_path_in_existing_storage if using_key is None: result = check_path_in_existing_storage( filepath, check_hub_register_storage=setup_settings.instance.is_on_hub ) if 
isinstance(result, Storage): use_existing_storage_key = True return result, use_existing_storage_key else: # if the path is in the cloud, we have a good candidate # for the storage root: the bucket if not isinstance(filepath, LocalPathClasses): # for a cloud path, new_root is always the bucket name if filepath.protocol == "hf": hf_path = filepath.fs.resolve_path(filepath.as_posix()) if hasattr(hf_path, "root"): new_root = "hf://" + hf_path.root else: hf_path.path_in_repo = "" new_root = "hf://" + hf_path.unresolve().rstrip("/") else: if filepath.protocol == "s3": # check that endpoint_url didn't propagate here # as a part of the path string assert "?" not in filepath.path # noqa: S101 new_root = list(filepath.parents)[-1].as_posix().rstrip("/") # Re the Parallel execution of the logic below: # One of the threads (or processes) would start to write the hub record and then the test file. # The other ones would retrieve the hub record and the test file. # All of them would come out of the exercise with storage_record.instance_uid == setup_settings.instance.uid # and all of them would raise UnkownStorageLocation. # Then one of these threads will trigger storage_record.delete() but also this is idempotent; # this means they all throw the same error and deletion of the inexistent stuff (hub record, marker file) # would just silently fail. # Edge case: A user legitimately creates a storage location and another user runs this here at the exact same time. # There is no way to decide then which is the legitimate creation. storage_record = Storage(root=new_root).save() if storage_record.instance_uid == setup_settings.instance.uid: # we don't want to inadvertently create managed storage locations # hence, we revert the creation and throw an error storage_record.delete() raise UnknownStorageLocation( f"Path {filepath} is not contained in any known storage location:\n{Storage.to_dataframe()[['uid', 'root', 'type']]}\n\n" f"Create a managed storage location that contains the path, e.g., by calling: ln.Storage(root='{new_root}').save()" ) use_existing_storage_key = True return storage_record, use_existing_storage_key # if the filepath is local else: use_existing_storage_key = False # if the default storage is local we'll throw an error if the user # doesn't provide a key if storage.type == "local": return storage, use_existing_storage_key # if the default storage is in the cloud (the file is going to # be uploaded upon saving it), we treat the filepath as a cache else: return storage, use_existing_storage_key def process_data( provisional_uid: str, data: AnyPathStr | pd.DataFrame | AnnData, format: str | None, key: str | None, storage: Storage, using_key: str | None, skip_existence_check: bool = False, is_replace: bool = False, to_disk_kwargs: dict[str, Any] | None = None, ) -> tuple[Any, Path | UPath, str, Storage, bool]: """Serialize a data object that's provided as file or in memory. 
if not overwritten, data gets stored in default storage """ if with_package_obj(data, "AnnData", "anndata", lambda obj: True)[0]: is_anndata = True is_pathlike = False elif isinstance(data, (str, Path, UPath)): is_anndata = False is_pathlike = True else: is_anndata = False is_pathlike = False if key is not None: key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key") # use suffix as the (adata) format if the format is not provided if is_anndata and format is None and len(key_suffix) > 0: format = key_suffix[1:] else: key_suffix = None if is_pathlike: access_token = ( storage._access_token if hasattr(storage, "_access_token") else None ) path = create_path(data, access_token=access_token) # we don't resolve http links because they can resolve into a different domain # for example into a temporary url if path.protocol not in {"http", "https"}: path = path.resolve() storage, use_existing_storage_key = process_pathlike( path, storage=storage, using_key=using_key, skip_existence_check=skip_existence_check, ) suffix = extract_suffix_from_path(path) memory_rep = None elif ( is_anndata or data_is_dataframe(data) or data_is_scversedatastructure(data, "MuData") or data_is_scversedatastructure(data, "SpatialData") ): storage = storage memory_rep = data suffix = _s().infer_suffix(data, format) else: raise NotImplementedError( f"Do not know how to create an Artifact from {data}, pass a path instead." ) # Check for suffix consistency if key_suffix is not None and key_suffix != suffix and not is_replace: # consciously omitting a trailing period if is_pathlike: message = f"The passed path's suffix '{suffix}' must match the passed key's suffix '{key_suffix}'." else: message = f"The passed key's suffix '{key_suffix}' must match the passed path's suffix '{suffix}'." raise InvalidArgument(message) # in case we have an in-memory representation, we need to write it to disk if memory_rep is not None: path = settings.cache_dir / f"{provisional_uid}{suffix}" logger.info("writing the in-memory object into cache") if to_disk_kwargs is None: to_disk_kwargs = {} _s().write_to_disk(data, path, **to_disk_kwargs) use_existing_storage_key = False return memory_rep, path, suffix, storage, use_existing_storage_key def get_stat_or_artifact( path: UPath, storage: Record, key: str | None = None, check_hash: bool = True, is_replace: bool = False, instance: str | None = None, skip_hash_lookup: bool = False, ) -> Union[tuple[int, str | None, str | None, int | None, Artifact | None], Artifact]: """Retrieves file statistics or an existing artifact based on the path, hash, and key.""" n_files = None if settings.creation.artifact_skip_size_hash: return None, None, None, n_files, None stat = path.stat() # one network request if not isinstance(path, LocalPathClasses): size, hash, hash_type = None, None, None if stat is not None: # convert UPathStatResult to fsspec info dict stat = stat.as_info() if (store_type := stat["type"]) == "file": size, hash, hash_type = get_stat_file_cloud(stat) elif store_type == "directory": size, hash, hash_type, n_files = get_stat_dir_cloud(path) if hash is None: logger.warning(f"did not add hash for {path}") return size, hash, hash_type, n_files, None else: if path.is_dir(): size, hash, hash_type, n_files = hash_dir(path) else: size, hash, hash_type = hash_file(path) if not check_hash: return size, hash, hash_type, n_files, None # Empty files all share the same content hash; skip cross-artifact hash # lookup so creating a new empty file path yields a new artifact. 
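# note: `n_files is None` restricts this to single files; directories (which set
# `n_files`) keep the regular hash lookup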
if n_files is None and size == 0: skip_hash_lookup = True previous_artifact_version = None artifacts_qs = Artifact.objects.using(instance) if skip_hash_lookup: artifact_with_same_hash_exists = False if key is not None and not is_replace: # only search for a previous version of the artifact # ignoring hash queryset_same_hash_or_same_key = artifacts_qs.filter( ~Q(branch_id=-1), key=key, storage=storage, ).order_by("-created_at") else: queryset_same_hash_or_same_key = [] else: # this purposefully leaves out the storage location and key that we have # in the hard database unique constraints # so that the user is able to find artifacts with the same hash across # storage locations and keys # if this is not desired, set skip_hash_lookup=True if key is None or is_replace: queryset_same_hash = artifacts_qs.filter(~Q(branch_id=-1), hash=hash) artifact_with_same_hash_exists = queryset_same_hash.count() > 0 else: # the following query achieves one more thing beyond hash lookup # it allows us to find a previous version of the artifact based on # matching key & storage even if the hash is different # we do this here so that we don't have to do an additional query later # see the `previous_artifact_version` variable below queryset_same_hash_or_same_key = artifacts_qs.filter( ~Q(branch_id=-1), Q(hash=hash) | Q(key=key, storage=storage), ).order_by("-created_at") queryset_same_hash = queryset_same_hash_or_same_key.filter(hash=hash) artifact_with_same_hash_exists = queryset_same_hash.count() > 0 if key is not None and not is_replace: if ( not artifact_with_same_hash_exists and queryset_same_hash_or_same_key.count() > 0 ): logger.important( f"creating new artifact version for key '{key}' in storage '{storage.root}'" ) previous_artifact_version = queryset_same_hash_or_same_key[0] if artifact_with_same_hash_exists: artifact_with_same_hash = queryset_same_hash[0] logger.important( f"returning artifact with same hash: {artifact_with_same_hash}; to track this artifact as an input, use: ln.Artifact.get()" ) return artifact_with_same_hash else: return size, hash, hash_type, n_files, previous_artifact_version def check_path_in_existing_storage( path: Path | UPath, check_hub_register_storage: bool = False, using_key: str | None = None, ) -> Storage | None: for storage in Storage.objects.using(using_key).order_by(Length("root").desc()): # if path is part of storage, return it if _s().check_path_is_child_of_root(path, root=storage.root): return storage # we don't see parents registered in the db, so checking the hub # just check for 2 writable cloud protocols, maybe change in the future if check_hub_register_storage and getattr(path, "protocol", None) in {"s3", "gs"}: result = select_storage_or_parent(path.as_posix()) if result is not None: return Storage(**result, _skip_preparation=True).save() return None def get_relative_path_to_directory( path: PurePath | Path | UPath, directory: PurePath | Path | UPath ) -> PurePath | Path | UPath: if isinstance(directory, UPath) and not isinstance(directory, LocalPathClasses): # this is safer for cloud paths such as http paths relpath = PurePath( path.as_posix().replace(directory.as_posix(), "").lstrip("/") ) elif isinstance(directory, LocalPathClasses): relpath = path.resolve().relative_to(directory.resolve()) # type: ignore elif isinstance(directory, PurePath): relpath = path.relative_to(directory) else: raise TypeError("Directory not of type Path or UPath") return relpath def get_artifact_kwargs_from_data( *, data: Path | UPath | str | pd.DataFrame | ScverseDataStructures, 
key: str | None, run: Run | None, format: str | None, provisional_uid: str, version_tag: str | None, storage: Storage, using_key: str | None = None, is_replace: bool = False, skip_check_exists: bool = False, overwrite_versions: bool | None = None, skip_hash_lookup: bool = False, to_disk_kwargs: dict[str, Any] | None = None, key_is_virtual: bool | None = None, ): memory_rep, path, suffix, storage, use_existing_storage_key = process_data( provisional_uid, data, format, key, storage, using_key, skip_check_exists, is_replace=is_replace, to_disk_kwargs=to_disk_kwargs, ) check_path_in_storage = False real_key = None if use_existing_storage_key: inferred_key = get_relative_path_to_directory( path=path, directory=UPath(storage.root) ).as_posix() if key is None: key = inferred_key elif key != inferred_key: real_key = inferred_key check_path_in_storage = True else: storage = storage stat_or_artifact = get_stat_or_artifact( path=path, storage=storage, key=key, instance=using_key, is_replace=is_replace, skip_hash_lookup=skip_hash_lookup, ) if not isinstance(path, LocalPathClasses): local_filepath = None cloud_filepath = path else: local_filepath = path cloud_filepath = None privates = { "local_filepath": local_filepath, "cloud_filepath": cloud_filepath, "memory_rep": memory_rep, "check_path_in_storage": check_path_in_storage, } if isinstance(stat_or_artifact, Artifact): existing_artifact = stat_or_artifact # if the artifact was unsuccessfully saved, we want to # enable re-uploading after returning the artifact object # the upload is triggered by whether the privates are returned if existing_artifact._storage_ongoing: privates["key"] = key returned_privates = privates # re-upload necessary else: returned_privates = {"key": key} returned_privates["is_artifact_storage_managed_by_current_instance"] = ( existing_artifact.storage.instance_uid == setup_settings.instance.uid ) return existing_artifact, returned_privates else: size, hash, hash_type, n_files, revises = stat_or_artifact # update local path if revises is not None: # update provisional_uid provisional_uid, revises = create_uid(revises=revises, version_tag=version_tag) if settings.cache_dir in path.parents: path = path.rename(path.with_name(f"{provisional_uid}{suffix}")) privates["local_filepath"] = path log_storage_hint( check_path_in_storage=check_path_in_storage, storage=storage, key=key, uid=provisional_uid, suffix=suffix, is_dir=n_files is not None, ) if overwrite_versions is None: overwrite_versions = n_files is not None if check_path_in_storage: # True here means that we have a path in an existing storage with a virtual key real_key_is_set = real_key is not None if key_is_virtual is not None and key_is_virtual != real_key_is_set: raise ValueError( f"Passing a path in an existing storage {'with' if real_key_is_set else 'without'} " f"a virtual key and _key_is_virtual={key_is_virtual} is incompatible." ) # we use an actual storage key if key is not provided explicitly set_key_is_virtual = real_key_is_set else: # do we use a virtual or an actual storage key? 
set_key_is_virtual = ( settings.creation._artifact_use_virtual_keys if key_is_virtual is None else key_is_virtual ) # needed to check if the artifact storage is managed by the current instance on artifact init privates["is_artifact_storage_managed_by_current_instance"] = ( storage.instance_uid == setup_settings.instance.uid ) kwargs = { "uid": provisional_uid, "suffix": suffix, "hash": hash, "_hash_type": hash_type, "key": key, "size": size, "storage_id": storage.id, "n_files": n_files, "_overwrite_versions": overwrite_versions, # True for folder, False for file "n_observations": None, # to implement "run_id": run.id if run is not None else None, "run": run, "_key_is_virtual": set_key_is_virtual, "revises": revises, "_real_key": real_key, } return kwargs, privates def log_storage_hint( *, check_path_in_storage: bool, storage: Storage | None, key: str | None, uid: str, suffix: str, is_dir: bool, ) -> None: hint = "" if check_path_in_storage: display_root = storage.root # type: ignore # check whether path is local if fsspec.utils.get_protocol(storage.root) == "file": # type: ignore # if it's a local path, check whether it's in the current working directory root_path = Path(storage.root) # type: ignore if _s().check_path_is_child_of_root(root_path, Path.cwd()): # only display the relative path, not the fully resolved path display_root = root_path.relative_to(Path.cwd()) # type: ignore hint += f"path in storage '{display_root}'" # type: ignore else: hint += "path content will be copied to default storage upon `save()`" if key is None: storage_key = _s().auto_storage_key_from_artifact_uid(uid, suffix, is_dir) hint += f" with key `None` ('{storage_key}')" else: hint += f" with key '{key}'" logger.hint(hint) def data_is_dataframe(data: Any) -> bool: # TODO: maybe check also for pandas.DataFrame subclasses, # but in this case also infer_suffix should be updated return with_package_obj(data, "DataFrame", "pandas", lambda obj: True)[0] def data_is_scversedatastructure( data: ScverseDataStructures | AnyPathStr, structure_type: Literal["AnnData", "MuData", "SpatialData"] | None = None, cloud_warning: bool = True, ) -> bool: """Determine whether a specific in-memory object or a path is any or a specific scverse data structure.""" file_suffix = None if structure_type == "AnnData": file_suffix = ".h5ad" elif structure_type == "MuData": file_suffix = ".h5mu" # SpatialData does not have a unique suffix but `.zarr` # AnnData allows both AnnDataAccessor and AnnData class_name = data.__class__.__name__ if structure_type is None: return any( class_name in (["AnnData", "AnnDataAccessor"] if cl_name == "AnnData" else [cl_name]) for cl_name in ["AnnData", "MuData", "SpatialData"] ) elif class_name in ( ["AnnData", "AnnDataAccessor"] if structure_type == "AnnData" else [structure_type] ): return True data_type = structure_type.lower() if isinstance(data, (str, Path, UPath)): data_path = UPath(data) if file_suffix in data_path.suffixes: return True if data_path.suffix == ".zarr": type_suffix = f".{data_type}" if type_suffix in data_path.suffixes: return True # check only for local, expensive for cloud if fsspec.utils.get_protocol(data_path.as_posix()) == "file": return ( _identify_zarr_type( data_path if structure_type == "AnnData" else data, check=True if structure_type == "AnnData" else False, ) == data_type ) elif cloud_warning: logger.warning( f"we do not check whether cloud zarr is {structure_type}" ) return False return False def data_is_soma_experiment(data: SOMAExperiment | AnyPathStr) -> bool: # We are not 
importing tiledbsoma here to keep loaded modules minimal if hasattr(data, "__class__") and data.__class__.__name__ == "Experiment": return True if isinstance(data, (str, Path, UPath)): return UPath(data).suffix == ".tiledbsoma" return False def check_otype_artifact( data: AnyPathStr | pd.DataFrame | ScverseDataStructures, otype: str | None = None, cloud_warning: bool = True, ) -> str: if otype is not None: return otype if isinstance(data, (str, Path, UPath)): is_pathlike = True suffix = UPath(data).suffix else: is_pathlike = False suffix = None if (is_pathlike and suffix in {".parquet", ".csv", ".ipc"}) or data_is_dataframe( data ): logger.warning("data is a DataFrame, please use .from_dataframe()") otype = "DataFrame" return otype if data_is_scversedatastructure(data, "AnnData", cloud_warning): if not is_pathlike: logger.warning("data is an AnnData, please use .from_anndata()") otype = "AnnData" elif data_is_scversedatastructure(data, "MuData", cloud_warning): if not is_pathlike: logger.warning("data is a MuData, please use .from_mudata()") otype = "MuData" elif data_is_scversedatastructure(data, "SpatialData", cloud_warning): if not is_pathlike: logger.warning("data is a SpatialData, please use .from_spatialdata()") otype = "SpatialData" elif not is_pathlike: raise TypeError("data has to be a string, Path, UPath") return otype def populate_subsequent_run(record: Artifact | Collection, run: Run | None) -> None: if run is None: return if record.run is None: record.run = run elif record.run != run: record.recreating_runs.add(run) record._subsequent_run_id = run.id # also see current_run() in core._data def get_run(run: Run | None) -> Run | None: from ..core._context import context from ..core._functions import get_current_tracked_run if run is None: run = get_current_tracked_run() if run is None: run = context.run if run is None and not settings.creation.artifact_silence_missing_run_warning: isettings = setup_settings.instance if not (isettings._is_clone or isettings.is_read_only_connection): logger.warning(WARNING_RUN_TRANSFORM) # suppress run by passing False elif not run: run = None return run def save_staged_schemas(self: Artifact) -> None: if hasattr(self, "_staged_schemas"): from lamindb.models._feature_manager import get_schema_by_slot_ existing_staged_schemas = get_schema_by_slot_(self) saved_staged_schemas = {} for key, schema in self._staged_schemas.items(): if isinstance(schema, Schema) and schema._state.adding: schema.save() saved_staged_schemas[key] = schema if key in existing_staged_schemas: # remove existing feature set on the same slot self.schemas.remove(existing_staged_schemas[key]) if len(saved_staged_schemas) > 0: s = "s" if len(saved_staged_schemas) > 1 else "" display_schema_keys = ",".join( f"'{key}'" for key in saved_staged_schemas.keys() ) logger.save( f"saved {len(saved_staged_schemas)} feature set{s} for slot{s}:" f" {display_schema_keys}" ) def save_schema_links(self: Artifact) -> None: from lamindb.models.save import bulk_create if hasattr(self, "_staged_schemas"): links = [] for slot, schema in self._staged_schemas.items(): kwargs = { "artifact_id": self.id, "schema_id": schema.id, "slot": slot, } links.append(Artifact.schemas.through(**kwargs)) bulk_create(links, ignore_conflicts=True) def validate_feature(feature: Feature, records: list[SQLRecord]) -> None: """Validate feature record, adjust feature.dtype based on labels records.""" if not isinstance(feature, Feature): raise TypeError("feature has to be of type Feature") if feature._state.adding: registries = 
{record.__class__.__get_name_with_module__() for record in records} registries_str = "|".join(registries) msg = f"ln.Feature(name='{feature.name}', type='cat[{registries_str}]').save()" raise ValidationError(f"Feature not validated. If it looks correct: {msg}") def get_labels( self, feature: Feature, mute: bool = False, flat_names: bool = False, ) -> QuerySet | dict[str, QuerySet] | list: """{}""" # noqa: D415 from .record import Record if not isinstance(feature, Feature): raise TypeError("feature has to be of type Feature") dtype_str = feature._dtype_str if dtype_str is None or not dtype_str.startswith("cat["): raise ValueError("feature does not have linked labels") registries_to_check = dtype_str.replace("cat[", "").rstrip("]").split("|") if len(registries_to_check) > 1 and not mute: logger.warning("labels come from multiple registries!") # return an empty query set if self.id is still None if self.id is None: return QuerySet(self.__class__) qs_by_registry = {} for registry in registries_to_check: # currently need to distinguish between ULabel and non-ULabel, because # we only have the feature information for Label if registry in {"ULabel", "Record"}: links_to_labels = get_label_links(self, registry, feature) label_ids = [ (link.ulabel_id if registry == "ULabel" else link.record_id) for link in links_to_labels ] model = ULabel if registry == "ULabel" else Record qs_by_registry[registry] = model.objects.using(self._state.db).filter( id__in=label_ids ) elif registry in self.features._accessor_by_registry: qs_by_registry[registry] = getattr( self, self.features._accessor_by_registry[registry] ).all() if flat_names: # returns a flat list of names from .sqlrecord import get_name_field values = [] for v in qs_by_registry.values(): values += v.to_list(get_name_field(v)) return values if len(registries_to_check) == 1 and registry in qs_by_registry: return qs_by_registry[registry] else: return qs_by_registry def add_labels( self, records: SQLRecord | list[SQLRecord] | QuerySet | Iterable, feature: Feature | None = None, *, field: StrField | None = None, from_curator: bool = False, ) -> None: """{}""" # noqa: D415 if self._state.adding: raise ValueError("Please save the artifact/collection before adding a label!") if isinstance(records, (QuerySet, QuerySet.__base__)): # need to have both records = records.to_list() if isinstance(records, (str, SQLRecord)): records = [records] if not isinstance(records, list): # avoids warning for pd Series records = list(records) # create records from values if len(records) == 0: return None if isinstance(records[0], str): # type: ignore records_validated = [] # feature is needed if we want to create records from values if feature is None: raise ValueError( "Please pass a feature, e.g., via: label = ln.ULabel(name='my_label'," " feature=ln.Feature(name='my_feature'))" ) dtype_str = feature._dtype_str if dtype_str.startswith("cat["): orm_dict = dict_module_name_to_model_name(Artifact) for reg in dtype_str.replace("cat[", "").rstrip("]").split("|"): registry = orm_dict.get(reg) records_validated += registry.from_values(records, field=field) # feature doesn't have registries and therefore can't create records from values # ask users to pass records if len(records_validated) == 0: raise ValueError( "Please pass a record (a `SQLRecord` object), not a string, e.g., via:" " label" f" = ln.Record(name='{records[0]}')" # type: ignore ) records = records_validated for record in records: if record._state.adding: raise ValidationError( f"{record} not validated. 
If it looks correct: record.save()" ) if feature is None: d = dict_related_model_to_related_name(self.__class__) # strategy: group records by registry to reduce number of transactions records_by_related_name: dict = {} for record in records: related_name = d.get(record.__class__.__get_name_with_module__()) if related_name is None: raise ValueError(f"Can't add labels to {record.__class__} record!") if related_name not in records_by_related_name: records_by_related_name[related_name] = [] records_by_related_name[related_name].append(record) for related_name, records in records_by_related_name.items(): getattr(self, related_name).add(*records) else: validate_feature(feature, records) # type:ignore records_by_registry = defaultdict(list) schemas = self.schemas.filter(itype="Feature") internal_features = set() # type: ignore if len(schemas) > 0: for schema in schemas: internal_features = internal_features.union( set(schema.members.values_list("name", flat=True)) ) # type: ignore for record in records: records_by_registry[record.__class__.__get_name_with_module__()].append( record ) for registry_name, records in records_by_registry.items(): if not from_curator and feature.name in internal_features: raise ValidationError( "Cannot manually annotate a feature measured *within* the dataset. Please use a Curator." ) dtype_str = feature._dtype_str if registry_name not in dtype_str: if not dtype_str.startswith("cat"): raise ValidationError( f"Feature {feature.name} needs dtype='cat' for label annotation, currently has dtype='{dtype_str}'" ) if registry_name not in dtype_str: new_dtype = dtype_str.rstrip("]") + f"|{registry_name}]" raise ValidationError( f"Label type {registry_name} is not valid for Feature(name='{feature.name}', dtype='{dtype_str}'), consider a feature with dtype='{new_dtype}'" ) if registry_name not in self.features._accessor_by_registry: logger.warning(f"skipping {registry_name}") continue if len(records) == 0: continue features_labels = { registry_name: [(feature, label_record) for label_record in records] } self.features._add_label_feature_links( features_labels, ) def delete_permanently(artifact: Artifact, storage: bool | None, using_key: str): # need to grab file path before deletion try: path, _ = _s().filepath_from_artifact(artifact, using_key) except OSError: # we can still delete the record logger.warning("Could not get path") storage = False # only delete in storage if DB delete is successful # DB delete might error because of a foreign key constraint violated etc. 
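# Illustration only (assuming permanent deletion is reached via the public
# `Artifact.delete` API; the key below is hypothetical):
#
#     artifact = ln.Artifact.get(key="examples/my_file.parquet")
#     artifact.delete(permanent=True)  # asks whether to also delete the data in storage
#     artifact.delete(permanent=True, storage=False)  # keep the file/folder in storage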
if artifact._overwrite_versions and artifact.is_latest: logger.important( "deleting all versions of this artifact because they all share the same store" ) # artifact.versions pulls only versions that are not in trash # this query set below contains all versions including those that are in trash versions = Artifact.objects.using(artifact._state.db).filter( uid__startswith=artifact.stem_uid ) for version in versions: _delete_skip_storage(version) else: artifact._delete_skip_storage() # by default do not delete storage if deleting only a previous version # and the underlying store is mutable if artifact._overwrite_versions and not artifact.is_latest: delete_in_storage = False if storage: logger.warning( "storage argument is ignored; can't delete store of a previous version if overwrite_versions is True" ) elif artifact.key is None or ( artifact._key_is_virtual and artifact._real_key is None ): # do not ask for confirmation also if storage is None delete_in_storage = storage is None or storage else: # for artifacts with non-virtual semantic storage keys (key is not None) # ask for extra-confirmation if storage is None # the wording here is critical to avoid accidental deletions if storage is None: response = input( f"Artifact record deleted. Do you ALSO want to delete the data in storage at {path}? (y/n) You can't undo" " this action." ) delete_in_storage = response == "y" else: delete_in_storage = storage if not delete_in_storage: logger.important(f"a file/folder remains here: {path}") # we don't yet have logic to bring back the deleted metadata record # in case storage deletion fails - this is important for ACID down the road if delete_in_storage: delete_msg = _s().delete_storage(path, raise_file_not_found_error=False) if delete_msg != "did-not-delete": logger.success(f"deleted {colors.yellow(f'{path}')}") class LazyArtifact: """Lazy artifact for streaming to auto-generated internal paths. This is needed when it is desirable to stream to a `lamindb` auto-generated internal path and register the path as an artifact (see :class:`~lamindb.Artifact`). This object creates a real artifact on `.save()` with the provided arguments. Args: suffix: The suffix for the auto-generated internal path overwrite_versions: Whether to overwrite versions. **kwargs: Keyword arguments for the artifact to be created. Examples: Create a lazy artifact, write to the path and save to get a real artifact:: lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr") zarr.open(lazy.path, mode="w")["test"] = np.array(["test"]) # stream to the path artifact = lazy.save() """ def __init__(self, suffix: str, overwrite_versions: bool, **kwargs): self.kwargs = kwargs self.kwargs["overwrite_versions"] = overwrite_versions if (key := kwargs.get("key")) is not None and extract_suffix_from_path( PurePosixPath(key) ) != suffix: raise ValueError( "The suffix argument and the suffix of key should be the same." 
) uid, _ = create_uid(n_full_id=20) storage_key = _s().auto_storage_key_from_artifact_uid( uid, suffix, overwrite_versions=overwrite_versions ) storepath = setup_settings.storage.root / storage_key self._path = storepath @property def path(self) -> UPath: return self._path def save(self, upload: bool | None = None, **kwargs) -> Artifact: artifact = Artifact(self.path, _is_internal_call=True, **self.kwargs) return artifact.save(upload=upload, **kwargs) def __repr__(self) -> str: # pragma: no cover show_kwargs = {k: v for k, v in self.kwargs.items() if v is not None} return ( f"LazyArtifact object with\n path: {self.path}\n arguments: {show_kwargs}" ) T = TypeVar("T", bound=BaseSQLRecord) def _sqlrecord_or_id( model: type[T], sqlrecord: T | None, sqlrecord_id: int | None, check_type: bool = True, ) -> T | None: if sqlrecord is not None and sqlrecord_id is not None: raise ValueError( f"Do not pass both {model.__name__} and its id at the same time." ) if sqlrecord is None and sqlrecord_id is None: return None elif sqlrecord is not None: assert not check_type or isinstance(sqlrecord, model), ( f"Expected {model.__name__}, got {type(sqlrecord).__name__}." ) return sqlrecord elif sqlrecord_id is not None: return model.objects.get(id=sqlrecord_id) class Artifact(SQLRecord, IsVersioned, TracksRun, TracksUpdates): """Datasets & models stored as files, folders, or arrays. Some artifacts are table- or array-like, e.g., when stored as `.parquet`, `.h5ad`, `.zarr`, or `.tiledb`. Args: path: `AnyPathStr` A path to a local or remote folder or file from which to create the artifact. key: `str | None = None` A key within the storage location, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family. description: `str | None = None` A description. kind: `Literal["dataset", "model"] | str | None = None` Distinguish models from datasets from other files & folders. features: `dict | None = None` External features to annotate via :class:`~lamindb.models.FeatureManager.set_values`. schema: `Schema | None = None` A schema to validate features. revises: `Artifact | None = None` Previous version of the artifact. An alternative to passing `key` when creating a new version. overwrite_versions: `bool | None = None` Whether to overwrite versions. Defaults to `True` for folders and `False` for files. run: `Run | bool | None = None` The run that creates the artifact. If `False`, suppress tracking the run. If `None`, infer the run from the global run context. branch: `Branch | None = None` The branch of the artifact. If `None`, uses the current branch. space: `Space | None = None` The space of the artifact. If `None`, uses the current space. storage: `Storage | None = None` The storage location for the artifact. If `None`, uses the default (:attr:`~lamindb.core.Settings.storage`). skip_hash_lookup: `bool = False` Skip the hash lookup so that a new artifact is created even if an artifact with the same hash already exists. Empty files are always treated as if this were `True` because empty content hashes are not used for deduplication. Examples: Create an artifact **from a local file or folder**:: artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save() artifact = ln.Artifact("./my_folder", key="project1/my_folder").save() Calling `.save()` copies or uploads the file to the default storage location of your lamindb instance. 
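To double-check where the data landed, you can inspect the storage location and path (a minimal sketch; the printed values depend on your instance)::

            artifact.storage.root  # the registered storage root, e.g., "s3://my-bucket" or a local directory
            artifact.path  # the artifact's full path within that storage location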
If you create an artifact **from a remote file or folder**, lamindb registers the S3 `key` and avoids copying the data:: artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save() # can omit key/description because file is remote If you then want to query & access the artifact later on, this is how you do it:: artifact = ln.Artifact.get(key="examples/my_file.parquet") cached_path = artifact.cache() # sync to local cache & get local path If the storage format supports it, you can load the artifact directly into memory or query it through a streaming interface, e.g., for parquet files:: df = artifact.load() # load parquet file as DataFrame pyarrow_dataset = artifact.open() # open a streaming file-like object To bulk-create artifacts for every file in a directory and **group them in a folder**, use :meth:`~lamindb.Artifact.from_dir`:: artifacts = ln.Artifact.from_dir("project_alpha/run_001").save() # create one artifact per file in the directory artifacts = ln.Artifact.filter(key__startswith="project_alpha/run_001/") # query ingested artifacts via the folder prefix To create a **versioned immutable collection** of artifacts for a data release, use :class:`~lamindb.Collection`:: collection = ln.Collection(artifacts, key="project_alpha/run_001").save() .. dropdown:: Virtual folders (key prefixes) vs. :class:`~lamindb.Collection` objects - prefix query on `key`: If a colleague adds a new file to that prefix tomorrow, your `filter(key__startswith=...)` result will change. - collection: A collection object provides a `uid` for every version and its content won't change. If you want to **validate & annotate** a dataframe or an array using the feature & label registries, pass `schema` to one of the `.from_dataframe()`, `.from_anndata()`, ... constructors:: artifact = ln.Artifact.from_dataframe( "./my_file.parquet", key="my_dataset.parquet", schema="valid_features" ).save() To annotate by **external features**:: artifact = ln.Artifact("./my_file.parquet", features={"cell_type_by_model": "T cell"}).save() You can make a **new version** of an artifact by passing an existing `key`:: artifact_v2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save() artifact_v2.versions.to_dataframe() # see all versions You can write artifacts to **non-default storage locations** by passing the `storage` argument:: storage_loc = ln.Storage.get(root="s3://my_bucket") # get storage location, or create via ln.Storage(root="s3://my_bucket").save() ln.Artifact("./my_file.parquet", key="examples/my_file.parquet", storage=storage_loc).save() # upload to s3://my_bucket Notes: .. _storage-formats-note: .. dropdown:: Storage formats & object types The `Artifact` registry tracks the storage format via :attr:`suffix` and an abstract object type via :attr:`otype`. 
================ ====================================== ================ ==================================================================== description :attr:`suffix` :attr:`otype` Python type examples ================ ====================================== ================ ==================================================================== table `.csv`, `.tsv`, `.parquet`, `.ipc` `"DataFrame"` `pandas.DataFrame`, `polars.DataFrame`, `pyarrow.Table` annotated matrix `.h5ad`, `.zarr`, `.h5mu` `"AnnData"` `anndata.AnnData` stacked matrix `.zarr` `"MuData"` `mudata.MuData` `.tiledbsoma` `"tiledbsoma"` `tiledbsoma.Experiment` spatial data `.zarr` `"SpatialData"` `spatialdata.SpatialData` generic arrays `.h5`, `.zarr`, `.tiledb` --- `h5py.Dataset`, `zarr.Array`, `tiledb.Array` unstructured `.fastq`, `.pdf`, `.vcf`, `.html` --- --- ================ ====================================== ================ ==================================================================== You can map storage formats onto **R types**, e.g., an `AnnData` might be accessed via `anndataR`. Because `otype` accepts any `str`, you can define custom object types that enable queries & logic that you need, e.g., `"SingleCellExperiment"` or `"MyCustomZarrDataStructure"`. LaminDB makes some default choices (e.g., serialize a `DataFrame` as a `.parquet` file). .. dropdown:: Will artifacts get duplicated? If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact. Exception: empty files are not deduplicated by hash and create a new artifact. In concurrent workloads where the same artifact is created repeatedly at the exact same time, `.save()` detects the duplication and will return the existing artifact. .. dropdown:: I cannot come up with a good file name, can I avoid mapping artifacts into a hierarchy? Sometimes you want to **avoid mapping the artifact into a path hierarchy**. You can do so by omitting the `key` argument and only passing `description`. However, note that a shared `description` does not trigger mapping artifacts into the same version family. artifact = ln.Artifact("./my_folder", description="My folder").save() artifact_v2 = ln.Artifact("./my_folder", revises=old_artifact).save() # need to version based on `revises`, a shared description does not trigger a new version .. dropdown:: Why does the constructor look the way it looks? It's inspired by APIs building on AWS S3. Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument. In `boto3 `__:: # signature: S3.Bucket.upload_file(filepath, key) import boto3 s3 = boto3.resource('s3') bucket = s3.Bucket('mybucket') bucket.upload_file('/tmp/hello.txt', 'hello.txt') In `quilt3 `__:: # signature: quilt3.Bucket.put_file(key, filepath) import quilt3 bucket = quilt3.Bucket('mybucket') bucket.put_file('hello.txt', '/tmp/hello.txt') See Also: :class:`~lamindb.Storage` Storage locations for artifacts. :class:`~lamindb.Collection` Collections of artifacts. :meth:`~lamindb.Artifact.from_dir` Bulk-create artifacts for each file in a directory. :meth:`~lamindb.Artifact.from_dataframe` Create an artifact from a `DataFrame`. :meth:`~lamindb.Artifact.from_anndata` Create an artifact from an `AnnData`. :meth:`~lamindb.Artifact.from_spatialdata` Create an artifact from a `SpatialData`. :meth:`~lamindb.Artifact.from_mudata` Create an artifact from a `MuData`. :meth:`~lamindb.Artifact.from_tiledbsoma` Create an artifact from a `tiledbsoma` store. 
:meth:`~lamindb.Artifact.from_lazy` Create a lazy artifact for streaming to auto-generated internal paths. """ class Meta(SQLRecord.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" constraints = [ # a simple hard unique constraint on `hash` clashes with the fact # that pipelines sometimes aim to ingest the exact same file in different # folders # the conditional composite constraint allows duplicating files in different parts of the # file hierarchy, but errors if the same file is to be registered with the same key # In SQL, NULL values are treated specially in unique constraints. # Multiple NULL values are not considered equal to each other for uniqueness purposes. # For non-NULL keys models.UniqueConstraint( fields=["storage", "key", "hash"], condition=models.Q(key__isnull=False), name="unique_artifact_storage_key_hash_not_null", ), # For NULL keys (only storage + hash need to be unique) models.UniqueConstraint( fields=["storage", "hash"], condition=models.Q(key__isnull=True), name="unique_artifact_storage_hash_null_key", ), ] _TRACK_FIELDS = ("space_id", "is_latest", "suffix", "key") _len_full_uid: int = 20 _len_stem_uid: int = 16 _name_field: str = "key" @property def features(self) -> FeatureManager: """Feature manager. Annotate an artifact with features:: artifact.features.set_values({ "species": "human", "scientist": ['Barbara McClintock', 'Edgar Anderson'], "temperature": 27.6, "experiment": "Experiment 1" }) Query artifacts by features:: ln.Artifact.filter(scientist="Barbara McClintock") Get all feature annotations as a dictionary:: d = artifact.features.get_values() Get a value for a single feature:: organism = artifact.features["species"] # returns an Organism object, not "human" temperature = artifact.features["temperature"] # returns a temperature value, a float Note that `get_values()` returns identifiers for categorical values (for example, the string "human" for an `Organism`), while the `[]` accessor returns the corresponding Python object. See also :meth:`~lamindb.models.FeatureManager.set_values`. .. dropdown:: Dataset features vs. external features Features may or may not be stored in the dataset, i.e., the artifact content in storage. If you pass a schema to :class:`~lamindb.Artifact.from_dataframe` you validate the columns of the `DataFrame` and annotate with values parsed from these columns. `artifact.features.set_values()`, by contrast, does **not** validate the content of the artifact. """ from ._feature_manager import FeatureManager return FeatureManager(self) @property def labels(self) -> LabelManager: """Label manager. A way to access all label annotations of an artifact, irrespective of their type. To annotate with labels, use the type-specific accessor, for example:: experiment = ln.Record(name="Experiment 1").save() artifact.records.add(experiment) project = ln.Project(name="Project A").save() artifact.projects.add(project) """ from ._label_manager import LabelManager return LabelManager(self) id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=_len_full_uid ) """A universal random id.""" # the max length of 1024 equals the max length of a S3 key key: str | None = CharField(db_index=True, null=True, max_length=1024) """A (virtual) relative file path within the artifact's storage location. Setting a `key` is useful to automatically group artifacts into a version family. 
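For example (a minimal sketch; keys and file names are illustrative)::

        v1 = ln.Artifact("./data.parquet", key="examples/data.parquet").save()
        v2 = ln.Artifact("./data_v2.parquet", key="examples/data.parquet").save()  # same key creates a new version
        v2.versions.to_dataframe()  # list all versions in the family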
LaminDB defaults to a virtual file path to make renaming of data in object storage easy. If you register existing files in a storage location, the `key` equals the actual filepath on the underlying filesystem or object store. """ _real_key: str | None = CharField(db_index=True, null=True, max_length=1024) """An optional real storage key.""" # db_index on description because sometimes we query for equality in the case of artifacts description: str | None = TextField(null=True, db_index=True) """A description.""" storage: Storage = ForeignKey( Storage, PROTECT, related_name="artifacts", editable=False ) """Storage location, e.g. an S3 or GCP bucket or a local directory ← :attr:`~lamindb.Storage.artifacts`.""" suffix: str = CharField(max_length=30, db_index=True, editable=False) # Initially, we thought about having this be nullable to indicate folders # But, for instance, .zarr is stored in a folder that ends with a .zarr suffix """The path suffix or an empty string if no suffix exists. This is either a file suffix (`".csv"`, `".h5ad"`, etc.) or the empty string "". """ kind: ArtifactKind | str | None = CharField( max_length=20, db_index=True, null=True, ) """:class:`~lamindb.base.types.ArtifactKind` or custom `str` value (default `None`).""" otype: ( Literal["DataFrame", "AnnData", "MuData", "SpatialData", "tiledbsoma"] | str | None ) = CharField(max_length=64, db_index=True, null=True, editable=False) """The object type represented as a string. The field is automatically set when using the `from_dataframe()`, `from_anndata()`, ... constructors. Unstructured artifacts have `otype=None`. The field also accepts custom `str` values to allow for building logic around them in third-party packages. See section `storage formats & object types `__ for more background. """ size: int | None = BigIntegerField( null=True, db_index=True, default=None, editable=False ) """The size in bytes. Examples: 1KB is 1e3 bytes, 1MB is 1e6, 1GB is 1e9, 1TB is 1e12 etc. """ hash: str | None = CharField( max_length=HASH_LENGTH, db_index=True, null=True, editable=False ) """The hash or pseudo-hash of the artifact content in storage. Useful to ascertain integrity and avoid duplication. Different versions of the artifact have different hashes. """ n_files: int | None = BigIntegerField( null=True, db_index=True, default=None, editable=False ) """The number of files for folder-like artifacts. Is `None` for file-like artifacts. Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`. """ n_observations: int | None = BigIntegerField( null=True, db_index=True, default=None, editable=False ) """The number of observations in this artifact. Typically, this denotes the first array dimension.
""" _hash_type: str | None = CharField( max_length=30, db_index=True, null=True, editable=False ) """Type of hash.""" run: Run | None = ForeignKey( Run, PROTECT, related_name="output_artifacts", null=True, default=None, editable=False, ) """The run that created the artifact ← :attr:`~lamindb.Run.output_artifacts`.""" input_of_runs: RelatedManager[Run] = models.ManyToManyField( Run, related_name="input_artifacts" ) """The runs that use this artifact as an input ← :attr:`~lamindb.Run.input_artifacts`.""" recreating_runs: RelatedManager[Run] = models.ManyToManyField( "Run", related_name="recreated_artifacts", ) """The runs that re-created the artifact after its initial creation ← :attr:`~lamindb.Run.recreated_artifacts`.""" collections: RelatedManager[Collection] """The collections that this artifact is part of ← :attr:`~lamindb.Collection.artifacts`.""" schema: Schema | None = ForeignKey( Schema, PROTECT, null=True, default=None, related_name="validated_artifacts", ) """The validating schema of this artifact ← :attr:`~lamindb.Schema.validated_artifacts`. The validating schema is helpful to query artifacts that were validated by the same schema. """ schemas: RelatedManager[Schema] = models.ManyToManyField( Schema, related_name="artifacts", through="ArtifactSchema" ) """The inferred schemas of this artifact ← :attr:`~lamindb.Schema.artifacts`. The inferred schemas are helpful to answer the question: "Which features are present in the artifact?" The validating schema typically allows a range of valid actual dataset schemas. The inferred schemas link the actual schemas of the artifact, and are auto-generated by parsing the artifact content during validation. """ json_values: RelatedManager[JsonValue] = models.ManyToManyField( JsonValue, through="ArtifactJsonValue", related_name="artifacts" ) """The feature-indexed JSON values annotating this artifact ← :attr:`~lamindb.JsonValue.artifacts`.""" _key_is_virtual: bool = BooleanField() """Indicates whether `key` is virtual or part of an actual file path.""" # be mindful that below, passing related_name="+" leads to errors _actions: RelatedManager[Artifact] = models.ManyToManyField( "self", symmetrical=False, related_name="_action_targets" ) """The actions to attach for the UI.""" created_by: User = ForeignKey( "lamindb.User", PROTECT, default=current_user_id, related_name="created_artifacts", editable=False, ) """The creator of this artifact ← :attr:`~lamindb.User.created_artifacts`.""" _overwrite_versions: bool = BooleanField(default=None) """See corresponding property `overwrite_versions`.""" ulabels: RelatedManager[ULabel] """The ulabels annotating this artifact ← :attr:`~lamindb.ULabel.artifacts`.""" users: RelatedManager[User] """The users annotating this artifact ← :attr:`~lamindb.User.artifacts`.""" projects: RelatedManager[Project] """The projects annotating this artifact ← :attr:`~lamindb.Project.artifacts`.""" references: RelatedManager[Reference] """The references annotating this artifact ← :attr:`~lamindb.Reference.artifacts`.""" records: RelatedManager[Record] """The records annotating this artifact ← :attr:`~lamindb.Record.artifacts`.""" runs: RelatedManager[Run] """The runs annotating this artifact ← :attr:`~lamindb.Run.artifacts`.""" linked_by_runs: RelatedManager[Run] """The runs linking this artifact ← :attr:`~lamindb.Run.linked_by_artifacts`.""" artifacts: RelatedManager[Artifact] = models.ManyToManyField( "Artifact", through="ArtifactArtifact", symmetrical=False, related_name="linked_by_artifacts", ) """The annotating artifacts of 
this artifact ← :attr:`~lamindb.Artifact.linked_by_artifacts`.""" linked_by_artifacts: RelatedManager[Artifact] """The artifacts annotated by this artifact ← :attr:`~lamindb.Artifact.artifacts`.""" linked_in_records: RelatedManager[Record] = models.ManyToManyField( "Record", through="RecordArtifact", related_name="linked_artifacts" ) """The records linking this artifact as a feature value ← :attr:`~lamindb.Record.linked_artifacts`.""" ablocks: RelatedManager[ArtifactBlock] """Attached blocks ← :attr:`~lamindb.ArtifactBlock.artifact`.""" @overload def __init__( self, path: AnyPathStr, *, key: str | None = None, description: str | None = None, kind: ArtifactKind | str | None = None, features: dict[str, Any] | None = None, schema: Schema | None = None, revises: Artifact | None = None, overwrite_versions: bool | None = None, run: Run | False | None = None, storage: Storage | None = None, branch: Branch | None = None, space: Space | None = None, skip_hash_lookup: bool = False, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): # check whether we are called with db args if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None # now proceed with the user-facing constructor if len(args) > 1: raise ValueError("Only one non-keyword arg allowed: path") if "data" in kwargs: warnings.warn( "`data` argument was renamed to `path` and will be removed in a future release.", DeprecationWarning, stacklevel=2, ) path = kwargs.pop("data") else: path = kwargs.pop("path") if len(args) == 0 else args[0] kind: str = kwargs.pop("kind", None) key: str | None = kwargs.pop("key", None) using_key = kwargs.pop("using_key", None) description: str | None = kwargs.pop("description", None) revises: Artifact | None = kwargs.pop("revises", None) if revises is not None: if not isinstance(revises, Artifact): raise TypeError("`revises` has to be of type `Artifact`") if description is None: description = revises.description overwrite_versions: bool | None = kwargs.pop("overwrite_versions", None) version_tag: str | None = kwargs.pop("version_tag", kwargs.pop("version", None)) features: dict[str, Any] | None = kwargs.pop("features", None) skip_hash_lookup: bool = kwargs.pop("skip_hash_lookup", False) to_disk_kwargs: dict[str, Any] | None = kwargs.pop("to_disk_kwargs", None) format = kwargs.pop("format", None) _key_is_virtual = kwargs.pop("_key_is_virtual", None) _is_internal_call = kwargs.pop("_is_internal_call", False) skip_check_exists = kwargs.pop("skip_check_exists", False) if key is not None and _s().AUTO_KEY_PREFIX in key: raise ValueError( f"Do not pass key that contains a managed storage path in `{_s().AUTO_KEY_PREFIX}`" ) # below is for internal calls that require defining the storage location # ahead of constructing the Artifact if isinstance(path, (str, Path, UPath)) and _s().AUTO_KEY_PREFIX in str(path): if _is_internal_call: if _key_is_virtual is False: raise ValueError( "Do not pass _key_is_virtual=False with _is_internal_call=True." ) is_automanaged_path = True user_provided_key = key key = None else: raise ValueError( f"Do not pass path inside the `{_s().AUTO_KEY_PREFIX}` directory." 
) else: is_automanaged_path = False # validate external features if passed with a schema schema: Schema | None = _sqlrecord_or_id( Schema, kwargs.pop("schema", None), kwargs.pop("schema_id", None) ) if features is not None: self._external_features = features if schema is not None: from lamindb.curators.core import ExperimentalDictCurator validation_schema = schema ExperimentalDictCurator(features, validation_schema).validate() # check_type is False because run can be False also, see get_run run: Run | None | bool = _sqlrecord_or_id( Run, kwargs.pop("run", None), kwargs.pop("run_id", None), check_type=False ) branch: Branch | None = _sqlrecord_or_id( Branch, kwargs.pop("branch", None), kwargs.pop("branch_id", None) ) space: Space | None = _sqlrecord_or_id( Space, kwargs.pop("space", None), kwargs.pop("space_id", None) ) storage: Storage | None = _sqlrecord_or_id( Storage, kwargs.pop("storage", None), kwargs.pop("storage_id", None) ) storage_was_passed = False if storage is not None: storage_was_passed = True elif ( setup_settings.instance.keep_artifacts_local and setup_settings.instance._local_storage is not None ): storage = setup_settings.instance.local_storage.record else: storage = setup_settings.instance.storage.record if space is None: from lamindb import context as run_context if run_context.space is not None: space = run_context.space elif setup_settings.space is not None: space = setup_settings.space # space - storage consistency is also checked in .save() when the space is changed if space is not None and space.id != storage.space_id: if storage_was_passed: logger.warning( "storage argument ignored as storage information from space takes precedence" ) storage_locs_for_space = Storage.filter( space=space, instance_uid=setup_settings.instance.uid ).order_by("id") n_storage_locs_for_space = storage_locs_for_space.count() if n_storage_locs_for_space == 0: raise NoStorageLocationForSpace( "No storage location found for space.\n" "Either create one via ln.Storage(root='create-s3', space=space).save()\n" "Or start managing access to an existing storage location via the space: storage_loc.space = space; storage.save()" ) else: storage = storage_locs_for_space.first() if n_storage_locs_for_space > 1: other_storage_locs = ",".join( f"{s.root}" for s in storage_locs_for_space[1:] ) logger.warning( f"more than one storage location is managed by this instance for space {space},\n" f"choosing root={storage.root}\n" ) logger.important_hint( f"to choose one of the other storage locations ({other_storage_locs}), pass `storage` to the Artifact constructor" ) otype = kwargs.pop("otype") if "otype" in kwargs else None if isinstance(path, str) and path.startswith("s3:///"): # issue in Groovy / nf-lamin producing malformed S3 paths # https://laminlabs.slack.com/archives/C08J590666Q/p1751315027830849?thread_ts=1751039961.479259&cid=C08J590666Q path = path.replace("s3:///", "s3://") otype = check_otype_artifact( data=path, otype=otype, cloud_warning=not _is_internal_call ) if "type" in kwargs: logger.warning("`type` will be removed soon, please use `kind`") kind = kwargs.pop("type") if not len(kwargs) == 0: valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Artifact)]) raise FieldValidationError( f"Only {valid_keywords} can be passed, you passed: {kwargs}" ) if revises is not None and key is not None and revises.key != key: logger.warning(f"renaming artifact from '{revises.key}' to {key}") provisional_uid, revises = create_uid(revises=revises, version_tag=version_tag) run = get_run(run) 
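# Sketch of how a run is typically supplied implicitly rather than passed
# explicitly (assuming the standard `ln.track()` context; illustration only):
#
#     ln.track()  # registers a Transform + Run for the current script or notebook
#     artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
#     assert artifact.run is not None  # inferred from the global run context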
kwargs_or_artifact, privates = get_artifact_kwargs_from_data( data=path, key=key, run=run, format=format, provisional_uid=provisional_uid, version_tag=version_tag, storage=storage, using_key=using_key, skip_check_exists=skip_check_exists, overwrite_versions=overwrite_versions, skip_hash_lookup=skip_hash_lookup, to_disk_kwargs=to_disk_kwargs, key_is_virtual=_key_is_virtual, ) def set_private_attributes(): if path is not None and "local_filepath" in privates: self._local_filepath = privates["local_filepath"] self._cloud_filepath = privates["cloud_filepath"] self._memory_rep = privates["memory_rep"] self._to_store = not privates["check_path_in_storage"] if ( self._to_store and not privates["is_artifact_storage_managed_by_current_instance"] ): raise ValueError( "Cannot create an artifact in a storage location that is not managed by the current instance." ) # an object with the same hash already exists if isinstance(kwargs_or_artifact, Artifact): from .sqlrecord import init_self_from_db, update_attributes init_self_from_db(self, kwargs_or_artifact) # update key from inferred value key = privates.pop("key") # adding "key" here is dangerous because key might be auto-populated attr_to_update = {"description": description} if schema is not None: attr_to_update["schema"] = schema if kwargs_or_artifact._key_is_virtual and kwargs_or_artifact.key is None: attr_to_update["key"] = key elif self.key != key and key is not None: if not self.path.exists(): logger.warning(f"updating previous key {self.key} to new key {key}") self.key = key # Keep tracked state aligned with this internal dedup-time key # normalization so save() doesn't treat it as a user key edit. self._original_values["key"] = key assert self.path.exists(), ( # noqa: S101 f"The underlying file for artifact {self} does not exist anymore, clean up the artifact record." 
) # noqa: S101 else: logger.warning( f"key {self.key} on existing artifact differs from passed key {key}, keeping original key; update manually if needed or pass skip_hash_lookup if you want to duplicate the artifact" ) update_attributes(self, attr_to_update) # an existing artifact might have an imcomplete upload and hence we should # re-populate _local_filepath because this is what triggers the upload set_private_attributes() populate_subsequent_run(self, run) return None else: kwargs = kwargs_or_artifact kwargs["schema"] = schema if revises is None: revises = kwargs_or_artifact.pop("revises") set_private_attributes() if is_automanaged_path and _is_internal_call: kwargs["_key_is_virtual"] = True assert _s().AUTO_KEY_PREFIX in kwargs["key"] # noqa: S101 uid = ( kwargs["key"] .replace(_s().AUTO_KEY_PREFIX, "") .replace(kwargs["suffix"], "") ) kwargs["key"] = user_provided_key if revises is not None: assert uid.startswith(revises.stem_uid) # noqa: S101 if len(uid) == 16: if revises is None: uid += "0000" else: uid, revises = create_uid(revises=revises, version_tag=version_tag) kwargs["uid"] = uid # only set key now so that we don't perform a look-up on it in case revises is passed if revises is not None and revises.key is not None and kwargs["key"] is None: kwargs["key"] = revises.key kwargs["kind"] = kind kwargs["version_tag"] = version_tag kwargs["description"] = description kwargs["branch"] = branch kwargs["space"] = space kwargs["otype"] = otype kwargs["revises"] = revises # this check needs to come down here because key might be populated from an # existing file path during get_artifact_kwargs_from_data() if ( kwargs["key"] is None and kwargs["description"] is None and kwargs["run"] is None ): raise ValueError("Pass one of key, run or description as a parameter") super().__init__(**kwargs) @property def transform(self) -> Transform | None: """Transform whose run created the artifact.""" return self.run.transform if self.run is not None else None @property def overwrite_versions(self) -> bool: """Indicates whether to keep or overwrite versions. It defaults to `False` for file-like artifacts and to `True` for folder-like artifacts. Note that this requires significant storage space for large folders with many duplicated files. Currently, `lamindb` does *not* de-duplicate files across versions as in git, but keeps all files for all versions of the folder in storage. """ return self._overwrite_versions @property def _storage_ongoing(self) -> bool: """Whether the artifact is still in the process of being saved to storage (uploaded for cloud storage). - `True`: write started but not completed - `False`: storage completed or not yet started In the JSON `_aux`field, `True` is represented as `{"so": 1}` and `False` as an absent `"so"` key. """ if self._aux is None: return False if self._aux.get("so") == 1: return True else: return False @_storage_ongoing.setter def _storage_ongoing(self, value: bool | None) -> None: if value is None or value is False: if self._aux is not None and "so" in self._aux: del self._aux["so"] if not self._aux: self._aux = None else: if self._aux is None: self._aux = {} assert value is True self._aux["so"] = 1 @property @deprecated("schemas") def feature_sets(self): return self.schemas @property def path(self) -> UPath: """Path. 
Example:: import lamindb as ln # File in cloud storage, here AWS S3: artifact = ln.Artifact("s3://my-bucket/my-file.csv").save() artifact.path #> S3QueryPath('s3://my-bucket/my-file.csv') # File in local storage: artifact = ln.Artifact("./myfile.csv", key="myfile.csv").save() artifact.path #> PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/myfile.csv') """ filepath, _ = _s().filepath_from_artifact(self, using_key=settings._using_key) return filepath @property def _cache_path(self) -> UPath: filepath, cache_key = _s().filepath_cache_key_from_artifact( self, using_key=settings._using_key ) if isinstance(filepath, LocalPathClasses): return filepath return setup_settings.paths.cloud_to_local_no_update( filepath, cache_key=cache_key ) @strict_classmethod def get( cls, idlike: int | str | None = None, *, key: str | None = None, path: AnyPathStr | None = None, is_run_input: bool | Run = False, **expressions, ) -> Artifact: """Get a single artifact. Args: idlike: Either a uid stub, uid or an integer id. key: An optional key to query for. path: An optional full path to query for, including the storage root. is_run_input: Whether to track this artifact as run input. expressions: Other fields and values passed as Django query expressions. Raises: :exc:`lamindb.errors.DoesNotExist`: In case no matching record is found. See Also: - Guide: :doc:`registries` - Method in `SQLRecord` base class: :meth:`~lamindb.models.SQLRecord.get` Examples: :: artifact = ln.Artifact.get("tCUkRcaEjTjhtozp") # gets latest version for family tCUkRcaEjTjhtozp artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0005") # gets version 0005 for family tCUkRcaEjTjhtozp artifact = ln.Artifact.get(key="examples/my_file.parquet") # gets latest version for a key artifact = ln.Artifact.get(key="examples/my_file.parquet", version="2") # pass a version tag artifact = ln.Artifact.get(path="s3://bucket/folder/adata.h5ad") """ if key is not None: expressions["key"] = key if path is not None: expressions["path"] = path return QuerySet(model=cls).get(idlike, is_run_input=is_run_input, **expressions) @strict_classmethod def filter( cls, *queries, **expressions, ) -> QuerySet: """Query a set of artifacts. Args: *queries: `Q` expressions. **expressions: Features & fields via the Django query syntax. See Also: - Guide: :doc:`docs:registries` Examples: Query by fields:: ln.Artifact.filter(key="examples/my_file.parquet") Query by features:: ln.Artifact.filter(cell_type_by_model__name="T cell") """ # from Registry metaclass return type(cls).filter(cls, *queries, **expressions) @classmethod def from_lazy( cls, suffix: str, overwrite_versions: bool, key: str | None = None, description: str | None = None, run: Run | None = None, **kwargs, ) -> LazyArtifact: """Create a lazy artifact for streaming to auto-generated internal paths. This is needed when it is desirable to stream to a `lamindb` auto-generated internal path and register the path as an artifact. It allows writing directly into the default cloud (or local) storage of the current instance and then saving as an :class:`~lamindb.Artifact`. The lazy artifact object (see :class:`~lamindb.models.LazyArtifact`) creates a real artifact on `.save()` with the provided arguments. Args: suffix: The suffix for the auto-generated internal path overwrite_versions: Whether to overwrite versions. key: An optional key to reference the artifact. description: A description. run: The run that creates the artifact. **kwargs: Other keyword arguments for the artifact to be created.
Examples: Local storage: create a lazy artifact, stream to the path, then save:: lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr") zarr.open(lazy.path, mode="w")["test"] = np.array(["test"]) artifact = lazy.save() Cloud storage (e.g. S3): use `zarr.storage.FsspecStore` to stream arrays:: lazy = ln.Artifact.from_lazy(suffix=".zarr", overwrite_versions=True, key="mydata.zarr") store = zarr.storage.FsspecStore.from_url(lazy.path.as_posix()) group = zarr.open(store, mode="w") group["ones"] = np.ones(3) artifact = lazy.save() """ args = {"key": key, "description": description, "run": run, **kwargs} return LazyArtifact(suffix, overwrite_versions, **args) @classmethod def from_dataframe( cls, df: pd.DataFrame | AnyPathStr, *, key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, schema: Schema | Literal["valid_features"] | None = None, features: dict[str, Any] | None = None, parquet_kwargs: dict[str, Any] | None = None, csv_kwargs: dict[str, Any] | None = None, **kwargs, ) -> Artifact: """Create from `DataFrame`, optionally validate & annotate. Sets `.otype` to `"DataFrame"` and populates `.n_observations`. Args: df: A `DataFrame` object or an `AnyPathStr` pointing to a `DataFrame` in storage, e.g. a `.parquet` or `.csv` file. key: A relative path within default storage, e.g., `"myfolder/myfile.parquet"`. description: A description. revises: An old version of the artifact. run: The run that creates the artifact. schema: A schema that defines how to validate & annotate. features: Additional external features to annotate the artifact via :class:`~lamindb.models.FeatureManager.set_values` (keys can be feature names or `Feature` objects). parquet_kwargs: Additional keyword arguments passed to the `pandas.DataFrame.to_parquet` method, which are passed on to `pyarrow.parquet.ParquetWriter`. csv_kwargs: Additional keyword arguments passed to the `pandas.DataFrame.to_csv` method. Examples: No validation and annotation:: ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet").save() With validation and annotation:: ln.Artifact.from_dataframe(df, key="examples/dataset1.parquet", schema="valid_features").save() Under the hood, this uses the following built-in schema (:func:`~lamindb.examples.schemas.valid_features`):: schema = ln.Schema(name="valid_features", itype="Feature").save() External features: .. literalinclude:: scripts/curate_dataframe_external_features.py :language: python Parquet kwargs: ..
literalinclude:: scripts/test_artifact_parquet.py :language: python """ if "format" not in kwargs and key is not None and key.endswith(".csv"): kwargs["format"] = ".csv" if schema == "valid_features": from lamindb import examples schema = examples.schemas.valid_features() to_disk_kwargs: dict[str, Any] = parquet_kwargs or csv_kwargs artifact = Artifact( # type: ignore path=df, key=key, run=run, description=description, revises=revises, otype="DataFrame", kind="dataset", to_disk_kwargs=to_disk_kwargs, **kwargs, ) if data_is_dataframe(df): artifact.n_observations = len(df) else: # must be a str or path path = create_path(df) if path.suffix == ".parquet": import pyarrow.parquet as pq with path.open("rb") as f: artifact.n_observations = pq.read_metadata(f).num_rows else: # csv/tsv/others have no metadata and would require a full expensive read artifact.n_observations = None if features is not None: artifact._external_features = features if schema is not None: from lamindb.curators.core import DataFrameCurator if not artifact._state.adding and artifact.suffix != ".parquet": logger.warning( f"not re-validating existing artifact as it was stored as {artifact.suffix}, " "which does not maintain categorical dtype information" ) return artifact curator = DataFrameCurator(artifact, schema, features=features) curator.validate() artifact.schema = schema artifact._curator = curator return artifact @classmethod @deprecated("from_dataframe") def from_df( cls, df: pd.DataFrame, *, key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, schema: Schema | None = None, **kwargs, ) -> Artifact: return cls.from_dataframe( df, key=key, description=description, run=run, revises=revises, schema=schema, **kwargs, ) @classmethod def from_anndata( cls, adata: Union[AnnData, AnyPathStr], *, key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, schema: Schema | Literal["ensembl_gene_ids_and_valid_features_in_obs"] | None = None, format: Literal["h5ad", "zarr", "anndata.zarr"] | None = None, h5ad_kwargs: dict[str, Any] | None = None, zarr_kwargs: dict[str, Any] | None = None, **kwargs, ) -> Artifact: """Create from `AnnData`, optionally validate & annotate. Sets `.otype` to `"AnnData"` and populates `.n_observations`. Args: adata: An `AnnData` object or a path of AnnData-like. key: A relative path within default storage, e.g., `"myfolder/myfile.h5ad"`. description: A description. revises: An old version of the artifact. run: The run that creates the artifact. schema: A schema that defines how to validate & annotate. format: Storage format used when writing in-memory `AnnData`. In-memory `AnnData` is first written to cache in this format, then saved to instance storage when calling `.save()`. If `None`, infer from `key` suffix when available, otherwise default to `"h5ad"`. If provided, suffix is formed as `"." + format` (e.g., `"zarr"` -> `".zarr"`). h5ad_kwargs: Additional keyword arguments passed to the `anndata.AnnData.write_h5ad` method when writing in-memory `AnnData` to cache. zarr_kwargs: Additional keyword arguments passed to the `anndata.AnnData.write_zarr` method. when writing in-memory `AnnData` to cache. Use `key` with suffix `.zarr` or pass `format="zarr"` for this to work. See Also: :meth:`~lamindb.Collection` Track collections. :class:`~lamindb.Feature` Track features. 
Example: Write H5AD with custom serialization settings:: ln.Artifact.from_anndata( adata, key="examples/dataset1.h5ad", h5ad_kwargs={"compression": "gzip"}, ).save() Write Zarr with custom chunking settings:: ln.Artifact.from_anndata( adata, key="examples/dataset1.zarr", format="zarr", zarr_kwargs={"chunks": [1024, 1024]}, ).save() No validation and annotation:: ln.Artifact.from_anndata(adata, key="examples/dataset1.h5ad").save() With validation and annotation:: ln.Artifact.from_anndata(adata, key="examples/dataset1.h5ad", schema="ensembl_gene_ids_and_valid_features_in_obs").save() Under the hood, this uses the following built-in schema (:func:`~lamindb.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs`): .. literalinclude:: scripts/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py :language: python This schema transposes the `var` DataFrame during curation, so that one validates and annotates the columns of `var.T`, i.e., `[ENSG00000153563, ENSG00000010610, ENSG00000170458]`. If one doesn't transpose, one would annotate the columns of `var`, i.e., `[gene_symbol, gene_type]`. .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/gLyfToATM7WUzkWW0001.png :width: 800px """ if not data_is_scversedatastructure(adata, "AnnData"): raise ValueError( "data has to be an AnnData object or a path to AnnData-like" ) if schema == "ensembl_gene_ids_and_valid_features_in_obs": from lamindb import examples schema = ( examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs() ) to_disk_kwargs: dict[str, Any] = h5ad_kwargs or zarr_kwargs artifact = Artifact( # type: ignore path=adata, key=key, run=run, description=description, revises=revises, otype="AnnData", kind="dataset", format=format, to_disk_kwargs=to_disk_kwargs, **kwargs, ) # this is done instead of _anndata_n_observations(adata) # because we need a proper path through create_path for cloud paths # for additional upath options etc that create_path adds obj_for_obs: AnnData | UPath if hasattr(artifact, "_memory_rep") and artifact._memory_rep is not None: obj_for_obs = artifact._memory_rep else: # returns ._local_filepath for local files # and the proper path through create_path for cloud paths obj_for_obs = artifact.path from ..core.storage._anndata_accessor import _anndata_n_observations artifact.n_observations = _anndata_n_observations(obj_for_obs) if schema is not None: from ..curators import AnnDataCurator curator = AnnDataCurator(artifact, schema) curator.validate() artifact.schema = schema artifact._curator = curator return artifact @classmethod def from_mudata( cls, mdata: Union[MuData, AnyPathStr], *, key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, schema: Schema | None = None, **kwargs, ) -> Artifact: """Create from `MuData`, optionally validate & annotate. Sets `.otype` to `"MuData"`. Args: mdata: A `MuData` object. key: A relative path within default storage, e.g., `"myfolder/myfile.h5mu"`. description: A description. revises: An old version of the artifact. run: The run that creates the artifact. schema: A schema that defines how to validate & annotate. See Also: :meth:`~lamindb.Collection` Track collections. :class:`~lamindb.Feature` Track features.
Example:: import lamindb as ln mdata = ln.examples.datasets.mudata_papalexi21_subset() artifact = ln.Artifact.from_mudata(mdata, key="mudata_papalexi21_subset.h5mu").save() """ if not data_is_scversedatastructure(mdata, "MuData"): raise ValueError("data has to be a MuData object or a path to MuData-like") artifact = Artifact( # type: ignore path=mdata, key=key, run=run, description=description, revises=revises, otype="MuData", kind="dataset", **kwargs, ) if not isinstance(mdata, (str, Path, UPath)): artifact.n_observations = mdata.n_obs if schema is not None: from ..curators import MuDataCurator curator = MuDataCurator(artifact, schema) curator.validate() artifact.schema = schema artifact._curator = curator return artifact @classmethod def from_spatialdata( cls, sdata: SpatialData | AnyPathStr, *, key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, schema: Schema | None = None, **kwargs, ) -> Artifact: """Create from `SpatialData`, optionally validate & annotate. Sets `.otype` to `"SpatialData"`. Args: sdata: A `SpatialData` object. key: A relative path within default storage, e.g., `"myfolder/myfile.zarr"`. description: A description. revises: An old version of the artifact. run: The run that creates the artifact. schema: A schema that defines how to validate & annotate. See Also: :meth:`~lamindb.Collection` Track collections. :class:`~lamindb.Feature` Track features. Example: No validation and annotation:: import lamindb as ln artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr").save() With validation and annotation. First, find a `SpatialData` schema, e.g.:: ln.Schema.filter(otype="SpatialData").to_dataframe() schema = ln.Schema.get(name="spatialdata_blobs_schema") Then, pass the schema to the `from_spatialdata` method:: artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr", schema=schema).save() You can also define a schema from scratch: .. literalinclude:: scripts/define_schema_spatialdata.py :language: python """ if not data_is_scversedatastructure(sdata, "SpatialData"): raise ValueError( "data has to be a SpatialData object or a path to SpatialData-like" ) artifact = Artifact( # type: ignore path=sdata, key=key, run=run, description=description, revises=revises, otype="SpatialData", kind="dataset", **kwargs, ) # ill-defined https://scverse.zulipchat.com/#narrow/channel/315824-spatial/topic/How.20to.20calculate.20the.20number.20of.20observations.3F # artifact.n_observations = ... if schema is not None: from ..curators import SpatialDataCurator curator = SpatialDataCurator(artifact, schema) curator.validate() artifact.schema = schema artifact._curator = curator return artifact @classmethod def from_tiledbsoma( cls, exp: SOMAExperiment | AnyPathStr, *, key: str | None = None, description: str | None = None, run: Run | None = None, revises: Artifact | None = None, **kwargs, ) -> Artifact: """Create from a `tiledbsoma.Experiment` store. Sets `.otype` to `"tiledbsoma"` and populates `.n_observations`. Args: exp: TileDB-SOMA Experiment object or path to Experiment store. key: A relative path within default storage, e.g., `"myfolder/mystore.tiledbsoma"`. description: A description. revises: An old version of the artifact. run: The run that creates the artifact. 
Example:: import lamindb as ln artifact = ln.Artifact.from_tiledbsoma("s3://mybucket/store.tiledbsoma", description="a tiledbsoma store").save() """ if not data_is_soma_experiment(exp): raise ValueError( "data has to be a SOMA Experiment object or a path to SOMA Experiment store." ) # SOMAExperiment.uri may have file:// prefix for local paths which needs stripping for filesystem access. # Other URI schemes (s3://, etc.) are preserved and supported. exp = ( exp.uri.removeprefix("file://") if not isinstance(exp, (str, Path, UPath)) else exp ) artifact = Artifact( # type: ignore path=exp, key=key, run=run, description=description, revises=revises, otype="tiledbsoma", kind="dataset", **kwargs, ) from ..core.storage._tiledbsoma import _soma_n_observations artifact.n_observations = _soma_n_observations(artifact.path) return artifact @classmethod def from_dir( cls, path: AnyPathStr, *, key: str | None = None, run: Run | None = None, ) -> SQLRecordList: """Create a list of :class:`~lamindb.Artifact` objects from a directory. Hint: If you have a high number of files (several 100k) and don't want to track them individually, create a single :class:`~lamindb.Artifact` via ``Artifact(path)`` for them. See, e.g., :doc:`docs:rxrx`. Args: path: Source path of folder. key: Key for storage destination. If `None` and directory is in a registered location, the inferred `key` will reflect the relative position. If `None` and directory is outside of a registered storage location, the inferred key defaults to `path.name`. run: A `Run` object. Example:: import lamindb as ln dir_path = ln.examples.datasets.dir_scrnaseq_cellranger("sample_001", ln.settings.storage) ln.Artifact.from_dir(dir_path).save() # creates one artifact per file in dir_path """ folderpath: UPath = create_path(path) # returns Path for local storage = settings.storage.record using_key = settings._using_key storage, use_existing_storage = process_pathlike(folderpath, storage, using_key) folder_key_path: PurePath | Path if key is None: if not use_existing_storage: logger.warning( "folder is outside existing storage location, will copy files from" f" {path} to {storage.root}/{folderpath.name}" ) folder_key_path = Path(folderpath.name) else: # maintain the hierachy within an existing storage location folder_key_path = get_relative_path_to_directory( folderpath, UPath(storage.root) ) else: folder_key_path = Path(key) folder_key = folder_key_path.as_posix() # silence fine-grained logging verbosity = settings.verbosity verbosity_int = settings._verbosity_int if verbosity_int >= 1: settings.verbosity = "warning" artifacts_dict = {} for filepath in folderpath.rglob("*"): if filepath.is_file(): relative_path = get_relative_path_to_directory(filepath, folderpath) artifact_key = folder_key + "/" + relative_path.as_posix() # if creating from rglob, we don't need to check for existence artifact = Artifact( filepath, run=run, key=artifact_key, skip_check_exists=True ) artifacts_dict[artifact.uid] = artifact settings.verbosity = verbosity # run sanity check on hashes hashes = [ artifact.hash for artifact in artifacts_dict.values() if artifact.hash is not None ] uids = artifacts_dict.keys() n_unique_hashes = len(set(hashes)) if n_unique_hashes == len(hashes): artifacts = SQLRecordList(artifacts_dict.values()) else: # consider exact duplicates (same id, same hash) # below can't happen anymore because artifacts is a dict now # if len(set(uids)) == len(set(hashes)): # logger.warning("dropping duplicate records in list of artifact records") # artifacts = 
list(set(uids)) # consider false duplicates (different id, same hash) if not len(set(uids)) == n_unique_hashes: seen_hashes = set() non_unique_artifacts = { hash: artifact for hash, artifact in artifacts_dict.items() if artifact.hash in seen_hashes or seen_hashes.add(artifact.hash) # type: ignore } display_non_unique = "\n ".join( f"{artifact}" for artifact in non_unique_artifacts ) logger.warning( "there are multiple artifact uids with the same hashes, dropping" f" {len(non_unique_artifacts)} duplicates out of" f" {len(artifacts_dict)} artifacts:\n {display_non_unique}" ) artifacts = SQLRecordList( [ artifact for artifact in artifacts_dict.values() if artifact not in non_unique_artifacts.values() ] ) logger.success( f"created {len(artifacts)} artifacts from directory using storage" f" {storage.root} and key = {folder_key}/" ) return artifacts def replace( self, data: Union[AnyPathStr, pd.DataFrame, AnnData, MuData], run: Run | bool | None = None, format: str | None = None, ) -> None: """Replace the artifact content in storage **without** making a new version. **Note:** If you want to create a new version, do **not** use the `.replace()` method but rather any `Artifact` constructor. Args: data: A file path or in-memory dataset object like a `DataFrame`, `AnnData`, `MuData`, or `SpatialData`. run: `Run | bool | None = None` The run that creates the artifact. If `False`, suppress tracking the run. If `None`, infer the run from the global run context. format: `str | None = None` The format of the data to write into storage. If `None`, infer the format from the data. Example: Query a text file and replace its content:: artifact = ln.Artifact.get(key="my_file.txt") artifact.replace("./my_new_file.txt") artifact.save() Note that you need to call `.save()` to persist the changes in storage. """ storage = settings.storage.record run = get_run(run) kwargs, privates = get_artifact_kwargs_from_data( provisional_uid=self.uid, data=data, key=self.key, run=run, format=format, storage=storage, version_tag=None, is_replace=True, ) # this artifact already exists if isinstance(kwargs, Artifact): return kwargs check_path_in_storage = privates["check_path_in_storage"] if check_path_in_storage: err_msg = ( "Can only replace with a local path not in any Storage. " f"This data is in {Storage.objects.get(id=kwargs['storage_id'])}." ) raise ValueError(err_msg) _overwrite_versions = kwargs["_overwrite_versions"] if self._overwrite_versions != _overwrite_versions: err_msg = "It is not allowed to replace " err_msg += "a folder" if self._overwrite_versions else "a file" err_msg += " with " + ("a folder." 
if _overwrite_versions else "a file.") raise ValueError(err_msg) new_suffix = kwargs["suffix"] if new_suffix != self.suffix: key = self.key real_key = self._real_key if key is not None: new_key = PurePosixPath(key).with_suffix(new_suffix).as_posix() else: new_key = None if (key is not None and not self._key_is_virtual) or real_key is not None: # real_key is not None implies key is not None assert key is not None # noqa: S101 if real_key is not None: self._clear_storagekey = real_key self._real_key = ( PurePosixPath(real_key).with_suffix(new_suffix).as_posix() ) warn_msg = f", _real_key '{real_key}' with '{self._real_key}'" else: self._clear_storagekey = key warn_msg = "" self.key = new_key self._original_values["key"] = new_key logger.warning( f"replacing the file will replace key '{key}' with '{new_key}'{warn_msg}" f" and delete '{self._clear_storagekey}' upon `save()`" ) else: # purely virtual key case self._clear_storagekey = _s().auto_storage_key_from_artifact(self) # might replace None with None, not a big deal self.key = new_key self._original_values["key"] = new_key self.suffix = new_suffix self.size = kwargs["size"] self.hash = kwargs["hash"] self._hash_type = kwargs["_hash_type"] self.run_id = kwargs["run_id"] self.run = kwargs["run"] self.n_files = kwargs["n_files"] self._local_filepath = privates["local_filepath"] self._cloud_filepath = privates["cloud_filepath"] self._memory_rep = privates["memory_rep"] # no need to upload if new file is already in storage self._to_store = not check_path_in_storage # update old suffix with the new one so that the check in artifact save pass # replace() supports changing the suffix self._original_values["suffix"] = self.suffix def open( self, mode: str = "r", engine: Literal["pyarrow", "polars"] = "pyarrow", is_run_input: bool | None = None, **kwargs, ) -> ( PyArrowDataset # PolarsLazyFrame does not implement the context manager protocol hence we need `Iterator` in the type annotation | Iterator[ PolarsLazyFrame ] # note that intersphinx doesn't work for this, hence manual docs link: https://github.com/laminlabs/lamindb/issues/2736#issuecomment-3703889524 | AnnDataAccessor # AnnDataAccessor implements the context manager protocol | SpatialDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment | SOMAMeasurement ): """Open a dataset for streaming. Works for the following object types (storage formats): - `DataFrame` (`.parquet`, `.csv`, `.ipc` files or directories with such files) - `AnnData` (`.h5ad`, `.zarr`) - `SpatialData` (`.zarr`) - `tiledbsoma` (`.tiledbsoma`) - generic arrays (`.h5`, `.zarr`) Args: mode: can be `"r"` or `"w"` (write mode) for `tiledbsoma` stores, `"r"` or `"r+"` for `AnnData` or `SpatialData` `zarr` stores, otherwise should be always `"r"` (read-only mode). engine: Which module to use for lazy loading of a dataframe from `pyarrow` or `polars` compatible formats. This has no effect if the artifact is not a dataframe, i.e. if it is an `AnnData,` `hdf5`, `zarr`, `tiledbsoma` object etc. is_run_input: Whether to track this artifact as run input. **kwargs: Keyword arguments for the accessor, i.e. `h5py` or `zarr` connection, `pyarrow.dataset.dataset`, `polars.scan_*` function. 
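A minimal sketch of forwarding such keyword arguments, assuming a CSV-backed artifact; the `infer_schema_length` option belongs to `polars.scan_csv` and is used purely for illustration::

    artifact = ln.Artifact.get(key="sequences/mydataset.csv")
    with artifact.open(engine="polars", infer_schema_length=None) as lf:
        print(lf.head().collect())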
Returns: Streaming accessors, in particular, a :class:`pyarrow:pyarrow.dataset.Dataset` object, a context manager yielding a `polars.LazyFrame `__, and objects of type :class:`~lamindb.core.storage.AnnDataAccessor`, :class:`~lamindb.core.storage.SpatialDataAccessor`, :class:`~lamindb.core.storage.BackedAccessor`, :class:`tiledbsoma:tiledbsoma.Collection`, :class:`tiledbsoma.Experiment`, :class:`tiledbsoma.Measurement`. Note: For TileDB-SOMA stores on S3 with federated credentials, credentials are updated only when the storage is opened, not while the store handle is held open. If credentials expire during a long-lived session, close the store and open it again to refresh. Examples: Open a `DataFrame`-like artifact via :class:`pyarrow:pyarrow.dataset.Dataset`:: artifact = ln.Artifact.get(key="sequences/mydataset.parquet") artifact.open() #> pyarrow._dataset.FileSystemDataset Open a `DataFrame`-like artifact via `polars.LazyFrame `__:: artifact = ln.Artifact.get(key="sequences/mydataset.parquet") with artifact.open(engine="polars") as df: # use the `polars.LazyFrame` object similar to a `DataFrame` object Open an `AnnData`-like artifact via :class:`~lamindb.core.storage.AnnDataAccessor`:: import lamindb as ln artifact = ln.Artifact.get(key="scrna/mydataset.h5ad") with artifact.open() as adata: # use the `AnnDataAccessor` similar to an `AnnData` object For more examples and background, see guide: :doc:`/arrays`. """ from ..core.storage._backed_access import _track_writes_factory, backed_access from ..core.storage._polars_lazy_df import POLARS_SUFFIXES from ..core.storage._pyarrow_dataset import PYARROW_SUFFIXES if self._overwrite_versions and not self.is_latest: raise ValueError(OUTDATED_ARTIFACT_FILES_OVERWRITTEN_MSG) # all hdf5 suffixes including gzipped h5_suffixes = [".h5", ".hdf5", ".h5ad"] h5_gz_suffixes = [] for s in h5_suffixes: h5_gz_suffixes += [s, s + ".gz", s + ".tar.gz"] # ignore empty suffix for now df_suffixes = tuple(set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES)) suffixes = ( ( "", ".zarr", ".anndata.zarr", ".tiledbsoma", ) + tuple(h5_gz_suffixes) + df_suffixes ) suffix = self.suffix if suffix not in suffixes: raise ValueError( "Artifact should have a zarr, h5, tiledbsoma object" " or a compatible `pyarrow.dataset.dataset` or `polars.scan_*` directory" " as the underlying data, please use one of the following suffixes" f" for the object name: {', '.join(suffixes[1:])}." f" Or no suffix for a folder with {', '.join(df_suffixes)} files" " (no mixing allowed)." ) using_key = settings._using_key filepath, cache_key = _s().filepath_cache_key_from_artifact( self, using_key=using_key ) is_tiledbsoma_w = ( filepath.name == "soma" or suffix == ".tiledbsoma" ) and mode == "w" is_zarr_w = suffix == ".zarr" and mode == "r+" if mode != "r": if not (is_tiledbsoma_w or is_zarr_w): raise ValueError( f"It is not allowed to open a {suffix} object with `mode='{mode}'`. " "You can open all supported formats with `mode='r'`, " "a tiledbsoma store with `mode='w'`, " "AnnData or SpatialData zarr store with `mode='r+'`." ) elif not self.overwrite_versions: raise ValueError( "It is not possible to open artifacts having `overwrite_versions=False` " "in non-read mode (other than `mode='r'`)." 
) # consider the case where an object is already locally cached localpath = setup_settings.paths.cloud_to_local_no_update( filepath, cache_key=cache_key ) if is_tiledbsoma_w or is_zarr_w: open_cache = False else: open_cache = not isinstance( filepath, LocalPathClasses ) and not filepath.synchronize_to(localpath, just_check=True) if open_cache: try: access = backed_access( localpath, mode, engine, using_key=using_key, **kwargs ) except Exception as e: # also ignore ValueError here because # such errors most probably just imply an incorrect argument if isinstance(e, (ImportError, ValueError)) or isinstance( filepath, LocalPathClasses ): raise e logger.warning( f"The cache might be corrupted: {e}. Trying to open directly." ) access = backed_access( filepath, mode, engine, using_key=using_key, **kwargs ) # happens only if backed_access has been successful # delete the corrupted cache if localpath.is_dir(): shutil.rmtree(localpath) else: localpath.unlink(missing_ok=True) else: access = backed_access(self, mode, engine, using_key=using_key, **kwargs) if is_tiledbsoma_w: def finalize(): nonlocal self, filepath, localpath if not isinstance(filepath, LocalPathClasses): _, hash, _, _ = get_stat_dir_cloud(filepath) else: # this can be very slow _, hash, _, _ = hash_dir(filepath) if self.hash != hash: from .sqlrecord import init_self_from_db new_version = Artifact( filepath, revises=self, _is_internal_call=True ).save() # note: sets _state.db = "default" init_self_from_db(self, new_version) if localpath != filepath and localpath.exists(): shutil.rmtree(localpath) access = _track_writes_factory(access, finalize) # only call if open is successful track_run_input(self, is_run_input) return access def load( self, *, is_run_input: bool | None = None, mute: bool = False, **kwargs ) -> ( pd.DataFrame | ScverseDataStructures | dict[str, Any] | list[Any] | AnyPathStr | None ): """Cache artifact in local cache and then load it into memory. See: :mod:`~lamindb.core.loaders`. Args: is_run_input: Whether to track this artifact as run input. mute: Silence logging of caching progress. **kwargs: Keyword arguments for the loader. Examples: Load a `DataFrame`-like artifact:: df = artifact.load() Load an `AnnData`-like artifact:: adata = artifact.load() """ from ..core.loaders import load_to_memory if self._overwrite_versions and not self.is_latest: raise ValueError(OUTDATED_ARTIFACT_FILES_OVERWRITTEN_MSG) if hasattr(self, "_memory_rep") and self._memory_rep is not None: access_memory = self._memory_rep # SpatialData objects' zarr stores are moved when saved # SpatialData's __repr__ method attempts to access information from the old path # Therefore, we need to update the in-memory path to the now moved Artifact storage path if access_memory.__class__.__name__ == "SpatialData": access_memory.path = self._cache_path else: filepath, cache_key = _s().filepath_cache_key_from_artifact( self, using_key=settings._using_key ) cache_path = _synchronize_cleanup_on_error( filepath, cache_key=cache_key, print_progress=not mute ) try: # cache_path is local so doesn't trigger any sync in load_to_memory access_memory = load_to_memory(cache_path, **kwargs) except Exception as e: # raise the exception if it comes from not having a correct loader # import error is also most probably not a problem with the cache # or if the original path is local if isinstance(e, (NotImplementedError, ImportError)) or isinstance( filepath, LocalPathClasses ): raise e logger.warning( f"The cache might be corrupted: {e}. Retrying to synchronize."
) # delete the existing cache if cache_path.is_dir(): shutil.rmtree(cache_path) else: cache_path.unlink(missing_ok=True) # download again and try to load into memory cache_path = _synchronize_cleanup_on_error( filepath, cache_key=cache_key, print_progress=not mute ) access_memory = load_to_memory(cache_path, **kwargs) # only call if load is successful track_run_input(self, is_run_input) return access_memory def cache( self, *, is_run_input: bool | None = None, mute: bool = False, **kwargs ) -> UPath: """Download cloud artifact to local cache. Follows syncing logic: only caches an artifact if it's outdated in the local cache. Returns a path to a locally cached on-disk object (say a `.jpg` file). Args: mute: Silence logging of caching progress. is_run_input: Whether to track this artifact as run input. Example: Sync the artifact from the cloud and return the local path to the cached file:: artifact.cache() #> PosixPath('/home/runner/work/Caches/lamindb/lamindata/pbmc68k.h5ad') """ if self._overwrite_versions and not self.is_latest: raise ValueError(OUTDATED_ARTIFACT_FILES_OVERWRITTEN_MSG) filepath, cache_key = _s().filepath_cache_key_from_artifact( self, using_key=settings._using_key ) if mute: kwargs["print_progress"] = False cache_path = _synchronize_cleanup_on_error( filepath, cache_key=cache_key, **kwargs ) # only call if sync is successful track_run_input(self, is_run_input) return cache_path def delete( self, permanent: bool | None = None, storage: bool | None = None, using_key: str | None = None, ) -> None: """Trash or permanently delete. A first call to `.delete()` puts an artifact into the trash (sets `branch_id` to `-1`). A second call permanently deletes the artifact. For an artifact that has multiple versions and for which `artifact.overwrite_versions` is `True` (the default behavior for folders), deleting a non-latest version will not delete the underlying storage unless `storage=True` is passed. Deleting the latest version will delete all versions. Args: permanent: Permanently delete the artifact (skip trash). storage: Indicate whether you want to delete the artifact in storage. Examples: Delete a single file artifact:: import lamindb as ln artifact = ln.Artifact.get(key="some.csv") artifact.delete() # delete a single file artifact Delete an old version of a folder-like artifact:: artifact = ln.Artifact.filter(key="folder.zarr", is_latest=False).first() artifact.delete() # delete an old version, the data will not be deleted Delete all versions of a folder-like artifact:: artifact = ln.Artifact.get(key="folder.zarr", is_latest=True) artifact.delete() # delete all versions, the data will be deleted or you will be prompted for deletion. """ super().delete(permanent=permanent, storage=storage, using_key=using_key) # TODO: consider renaming the transfer argument to sync def save( self, upload: bool | None = None, transfer: Literal["record", "annotations"] = "record", **kwargs, ) -> Artifact: """Save to database & storage. Args: upload: Trigger upload to cloud storage in instances with hybrid storage mode. transfer: In case the artifact was queried on a different instance, dictates the behavior of the sync. If "record", only the artifact record is synced to the current instance. If "annotations", also the annotations linked in the source instance are synced.
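A minimal sketch of the `transfer` argument, assuming the artifact was queried from another instance (the slug `account/source-instance` is a placeholder)::

    artifact = ln.Artifact.using("account/source-instance").get(key="data.parquet")
    artifact.save(transfer="annotations")  # also sync the linked annotations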
See Also: :doc:`sync` Example: Save a file-like artifact after creating it with the default constructor `Artifact()`:: import lamindb as ln artifact = ln.Artifact("./myfile.csv", key="myfile.csv").save() """ if ( not self._state.adding # skip on is_latest change # no need to check if saved because it is checked above and not self._field_changed("is_latest", check_is_saved=False) and not self.is_latest and self.branch_id != -1 # skip on soft deletion ): logger.warning("you are saving to a non-latest version of the artifact") access_token = kwargs.pop("access_token", None) current_instance_uid = setup_settings.instance.uid artifact_storage = self.storage artifact_storage_instance_uid = artifact_storage.instance_uid is_not_artifact_storage_managed_by_current_instance = ( artifact_storage_instance_uid != current_instance_uid ) if self._field_changed("key", check_is_saved=False): new_key = self.key if new_key is None: raise InvalidArgument("Cannot update an artifact key to None.") new_key_suffix = extract_suffix_from_path( PurePosixPath(new_key), arg_name="key" ) if new_key_suffix != self.suffix: raise InvalidArgument( f"The suffix '{new_key_suffix}' of the provided key is incorrect, it should be '{self.suffix}'." ) # Virtual key updates are metadata-only because physical storage keys are # uid-based. if self._key_is_virtual: self._original_values["key"] = new_key else: if self._state.adding: raise InvalidArgument( "Cannot update the key of an artifact before it is saved." ) if is_not_artifact_storage_managed_by_current_instance: raise InvalidArgument( "Cannot update a non-virtual key of an artifact" " in a storage location that is not managed by the current instance." ) old_key = self._original_values["key"] if old_key is None: raise InvalidArgument( "Cannot update a non-virtual artifact key from None." ) if not _handle_non_virtual_key_change_on_save( self, old_key=old_key, new_key=new_key ): return None if self._field_changed("suffix", check_is_saved=False): if self._state.adding: raise InvalidArgument( "Cannot update the suffix of an artifact before it is saved." ) if is_not_artifact_storage_managed_by_current_instance: raise InvalidArgument( "Cannot update the suffix of an artifact" " in a storage location that is not managed by the current instance." ) if not _handle_suffix_change_on_save(self): return None # when space is passed in init, storage is ignored, so space - storage consistency is enforced there if ( self._field_changed("space_id") # here we check for storages managed by any instance # not necessarily with managed credentials # we check further below if the artifact storage is managed by the current instance and artifact_storage_instance_uid is not None ): if is_not_artifact_storage_managed_by_current_instance: raise ValueError( "Cannot change the space of an artifact" " in a storage location that is not managed by the current instance." ) space = self.space storage_type = artifact_storage.type storages = Storage.connect(self._state.db).filter( space=space, instance_uid=current_instance_uid, type=storage_type ) n_storages = storages.count() if n_storages == 0: raise ValueError( f"No {storage_type} storage locations managed by the current instance found for the space '{space.name}'."
) elif n_storages > 1: storages = storages.order_by("id") roots_str = "\n".join( f"{i}: {storage.root}" for i, storage in enumerate(storages) ) choice = input( f"Select a storage location of type '{storage_type}' from the target space '{space.name}':" f" \n{roots_str}\n" "Enter the number or 'x' to cancel: " ) if choice == "x": logger.warning("saving was cancelled") return None storage = storages[int(choice)] else: storage = storages.one() if artifact_storage != storage: # try to transfer if both storages are writable / managed by an instance # replaces artifact.storage with the new storage if successful _move_artifact_to_storage(self, storage, access_token=access_token) else: logger.important("artifact is already in the target storage location") # Keep tracked values in sync after handling a space update so # repeated saves don't keep re-running this branch. self._original_values["space_id"] = self.space_id if transfer not in {"record", "annotations"}: raise ValueError( f"transfer should be either 'record' or 'annotations', not {transfer}" ) else: kwargs["transfer"] = transfer state_was_adding = self._state.adding print_progress = kwargs.pop("print_progress", True) store_kwargs = kwargs.pop( "store_kwargs", {} ) # kwargs for .upload_from in the end local_path = None if upload and setup_settings.instance.keep_artifacts_local: # switch local storage location to cloud local_path = self.path self.storage_id = setup_settings.instance.storage._id self._local_filepath = local_path # switch to virtual storage key upon upload # the local filepath is already cached at that point self._key_is_virtual = True # ensure that the artifact is uploaded self._to_store = True local_filepath = getattr(self, "_local_filepath", None) has_local_filepath = local_filepath is not None if has_local_filepath and not local_filepath.exists(): raise FileNotFoundError( f"Unable to save the artifact because the local path {local_filepath} does not exist." ) flag_complete = has_local_filepath and getattr(self, "_to_store", False) if flag_complete: if is_not_artifact_storage_managed_by_current_instance: raise ValueError( "Cannot save an artifact to a storage location that is not managed by the current instance." ) # _storage_ongoing indicates whether the storage saving / upload process is ongoing self._storage_ongoing = True # will be updated to False once complete self._save_skip_storage(**kwargs) using_key = None if "using" in kwargs: using_key = kwargs["using"] exception_upload = check_and_attempt_upload( self, using_key, access_token=access_token, print_progress=print_progress, **store_kwargs, ) if exception_upload is not None: # we do not want to raise file not found on cleanup if upload of a file failed # often it is ACID in the filesystem itself # for example, s3 won't have the failed file, so just skip the delete in this case raise_file_not_found_error = False self._delete_skip_storage() else: # this is the case when it is cleaned on .replace raise_file_not_found_error = True # this is triggered by an exception in check_and_attempt_upload or by replace. 
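# clearing deletes any storage object scheduled for removal, e.g. the old key that
# `.replace()` stored in `_clear_storagekey`; exceptions from upload and clearing are
# collected first and only re-raised after both steps have run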
exception_clear = check_and_attempt_clearing( self, raise_file_not_found_error=raise_file_not_found_error, using_key=using_key, ) if exception_upload is not None: raise exception_upload if exception_clear is not None: raise exception_clear # the saving / upload process has been successful if flag_complete: self._storage_ongoing = False # pass kwargs below because it can contain `using` or other things # affecting the connection super().save(**kwargs) # this is only for keep_artifacts_local if local_path is not None and not state_was_adding: # only move the local artifact to cache if it was not newly created local_path_cache = ln_setup.settings.cache_dir / local_path.name # don't use Path.rename here because of cross-device link error # https://laminlabs.slack.com/archives/C04A0RMA0SC/p1710259102686969 shutil.move( local_path, # type: ignore local_path_cache, ) logger.important(f"moved local artifact to cache: {local_path_cache}") # annotate with external features if hasattr(self, "_external_features"): external_features = self._external_features self.features.set_values(external_features) # annotate with internal features based on curator if hasattr(self, "_curator"): curator = self._curator del self._curator # just annotates this artifact curator.save_artifact() if hasattr(self, "_external_features"): del self._external_features if hasattr(self, "_local_filepath"): del self._local_filepath return self def _update_artifact_keys_with_suffix(artifact: Artifact, suffix: str): key = artifact.key real_key = artifact._real_key if key is not None: new_key = PurePosixPath(key).with_suffix(suffix).as_posix() artifact.key = new_key if real_key is not None: artifact._real_key = PurePosixPath(real_key).with_suffix(suffix).as_posix() def _confirm_artifact_move(source_path_str: str, target_path_str: str) -> bool: # ask for confirmation # TODO: add a way to disable confirmation response = input( f"You are about to move artifact from '{source_path_str}' to '{target_path_str}'.\n" "Continue? (y/n) " ) if response != "y": logger.warning("saving was cancelled") return False return True def _handle_non_virtual_key_change_on_save( artifact: Artifact, *, old_key: str, new_key: str ) -> bool: # _real_key should actually be None here because it goes with virtual key source_storage_key = ( artifact._real_key if artifact._real_key is not None else old_key ) source_path = artifact.storage.path / source_storage_key # key was updated, so artifact.path is the new path target_path_str = artifact.path.as_posix() source_path_str = source_path.as_posix() if not _confirm_artifact_move(source_path_str, target_path_str): return False _safe_move(source_path.fs, source_path_str, target_path_str) if artifact._real_key is not None: artifact._real_key = new_key # Keep tracked values in sync so repeated saves don't trigger another move. artifact._original_values["key"] = new_key # If key change already applied the suffix transition, skip suffix handling below. 
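# keeping `_original_values["suffix"]` in sync here is what lets the suffix-change
# check in save() see no pending change, so the suffix handler below is not run again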
artifact._original_values["suffix"] = artifact.suffix return True def _handle_suffix_change_on_save(artifact: Artifact) -> bool: suffix = artifact.suffix # depends on whether key is virtual or real key is present source_or_target_path = artifact.path source_path_str = source_or_target_path.with_suffix( artifact._original_values["suffix"] ).as_posix() target_path_str = source_or_target_path.with_suffix(suffix).as_posix() if not _confirm_artifact_move(source_path_str, target_path_str): return False # source_path and target_path are on the same filesystem _safe_move(source_or_target_path.fs, source_path_str, target_path_str) _update_artifact_keys_with_suffix(artifact, suffix) # Keep tracked values in sync so consecutive suffix updates on the same # in-memory instance trigger a move each time. artifact._original_values["suffix"] = suffix artifact._original_values["key"] = artifact.key return True def _sorted_sizes(fs: AbstractFileSystem, path: str) -> list[int]: objects = fs.find(path, detail=True) return sorted(info["size"] for info in objects.values()) def _rm_catch_error(fs: AbstractFileSystem, path: str) -> Exception | None: if fs.exists(path): try: fs.rm(path, recursive=True) except Exception as rm_exc: return rm_exc return None def _safe_move(fs: AbstractFileSystem, source: str, target: str): if fs.exists(target): raise FileExistsError( f"Cannot move artifact to '{target}' because it already exists." ) logger.important(f"moving artifact from '{source}' to '{target}'") try: fs.copy(source, target, recursive=True) except Exception as e: message = "Failed to copy artifact to target storage during transfer." cleanup_error = _rm_catch_error(fs, target) if cleanup_error is not None: message += f" Cleanup of copied target also failed: {cleanup_error}" raise RuntimeError(message) from e # check that the sizes of the files are the same if _sorted_sizes(fs, source) != _sorted_sizes(fs, target): message = "Move verification failed: copied artifact does not match source." cleanup_error = _rm_catch_error(fs, target) if cleanup_error is not None: message += " Cleanup of copied target also failed." 
raise RuntimeError(message) from cleanup_error try: fs.rm(source, recursive=True) except Exception as e: logger.error( f"copying to '{target}' succeeded but failed to remove source '{source}': {e}" ) def _move_artifact_to_storage( artifact: Artifact, storage: Storage, access_token: str | None = None ): storage_key = _s().auto_storage_key_from_artifact(artifact) source_path = artifact.path target_path = storage.path / storage_key if source_path == target_path: raise ValueError("Cannot move to the same path.") fs = fs_for_moving(source_path, target_path, access_token=access_token) source_path_str = str(source_path) target_path_str = str(target_path) _safe_move(fs, source_path_str, target_path_str) artifact.storage_id = storage.id # can't really just call .cache in .load because of double tracking def _synchronize_cleanup_on_error( filepath: UPath, cache_key: str | None = None, **kwargs ) -> UPath: try: print_progress = kwargs.pop("print_progress", True) cache_path = setup_settings.paths.cloud_to_local( filepath, cache_key=cache_key, print_progress=print_progress, **kwargs ) except Exception as e: if not isinstance(filepath, LocalPathClasses): cache_path = setup_settings.paths.cloud_to_local_no_update( filepath, cache_key=cache_key ) if cache_path.is_dir(): shutil.rmtree(cache_path) else: cache_path.unlink(missing_ok=True) raise e return cache_path def _delete_skip_storage(artifact, *args, **kwargs) -> None: super(SQLRecord, artifact).delete(*args, **kwargs) def _save_skip_storage(artifact, **kwargs) -> None: save_staged_schemas(artifact) super(Artifact, artifact).save(**kwargs) save_schema_links(artifact) class ArtifactJsonValue(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_jsonvalue") # we follow the lower() case convention rather than snake case for link models jsonvalue: JsonValue = ForeignKey(JsonValue, PROTECT, related_name="links_artifact") class Meta: app_label = "lamindb" unique_together = ("artifact", "jsonvalue") class ArtifactUser(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey("Artifact", CASCADE, related_name="links_user") user: User = ForeignKey(User, PROTECT, related_name="links_artifact") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, related_name="links_artifactuser", default=None ) class Meta: # can have the same label linked to the same artifact if the feature is # different app_label = "lamindb" unique_together = ("artifact", "user", "feature") class ArtifactRun(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey("Artifact", CASCADE, related_name="links_run") # consciously choosing CASCADE run: Run = ForeignKey(Run, CASCADE, related_name="links_artifact") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, related_name="links_artifactrun", default=None ) class Meta: # can have the same label linked to the same artifact if the feature is # different app_label = "lamindb" unique_together = ("artifact", "run", "feature") class ArtifactArtifact(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey("Artifact", CASCADE, related_name="links_artifact") # consciously choosing CASCADE value: Artifact = ForeignKey("Artifact", CASCADE, related_name="links_value") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, 
related_name="links_artifactartifact", default=None ) class Meta: # can have the same label linked to the same artifact if the feature is # different app_label = "lamindb" unique_together = ("artifact", "value", "feature") def track_run_input( record: ( Artifact | Iterable[Artifact] ), # can also be Collection | Iterable[Collection] is_run_input: bool | Run | None = None, run: Run | None = None, ) -> None: """Links a record as an input to a run. This function contains all validation logic to make decisions on whether a record qualifies as an input or not. """ if is_run_input is False: return None from ..core._context import context from ..core._functions import get_current_tracked_run from .collection import Collection if isinstance(is_run_input, Run): run = is_run_input is_run_input = True elif run is None: run = get_current_tracked_run() if run is None: run = context.run # consider that record is an iterable of Data record_iter: Iterable[Artifact] | Iterable[Collection] = ( [record] if isinstance(record, (Artifact, Collection)) else record ) input_records = [] if run is not None: assert not run._state.adding, "Save the run before tracking its inputs." # noqa: S101 def is_valid_input(record: Artifact | Collection): is_valid = False # if a record is not yet saved it has record._state.db = None # then it can't be an input # we silently ignore because what will happen is that # the record either gets saved and then is tracked as an output # or it won't get saved at all if record._state.db == "default": # things are OK if the record is on the default db is_valid = True else: # record is on another db # we have to save the record into the current db with # the run being attached to a transfer transform logger.info( f"completing transfer to track {record.__class__.__name__}('{record.uid}') as input" ) record.save() is_valid = True # avoid cycles: record can't be both input and output if record.run_id == run.id: logger.debug( f"not tracking {record} as input to run {run} because created by same run" ) is_valid = False if run.id == getattr(record, "_subsequent_run_id", None): logger.debug( f"not tracking {record} as input to run {run} because re-created in same run" ) is_valid = False return is_valid input_records = [record for record in record_iter if is_valid_input(record)] input_records_ids = [record.id for record in input_records] if input_records: record_class_name = input_records[0].__class__.__name__.lower() # let us first look at the case in which the user does not # provide a boolean value for `is_run_input` # hence, we need to determine whether we actually want to # track a run or not track = False is_run_input = settings.track_run_inputs if is_run_input is None else is_run_input if is_run_input: if run is None: isettings = setup_settings.instance if not (isettings._is_clone or isettings.is_read_only_connection): logger.warning(WARNING_NO_INPUT) elif input_records: logger.debug( f"adding {record_class_name} ids {input_records_ids} as inputs for run {run.id}" ) track = True else: track = is_run_input if not track or not input_records: return None if run is None: raise ValueError("No run context set. 
Call `ln.track()`.") if record_class_name == "artifact": IsLink = run.input_artifacts.through links = [ IsLink(run_id=run.id, artifact_id=record_id) for record_id in input_records_ids ] else: IsLink = run.input_collections.through links = [ IsLink(run_id=run.id, collection_id=record_id) for record_id in input_records_ids ] try: IsLink.objects.bulk_create(links, ignore_conflicts=True) except ProgrammingError as e: if "new row violates row-level security policy" in str(e): instance = setup_settings.instance available_spaces = instance.available_spaces if available_spaces is None: raise NoWriteAccess( f"You’re not allowed to write to the instance {instance.slug}.\n" "Please contact administrators of the instance if you need write access." ) from None write_access_spaces = available_spaces["admin"] + available_spaces["write"] no_write_access_spaces = { record_space for record in input_records if (record_space := record.space) not in write_access_spaces } if (run_space := run.space) not in write_access_spaces: no_write_access_spaces.add(run_space) if not no_write_access_spaces: # if there are no unavailable spaces, then this should be due to locking locked_records = [ record for record in input_records if getattr(record, "is_locked", False) ] if run.is_locked: locked_records.append(run) # if no unavailable spaces and no locked records, just raise the original error if not locked_records: raise e no_write_msg = ( "It is not allowed to modify locked records: " + ", ".join( r.__class__.__name__ + f"(uid={r.uid})" for r in locked_records ) + "." ) raise NoWriteAccess(no_write_msg) from None if len(no_write_access_spaces) > 1: name_msg = ", ".join( f"'{space.name}'" for space in no_write_access_spaces ) space_msg = "spaces" else: name_msg = f"'{no_write_access_spaces.pop().name}'" space_msg = "space" raise NoWriteAccess( f"You’re not allowed to write to the {space_msg} {name_msg}.\n" f"Please contact administrators of the {space_msg} if you need write access." ) from None else: raise e # privates currently dealt with separately # mypy: ignore-errors Artifact._delete_skip_storage = _delete_skip_storage Artifact._save_skip_storage = _save_skip_storage Artifact.view_lineage = view_lineage # PostgreSQL migration helper for _save_completed to _aux["storage_completed"] def migrate_save_completed_to_aux_postgres(schema_editor) -> None: """Migrate _save_completed field to _aux['storage_completed'] using PostgreSQL raw SQL. This migrates _save_completed=False into _aux['storage_completed']=false. _save_completed=True results in no change to _aux (empty JSON is the default). 
""" schema_editor.execute(""" UPDATE lamindb_artifact SET _aux = CASE WHEN _save_completed = FALSE THEN CASE WHEN _aux IS NULL THEN jsonb_build_object('storage_completed', false) ELSE _aux || jsonb_build_object('storage_completed', false) END ELSE _aux END, _save_completed = NULL WHERE _save_completed IS NOT NULL """) ================================================ FILE: lamindb/models/artifact_set.py ================================================ from __future__ import annotations from collections.abc import Iterable, Iterator from typing import TYPE_CHECKING, Literal from django.db.models import Case, Q, TextField, Value, When from django.db.models.functions import Concat from lamin_utils import logger from lamindb_setup.core._docs import doc_args from upath import UPath from .artifact import Artifact, track_run_input from .collection import Collection, _load_concat_artifacts if TYPE_CHECKING: from anndata import AnnData from lamindb_setup.types import AnyPathStr from pandas import DataFrame from polars import LazyFrame as PolarsLazyFrame from pyarrow.dataset import Dataset as PyArrowDataset from ..core._mapped_collection import MappedCollection UNORDERED_WARNING = ( "this query set is unordered, consider using `.order_by()` first " "to avoid opening the artifacts in an arbitrary order" ) # maybe make this abstract class ArtifactSet(Iterable): """Abstract class representing sets of artifacts returned by queries. This class automatically extends :class:`~lamindb.models.BasicQuerySet` and :class:`~lamindb.models.QuerySet` when the base model is :class:`~lamindb.Artifact`. Examples: >>> artifacts = ln.Artifact.filter(otype="AnnData") >>> artifacts # an instance of ArtifactQuerySet inheriting from ArtifactSet """ @doc_args(Collection.load.__doc__) def load( self, join: Literal["inner", "outer"] = "outer", is_run_input: bool | None = None, **kwargs, ) -> DataFrame | AnnData: """{}""" # noqa: D415 if not self.ordered: # type: ignore logger.warning(UNORDERED_WARNING) artifacts: list[Artifact] = list(self) concat_object = _load_concat_artifacts(artifacts, join, **kwargs) # track only if successful track_run_input(artifacts, is_run_input) return concat_object @doc_args(Collection.open.__doc__) def open( self, engine: Literal["pyarrow", "polars"] = "pyarrow", is_run_input: bool | None = None, **kwargs, ) -> PyArrowDataset | Iterator[PolarsLazyFrame]: """{}""" # noqa: D415 from ..core.storage._backed_access import _open_dataframe if not self.ordered: # type: ignore logger.warning(UNORDERED_WARNING) artifacts: list[Artifact] = list(self) paths: list[UPath] = [artifact.path for artifact in artifacts] dataframe = _open_dataframe(paths, engine=engine, **kwargs) # track only if successful track_run_input(artifacts, is_run_input) return dataframe @doc_args(Collection.mapped.__doc__) def mapped( self, layers_keys: str | list[str] | None = None, obs_keys: str | list[str] | None = None, obsm_keys: str | list[str] | None = None, obs_filter: dict[str, str | list[str]] | None = None, join: Literal["inner", "outer"] | None = "inner", encode_labels: bool | list[str] = True, unknown_label: str | dict[str, str] | None = None, cache_categories: bool = True, parallel: bool = False, dtype: str | None = None, stream: bool = False, is_run_input: bool | None = None, ) -> MappedCollection: """{}""" # noqa: D415 from ..core._mapped_collection import MappedCollection if not self.ordered: # type: ignore logger.warning(UNORDERED_WARNING) artifacts: list[Artifact] = [] paths: list[UPath] = [] for artifact in self: if 
".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix: logger.warning(f"ignoring artifact with suffix {artifact.suffix}") continue elif not stream: paths.append(artifact.cache()) else: paths.append(artifact.path) artifacts.append(artifact) ds = MappedCollection( paths, layers_keys, obs_keys, obsm_keys, obs_filter, join, encode_labels, unknown_label, cache_categories, parallel, dtype, ) # track only if successful track_run_input(artifacts, is_run_input) return ds def artifacts_from_path(artifacts: ArtifactSet, path: AnyPathStr) -> ArtifactSet: """Returns artifacts in the query set that are registered for the provided path.""" from lamindb.models import BasicQuerySet, QuerySet # not QuerySet but only BasicQuerySet assert isinstance(artifacts, BasicQuerySet) and not isinstance(artifacts, QuerySet) # noqa: S101 upath = UPath(path) path_str = upath.as_posix() stem = upath.stem stem_len = len(stem) if stem_len == 16: qs = artifacts.filter( Q(_key_is_virtual=True) | Q(key__isnull=True), _real_key__isnull=True, uid__startswith=stem, ) elif stem_len == 20: qs = artifacts.filter( Q(_key_is_virtual=True) | Q(key__isnull=True), _real_key__isnull=True, uid=stem, ) else: qs = None if qs: # an empty query set evaluates to False return qs qs = ( artifacts.filter(Q(_key_is_virtual=False) | Q(_real_key__isnull=False)) .alias( db_path=Case( When( _real_key__isnull=False, then=Concat( "storage__root", Value("/"), "_real_key", output_field=TextField(), ), ), default=Concat( "storage__root", Value("/"), "key", output_field=TextField() ), output_field=TextField(), ) ) .filter(db_path=path_str) ) return qs ================================================ FILE: lamindb/models/block.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Any, Literal, get_args, overload from django.db import models from django.db.models import ( CASCADE, PROTECT, CharField, DateTimeField, ForeignKey, JSONField, Q, TextField, ) from lamin_utils import logger from lamindb_setup.core.hashing import hash_string from ..base.types import RegistryId from ..base.uids import base62_16 from ._is_versioned import create_uid, process_revises from .artifact import Artifact from .collection import Collection from .feature import Feature from .project import Project from .record import Record from .run import Run, User, current_user_id from .schema import Schema from .sqlrecord import ( BaseSQLRecord, Branch, IsVersioned, Space, SQLRecord, init_self_from_db, update_attributes, ) from .transform import Transform if TYPE_CHECKING: from datetime import datetime from .query_manager import RelatedManager _VERSIONED_ATTACHED_KINDS = ("readme",) # only readme is versioned; comment is not _VALID_BLOCK_KINDS: tuple[str, ...] = ("readme", "comment") _BLOCK_ALLOWED_NON_REGISTRY_KEYS: tuple[str, ...] = ("README.md",) def _init_versioned_attached_block( self: BaseBlock, fk_field_name: str, *args: Any, allowed_extra: tuple[str, ...] 
= (), **kwargs: Any, ) -> None: cls = type(self) if len(args) == len(self._meta.concrete_fields): super(cls, self).__init__(*args, **kwargs) return None if args: raise ValueError( f"Please only use keyword arguments to construct a {cls.__name__}" ) fk_value = kwargs.pop(fk_field_name, None) content = kwargs.pop("content", None) kind = kwargs.pop("kind", None) version_tag = kwargs.pop("version_tag", kwargs.pop("version", None)) revises = kwargs.pop("revises", None) using = kwargs.pop("using", None) uid = kwargs.pop("uid", None) if "uid" in kwargs else None default_allowed_extra = ("branch", "branch_id", "created_on", "created_on_id") all_allowed_extra = default_allowed_extra + allowed_extra extra_kwargs = {k: kwargs.pop(k) for k in all_allowed_extra if k in kwargs} allowed = { fk_field_name, "content", "kind", "version", "version_tag", "revises", "using", "uid", *all_allowed_extra, } if kwargs: raise ValueError( f"Only {', '.join(sorted(allowed))} can be passed, but you passed: {kwargs}" ) if fk_value is None: raise ValueError(f"{fk_field_name} is required for {cls.__name__}") if kind is None: raise ValueError( f"kind is required for {cls.__name__}; use 'readme' or 'comment'" ) if kind not in _VALID_BLOCK_KINDS: raise ValueError(f"kind must be 'readme' or 'comment', got {kind!r}") if kind == "comment": if revises is not None: raise ValueError( "revises is not allowed for kind='comment'; comments are not versioned" ) new_uid, _ = create_uid( revises=None, version_tag=version_tag, n_full_id=cls._len_full_uid, ) block_hash = hash_string(content) if content else None super(cls, self).__init__( uid=new_uid, content=content or "", hash=block_hash, kind=kind, version_tag=version_tag, revises=None, **{fk_field_name: fk_value}, **extra_kwargs, ) return None # kind == "readme" (versioned) if revises is None and fk_value is not None: candidate_for_revises = ( cls.objects.using(using) .filter( **{fk_field_name: fk_value}, kind=kind, is_latest=True, ) .order_by("-created_at") .first() ) if candidate_for_revises is not None: revises = candidate_for_revises content_blank = getattr(revises, "content", None) in (None, "") if content_blank: logger.important( "no content was yet saved, returning existing " f"block with same {fk_field_name} and kind" ) uid = revises.uid if revises is not None and uid is not None and uid == revises.uid: init_self_from_db(self, revises) update_attributes(self, {}) return None new_uid, revises = create_uid( revises=revises, version_tag=version_tag, n_full_id=cls._len_full_uid, ) if uid is None: uid = new_uid block_hash = hash_string(content) if content else None super(cls, self).__init__( uid=uid, content=content or "", hash=block_hash, kind=kind, version_tag=version_tag, revises=revises, **{fk_field_name: fk_value}, **extra_kwargs, ) class BaseBlock(IsVersioned): class Meta: abstract = True _len_full_uid: int = 20 _len_stem_uid: int = 16 id = models.BigAutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=_len_full_uid, default=base62_16, ) """Universal id.""" content: str = TextField() """Content of the block.""" hash: str = CharField(max_length=22, db_index=True, null=True) """Content hash of the block.""" kind: str = CharField( max_length=22, db_index=True, default="readme", db_default="readme" ) """The kind of block.""" created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = 
ForeignKey( "lamindb.User", PROTECT, default=current_user_id, related_name="+" ) """Creator of block.""" _status_code: int = models.SmallIntegerField(default=0, db_default=0, db_index=True) """Status code.""" _aux: dict[str, Any] | None = JSONField(default=None, db_default=None, null=True) """Auxiliary field for dictionary-like metadata.""" class Block(BaseBlock, SQLRecord): """An experimental markdown block for anything: issues, standalone markdown pages, comments, etc. The `Block` model is experimental and may change in the future. """ class Meta: app_label = "lamindb" # same key as in transform/artifact/collection key: str | None = CharField(max_length=1024, db_index=True, null=True) """The key for which we want to create a block.""" anchor: Block | None = ForeignKey( "Block", PROTECT, related_name="children", null=True ) """The anchor of this block. For a comment, could be the issue on which the comment is attached. For a sub-post, could be the parent post. """ projects: RelatedManager[Project] """Projects that annotate this block.""" anchors: RelatedManager[Block] """This block anchors these blocks.""" @overload def __init__( self, key: str | None = None, content: str | None = None, kind: Literal["readme"] = ..., version: str | None = None, revises: Block | None = None, anchor: Block | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None if args: raise ValueError("Please only use keyword arguments to construct a Block") key = kwargs.pop("key", None) content = kwargs.pop("content", None) revises = kwargs.pop("revises", None) version_tag = kwargs.pop("version_tag", kwargs.pop("version", None)) kind = kwargs.pop("kind", None) anchor = kwargs.pop("anchor", None) using = kwargs.pop("using", None) uid = kwargs.pop("uid", None) if "uid" in kwargs else None branch = kwargs.pop("branch", None) branch_id = kwargs.pop("branch_id", 1) space = kwargs.pop("space", None) space_id = kwargs.pop("space_id", 1) if kwargs: raise ValueError( "Only key, content, kind, version, revises, anchor " f"can be passed, but you passed: {kwargs}" ) if kind != "readme": raise ValueError("Only kind = 'readme' is supported for block.") _registry_ids = get_args(RegistryId) allowed_keys = set(_registry_ids).union(_BLOCK_ALLOWED_NON_REGISTRY_KEYS) if key is not None and key not in allowed_keys: raise ValueError( "key must be one of RegistryId or " f"{', '.join(_BLOCK_ALLOWED_NON_REGISTRY_KEYS)}: " f"{', '.join(_registry_ids)}" ) if revises is not None and not isinstance(revises, Block): raise TypeError("`revises` has to be of type `Block`") if revises is None: if uid is not None: revises = ( Block.objects.using(using) .filter( uid__startswith=uid[:-4], is_latest=True, ) .order_by("-created_at") .first() ) elif key is not None: candidate_for_revises = ( Block.objects.using(using) .filter( ~Q(branch_id=-1), key=key, is_latest=True, ) .order_by("-created_at") .first() ) if candidate_for_revises is not None: revises = candidate_for_revises content_blank = getattr(candidate_for_revises, "content", None) in ( None, "", ) if content_blank: logger.important( "no content was yet saved, returning existing " "block with same key" ) uid = revises.uid if revises is not None and uid is not None and uid == revises.uid: if revises.key != key: logger.warning("ignoring inconsistent key") init_self_from_db(self, revises) update_attributes(self, {}) return None if revises is not None and key is 
not None and revises.key != key: logger.important(f"renaming block {revises.key} to {key}") new_uid, version_tag, key, _, revises = process_revises( revises, version_tag, key, None, Block ) if uid is None: uid = new_uid block_hash = None if content is not None: block_hash = hash_string(content) block_candidate = Block.objects.filter( ~Q(branch_id=-1), hash=block_hash, is_latest=True, ).first() if block_candidate is not None: init_self_from_db(self, block_candidate) update_attributes(self, {}) if key is not None and block_candidate.key != key: logger.warning( f"key {self.key} on existing block differs from " f"passed key {key}, keeping original key" ) return None super().__init__( uid=uid, key=key, content=content or "", kind=kind, version_tag=version_tag, hash=block_hash, revises=revises, anchor=anchor, branch=branch, branch_id=branch_id, space=space, space_id=space_id, ) class HasBranch(models.Model): class Meta: abstract = True branch: Branch = ForeignKey( Branch, PROTECT, default=1, db_default=1, related_name="+", ) """The current branch of the object - changes e.g. on merge events.""" created_on: Branch = ForeignKey( Branch, PROTECT, default=1, db_default=1, related_name="+", ) """The branch on which this object was created - never changes.""" class RecordBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a record.""" class Meta: app_label = "lamindb" record: Record = ForeignKey(Record, CASCADE, related_name="ablocks") """The record to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "record", *args, **kwargs) class ArtifactBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to an artifact.""" class Meta: app_label = "lamindb" artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="ablocks") """The artifact to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "artifact", *args, **kwargs) class TransformBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a transform.""" class Meta: app_label = "lamindb" transform: Transform = ForeignKey( Transform, CASCADE, related_name="ablocks", null=True ) """The transform to which the block is attached.""" line_number: int | None = models.IntegerField(null=True) """The line number in the source code to which the block belongs.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block( self, "transform", *args, allowed_extra=("line_number",), **kwargs ) class RunBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a run.""" class Meta: app_label = "lamindb" run: Run = ForeignKey(Run, CASCADE, related_name="ablocks") """The run to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "run", *args, **kwargs) class CollectionBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a collection.""" class Meta: app_label = "lamindb" collection: Collection = ForeignKey( Collection, CASCADE, related_name="ablocks", null=True ) """The collection to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "collection", *args, **kwargs) class SchemaBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a schema.""" class Meta: app_label = "lamindb" schema: Schema 
= ForeignKey(Schema, CASCADE, related_name="ablocks") """The schema to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "schema", *args, **kwargs) class FeatureBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a feature.""" class Meta: app_label = "lamindb" feature: Feature = ForeignKey(Feature, CASCADE, related_name="ablocks") """The feature to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "feature", *args, **kwargs) class ProjectBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a project.""" class Meta: app_label = "lamindb" project: Project = ForeignKey(Project, CASCADE, related_name="ablocks") """The project to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "project", *args, **kwargs) class SpaceBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a space.""" class Meta: app_label = "lamindb" space: Space = ForeignKey(Space, CASCADE, related_name="ablocks") """The space to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "space", *args, **kwargs) class ULabelBlock(BaseBlock, BaseSQLRecord, HasBranch): """An unstructured notes block that can be attached to a ulabel.""" class Meta: app_label = "lamindb" ulabel = ForeignKey("ULabel", CASCADE, related_name="ablocks") """The ulabel to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "ulabel", *args, **kwargs) class BranchBlock(BaseBlock, BaseSQLRecord): """An unstructured notes block that can be attached to a branch.""" class Meta: app_label = "lamindb" branch: Branch = ForeignKey(Branch, CASCADE, related_name="ablocks") """The branch to which the block is attached.""" def __init__(self, *args, **kwargs): _init_versioned_attached_block(self, "branch", *args, **kwargs) ================================================ FILE: lamindb/models/can_curate.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Iterable, Literal, Union import numpy as np from django.core.exceptions import FieldDoesNotExist from django.db.models import Manager, QuerySet from lamin_utils import colors, logger from lamindb.base.utils import strict_classmethod from ..errors import ValidationError from ._from_values import ( _format_values, _from_values, get_organism_record_from_field, ) from .sqlrecord import SQLRecord, get_name_field if TYPE_CHECKING: from lamin_utils._inspect import InspectResult from pandas import DataFrame from lamindb.base.types import ListLike, StrField from .query_set import SQLRecordList def _check_if_record_in_db(record: str | SQLRecord | None, using_key: str | None): """Check if the record is from the using_key DB.""" if isinstance(record, SQLRecord): if using_key is not None and using_key != "default": if record._state.db != using_key: raise ValueError( f"record must be a {record.__class__.__get_name_with_module__()} record from instance '{using_key}'!" 
) def _concat_lists(values: ListLike | str) -> list[str]: """Concatenate a list of lists of strings into a single list.""" import pandas as pd if isinstance(values, str): values = [values] if isinstance(values, (list, pd.Series)) and len(values) > 0: first_item = values[0] if isinstance(values, list) else values.iloc[0] if isinstance(first_item, list): if isinstance(values, pd.Series): values = values.tolist() values = [ v for sublist in values if isinstance(sublist, list) for v in sublist ] return values def _inspect( cls, values: ListLike, field: StrField | None = None, *, mute: bool = False, organism: str | SQLRecord | None = None, source: SQLRecord | None = None, from_source: bool = True, strict_source: bool = False, ) -> DataFrame | dict[str, list[str]]: """{}""" # noqa: D415 from lamin_utils._inspect import inspect values = _concat_lists(values) field_str = get_name_field(cls, field=field) queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.filter().all() registry = queryset.model model_name = registry._meta.model.__name__ if isinstance(source, SQLRecord): _check_if_record_in_db(source, queryset.db) # if strict_source mode, restrict the query to the passed ontology source # otherwise, inspect across records present in the DB from all ontology sources and no-source if strict_source: queryset = queryset.filter(source=source) organism_record = get_organism_record_from_field( getattr(registry, field_str), organism, values, queryset.db ) _check_if_record_in_db(organism_record, queryset.db) # do not inspect synonyms if the field is not name field standardize = True if hasattr(registry, "_name_field") and field_str != registry._name_field: standardize = False # inspect in the DB result_db = inspect( df=_filter_queryset_with_organism(queryset=queryset, organism=organism_record), identifiers=values, field=field_str, standardize=standardize, mute=mute, ) nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys()) if from_source and len(nonval) > 0 and hasattr(registry, "source_id"): try: public_result = registry.public( organism=organism_record, source=source ).inspect( values=nonval, field=field_str, mute=True, standardize=standardize, ) public_validated = public_result.validated public_mapper = public_result.synonyms_mapper hint = False if len(public_validated) > 0 and not mute: print_values = _format_values(public_validated) s = "" if len(public_validated) == 1 else "s" labels = colors.yellow(f"{len(public_validated)} {model_name} term{s}") logger.print( f" detected {labels} in public source for" f" {colors.italic(field_str)}: {colors.yellow(print_values)}" ) hint = True if len(public_mapper) > 0 and not mute: print_values = _format_values(list(public_mapper.keys())) s = "" if len(public_mapper) == 1 else "s" labels = colors.yellow(f"{len(public_mapper)} {model_name} term{s}") logger.print( f" detected {labels} in public source as {colors.italic(f'synonym{s}')}:" f" {colors.yellow(print_values)}" ) hint = True if hint: logger.print( f"→ add records from public source to your {model_name} registry via" f" {colors.italic('.from_values()')}" ) nonval = [i for i in public_result.non_validated if i not in public_mapper] # type: ignore # no public source is found except ValueError: logger.warning("no public source found, skipping source validation") if len(nonval) > 0 and not mute: print_values = _format_values(list(nonval)) s = "" if len(nonval) == 1 else "s" labels = colors.red(f"{len(nonval)} term{s}") logger.print(f" couldn't validate {labels}: 
{colors.red(print_values)}") logger.print( f"→ if you are sure, create new record{s} via" f" {colors.italic(f'{registry.__name__}()')} and save to your registry" ) return result_db def _validate( cls, values: ListLike, field: StrField | None = None, *, mute: bool = False, organism: str | SQLRecord | None = None, source: SQLRecord | None = None, strict_source: bool = False, ) -> np.ndarray: """{}""" # noqa: D415 import pandas as pd from lamin_utils._inspect import validate return_str = True if isinstance(values, str) else False values = _concat_lists(values) field_str = get_name_field(cls, field=field) queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.filter().all() registry = queryset.model if isinstance(source, SQLRecord): _check_if_record_in_db(source, queryset.db) if strict_source: queryset = queryset.filter(source=source) organism_record = get_organism_record_from_field( getattr(registry, field_str), organism, values, queryset.db ) _check_if_record_in_db(organism_record, queryset.db) field_values = pd.Series( _filter_queryset_with_organism( queryset=queryset, organism=organism_record, values_list_field=field_str, ), dtype="object", ) if field_values.empty: if not mute: msg = f"Your {queryset.model.__name__} registry is empty, consider populating it first!" if hasattr(queryset.model, "source_id"): msg += "\n → use `.import_source()` to import records from a source, e.g. a public ontology" logger.warning(msg) return np.array([False] * len(values)) result = validate( identifiers=values, field_values=field_values, case_sensitive=True, mute=mute, field=field_str, ) if return_str and len(result) == 1: return result[0] else: return result def _standardize( cls, values: ListLike, field: StrField | None = None, *, return_field: str = None, return_mapper: bool = False, case_sensitive: bool = False, mute: bool = False, from_source: bool = True, keep: Literal["first", "last", False] = "first", synonyms_field: str = "synonyms", organism: str | SQLRecord | None = None, source: SQLRecord | None = None, strict_source: bool = False, ) -> list[str] | dict[str, str]: """{}""" # noqa: D415 import pandas as pd from lamin_utils._standardize import standardize as map_synonyms return_str = True if isinstance(values, str) else False values = _concat_lists(values) field_str = get_name_field(cls, field=field) return_field_str = get_name_field( cls, field=field if return_field is None else return_field ) queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.filter().all() registry = queryset.model if isinstance(source, SQLRecord): _check_if_record_in_db(source, queryset.db) if strict_source: queryset = queryset.filter(source=source) organism_record = get_organism_record_from_field( getattr(registry, field_str), organism, values, queryset.db ) _check_if_record_in_db(organism_record, queryset.db) # only perform synonym mapping if field is the name field if hasattr(registry, "_name_field") and field_str != registry._name_field: synonyms_field = None try: registry._meta.get_field(synonyms_field) fields = { field_name for field_name in [field_str, return_field_str, synonyms_field] if field_name is not None } df = _filter_queryset_with_organism( queryset=queryset, organism=organism_record, values_list_fields=list(fields), ) except FieldDoesNotExist: df = pd.DataFrame() _kwargs = { "field": field_str, "return_field": return_field_str, "case_sensitive": case_sensitive, "keep": keep, "synonyms_field": synonyms_field, } # standardized names from the DB std_names_db = map_synonyms( df=df, 
identifiers=values, return_mapper=return_mapper, mute=mute, **_kwargs, ) def _return(result: list, mapper: dict): if return_mapper: return mapper else: if return_str and len(result) == 1: return result[0] return result # map synonyms in public source if hasattr(registry, "source_id") and from_source: mapper = {} if return_mapper: mapper = std_names_db std_names_db = map_synonyms( df=df, identifiers=values, return_mapper=False, mute=True, **_kwargs ) val_res = registry.validate( std_names_db, field=field, mute=True, organism=organism_record ) if all(val_res): return _return(result=std_names_db, mapper=mapper) nonval = np.array(std_names_db)[~val_res] std_names_bt_mapper = registry.public( organism=organism_record, source=source ).standardize(nonval, return_mapper=True, mute=True, **_kwargs) if len(std_names_bt_mapper) > 0 and not mute: s = "" if len(std_names_bt_mapper) == 1 else "s" field_print = "synonym" if field_str == return_field_str else field_str reduced_mapped_keys_str = f"{list(std_names_bt_mapper.keys())[:10] + ['...'] if len(std_names_bt_mapper) > 10 else list(std_names_bt_mapper.keys())}" truncated_note = ( " (output truncated)" if len(std_names_bt_mapper) > 10 else "" ) warn_msg = ( f"found {len(std_names_bt_mapper)} {field_print}{s} in public source{truncated_note}:" f" {reduced_mapped_keys_str}\n" f" please add corresponding {registry._meta.model.__name__} records via{truncated_note}:" f" `.from_values({reduced_mapped_keys_str})`" ) logger.warning(warn_msg) mapper.update(std_names_bt_mapper) if hasattr(std_names_db, "dtype") and isinstance( std_names_db.dtype, pd.CategoricalDtype ): result = std_names_db.cat.rename_categories(std_names_bt_mapper).tolist() else: result = pd.Series(std_names_db).replace(std_names_bt_mapper).tolist() return _return(result=result, mapper=mapper) else: return _return(result=std_names_db, mapper=std_names_db) def _add_or_remove_synonyms( synonym: str | ListLike, record: CanCurate, action: Literal["add", "remove"], force: bool = False, save: bool | None = None, ): """Add or remove synonyms.""" def check_synonyms_in_all_records(synonyms: set[str], record: CanCurate): """Errors if input synonym is associated with other records in the DB.""" import pandas as pd from IPython.display import display syns_all = ( record.__class__.filter().exclude(synonyms="").exclude(synonyms=None) # type: ignore ) if len(syns_all) == 0: return df = pd.DataFrame(syns_all.values()) df["synonyms"] = df["synonyms"].str.split("|") df = df.explode("synonyms") matches_df = df[(df["synonyms"].isin(synonyms)) & (df["id"] != record.id)] # type: ignore if matches_df.shape[0] > 0: records_df = pd.DataFrame(syns_all.filter(id__in=matches_df["id"]).values()) logger.error( f"input synonyms {matches_df['synonyms'].unique()} already associated" " with the following records:\n" ) display(records_df) raise ValidationError( f"you are trying to assign a synonym to record: {record}\n" " → consider removing the synonym from existing records or using a different synonym." 
) # passed synonyms # nothing happens when passing an empty string or list if isinstance(synonym, str): if len(synonym) == 0: return syn_new_set = {synonym} else: if synonym == [""]: return syn_new_set = set(synonym) # nothing happens when passing an empty string or list if len(syn_new_set) == 0: return # because we use | as the separator if any("|" in i for i in syn_new_set): raise ValidationError("a synonym can't contain '|'!") # existing synonyms syns_exist = record.synonyms # type: ignore if syns_exist is None or len(syns_exist) == 0: syns_exist_set = set() else: syns_exist_set = set(syns_exist.split("|")) if action == "add": if not force: check_synonyms_in_all_records(syn_new_set, record) syns_exist_set.update(syn_new_set) elif action == "remove": syns_exist_set = syns_exist_set.difference(syn_new_set) if len(syns_exist_set) == 0: syns_str = None else: syns_str = "|".join(syns_exist_set) record.synonyms = syns_str # type: ignore if save is None: # if record is already in DB, save the changes to DB save = not record._state.adding # type: ignore if save: record.save() # type: ignore def _check_synonyms_field_exist(record: CanCurate): """Check if synonyms field exists.""" if not hasattr(record, "synonyms"): raise NotImplementedError( f"No synonyms field found in table {record.__class__.__name__}!" ) from None def _filter_queryset_with_organism( queryset: QuerySet, organism: SQLRecord | None = None, values_list_field: str | None = None, values_list_fields: list[str] | None = None, ): """Filter a queryset based on organism.""" import pandas as pd if organism is not None: queryset = queryset.filter(organism=organism) # values_list_field/s for better performance if values_list_field is None: if values_list_fields: return pd.DataFrame.from_records( queryset.values_list(*values_list_fields), columns=values_list_fields ) return pd.DataFrame.from_records(queryset.values()) else: return queryset.values_list(values_list_field, flat=True) class CanCurate: """Base class providing :class:`~lamindb.models.SQLRecord`-based validation.""" @strict_classmethod def inspect( cls, values: ListLike, field: StrField | None = None, *, mute: bool = False, organism: Union[str, SQLRecord, None] = None, source: SQLRecord | None = None, from_source: bool = True, strict_source: bool = False, ) -> InspectResult: """Inspect if values are mappable to a field. Being mappable means that an exact match exists. Args: values: Values that will be checked against the field. field: The field of values. Examples are `'ontology_id'` to map against the source ID or `'name'` to map against the ontologies field names. mute: Whether to mute logging. organism: An Organism name or record. source: A `bionty.Source` record that specifies the version to inspect against. strict_source: Determines the validation behavior against records in the registry. - If `False`, validation will include all records in the registry, ignoring the specified source. - If `True`, validation will only include records in the registry that are linked to the specified source. Note: this parameter won't affect validation against public sources. 
See Also: :meth:`~lamindb.models.CanCurate.validate` Example:: import bionty as bt # save some gene records bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol", organism="human").save() # inspect gene symbols gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"] result = bt.Gene.inspect(gene_symbols, field=bt.Gene.symbol, organism="human") assert result.validated == ["A1CF", "A1BG"] assert result.non_validated == ["FANCD1", "FANCD20"] """ return _inspect( cls=cls, values=values, field=field, mute=mute, strict_source=strict_source, organism=organism, source=source, from_source=from_source, ) @strict_classmethod def validate( cls, values: ListLike, field: StrField | None = None, *, mute: bool = False, organism: Union[str, SQLRecord, None] = None, source: SQLRecord | None = None, strict_source: bool = False, ) -> np.ndarray: """Validate values against existing values of a string field. Note this is strict validation: only exact matches are asserted. Args: values: Values that will be validated against the field. field: The field of values. Examples are `'ontology_id'` to map against the source ID or `'name'` to map against the ontologies field names. mute: Whether to mute logging. organism: An Organism name or record. source: A `bionty.Source` record that specifies the version to validate against. strict_source: Determines the validation behavior against records in the registry. - If `False`, validation will include all records in the registry, ignoring the specified source. - If `True`, validation will only include records in the registry that are linked to the specified source. Note: this parameter won't affect validation against public sources. Returns: A vector of booleans indicating if an element is validated. See Also: :meth:`~lamindb.models.CanCurate.inspect` Example:: import bionty as bt bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol", organism="human").save() gene_symbols = ["A1CF", "A1BG", "FANCD1", "FANCD20"] bt.Gene.validate(gene_symbols, field=bt.Gene.symbol, organism="human") #> array([ True, True, False, False]) """ return _validate( cls=cls, values=values, field=field, mute=mute, strict_source=strict_source, organism=organism, source=source, ) @strict_classmethod def from_values( cls, values: ListLike, field: StrField | None = None, create: bool = False, organism: Union[SQLRecord, str, None] = None, source: SQLRecord | None = None, standardize: bool = True, from_source: bool = True, mute: bool = False, ) -> SQLRecordList: """Bulk create validated records by parsing values for an identifier such as a name or an id. Args: values: A list of values for an identifier, e.g. `["name1", "name2"]`. field: A `SQLRecord` field to look up, e.g., `bt.CellMarker.name`. create: Whether to create records if they don't exist. organism: A `bionty.Organism` name or record. source: A `bionty.Source` record to validate against when creating records. standardize: Whether to standardize synonyms in the values. from_source: Whether to create records from public source. mute: Whether to mute logging. Returns: A list of validated records. For bionty registries, knowledge-coupled records are also returned. Notes: For more info, see tutorial: :doc:`docs:manage-ontologies`.
Example:: import bionty as bt # Bulk create from non-validated values will log warnings & returns empty list ulabels = ln.ULabel.from_values(["benchmark", "prediction", "test"]) assert len(ulabels) == 0 # Bulk create records from validated values returns the corresponding existing records ulabels = ln.ULabel.from_values(["benchmark", "prediction", "test"], create=True).save() assert len(ulabels) == 3 # Bulk create records from public reference bt.CellType.from_values(["T cell", "B cell"]).save() """ return _from_values( iterable=values, field=getattr(cls, get_name_field(cls, field=field)), create=create, organism=organism, source=source, mute=mute, ) @strict_classmethod def standardize( cls, values: Iterable, field: StrField | None = None, *, return_field: StrField | None = None, return_mapper: bool = False, case_sensitive: bool = False, mute: bool = False, from_source: bool = True, keep: Literal["first", "last", False] = "first", synonyms_field: str = "synonyms", organism: Union[str, SQLRecord, None] = None, source: SQLRecord | None = None, strict_source: bool = False, ) -> list[str] | dict[str, str]: """Maps input synonyms to standardized names. Args: values: Identifiers that will be standardized. field: The field representing the standardized names. return_field: The field to return. Defaults to field. return_mapper: If `True`, returns `{input_value: standardized_name}`. case_sensitive: Whether the mapping is case sensitive. mute: Whether to mute logging. from_source: Whether to standardize from public source. Defaults to `True` for BioRecord registries. keep: When a synonym maps to multiple names, determines which duplicates to mark as `pd.DataFrame.duplicated`: - `"first"`: returns the first mapped standardized name - `"last"`: returns the last mapped standardized name - `False`: returns all mapped standardized name. When `keep` is `False`, the returned list of standardized names will contain nested lists in case of duplicates. When a field is converted into return_field, keep marks which matches to keep when multiple return_field values map to the same field value. synonyms_field: A field containing the concatenated synonyms. organism: An Organism name or record. source: A `bionty.Source` record that specifies the version to validate against. strict_source: Determines the validation behavior against records in the registry. - If `False`, validation will include all records in the registry, ignoring the specified source. - If `True`, validation will only include records in the registry that are linked to the specified source. Note: this parameter won't affect validation against public sources. Returns: If `return_mapper` is `False`: a list of standardized names. Otherwise, a dictionary of mapped values with mappable synonyms as keys and standardized names as values. See Also: :meth:`~lamindb.models.CanCurate.add_synonym` Add synonyms. :meth:`~lamindb.models.CanCurate.remove_synonym` Remove synonyms. 
Example:: import bionty as bt # save some gene records bt.Gene.from_values(["A1CF", "A1BG", "BRCA2"], field="symbol", organism="human").save() # standardize gene synonyms gene_synonyms = ["A1CF", "A1BG", "FANCD1", "FANCD20"] bt.Gene.standardize(gene_synonyms) #> ['A1CF', 'A1BG', 'BRCA2', 'FANCD20'] """ return _standardize( cls=cls, values=values, field=field, return_field=return_field, return_mapper=return_mapper, case_sensitive=case_sensitive, mute=mute, strict_source=strict_source, from_source=from_source, keep=keep, synonyms_field=synonyms_field, organism=organism, source=source, ) def add_synonym( self, synonym: str | ListLike, force: bool = False, save: bool | None = None, ): """Add synonyms to a record. Args: synonym: The synonyms to add to the record. force: Whether to add synonyms even if they are already synonyms of other records. save: Whether to save the record to the database. See Also: :meth:`~lamindb.models.CanCurate.remove_synonym` Remove synonyms. Example:: import bionty as bt # save "T cell" record record = bt.CellType.from_source(name="T cell").save() record.synonyms #> "T-cell|T lymphocyte|T-lymphocyte" # add a synonym record.add_synonym("T cells") record.synonyms #> "T cells|T-cell|T-lymphocyte|T lymphocyte" """ _check_synonyms_field_exist(self) _add_or_remove_synonyms( synonym=synonym, record=self, force=force, action="add", save=save ) def remove_synonym(self, synonym: str | ListLike): """Remove synonyms from a record. Args: synonym: The synonym values to remove. See Also: :meth:`~lamindb.models.CanCurate.add_synonym` Add synonyms Example:: import bionty as bt # save "T cell" record record = bt.CellType.from_source(name="T cell").save() record.synonyms #> "T-cell|T lymphocyte|T-lymphocyte" # remove a synonym record.remove_synonym("T-cell") record.synonyms #> "T lymphocyte|T-lymphocyte" """ _check_synonyms_field_exist(self) _add_or_remove_synonyms(synonym=synonym, record=self, action="remove") def set_abbr(self, value: str): """Set value for abbr field and add to synonyms. Args: value: A value for an abbreviation. 
See Also: :meth:`~lamindb.models.CanCurate.add_synonym` Example:: import bionty as bt # save an experimental factor record scrna = bt.ExperimentalFactor.from_source(name="single-cell RNA sequencing").save() assert scrna.abbr is None assert scrna.synonyms == "single-cell RNA-seq|single-cell transcriptome sequencing|scRNA-seq|single cell RNA sequencing" # set abbreviation scrna.set_abbr("scRNA") assert scrna.abbr == "scRNA" # synonyms are updated assert scrna.synonyms == "scRNA|single-cell RNA-seq|single cell RNA sequencing|single-cell transcriptome sequencing|scRNA-seq" """ self.abbr = value if hasattr(self, "name") and value == self.name: pass else: try: self.add_synonym(value, save=False) except Exception as e: # pragma: no cover logger.debug( f"Encountered an Exception while attempting to add synonyms.\n{e}" ) if not self._state.adding: # type: ignore self.save() # type: ignore ================================================ FILE: lamindb/models/collection.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Any, Literal, overload from django.db import models from django.db.models import CASCADE, PROTECT, Q from lamin_utils import logger from lamindb_setup.core.hashing import HASH_LENGTH, hash_set from lamindb.base.fields import ( CharField, ForeignKey, OneToOneField, TextField, ) from lamindb.base.utils import strict_classmethod from ..base.uids import base62_20 from ..errors import FieldValidationError from ..models._is_versioned import process_revises from ._is_versioned import IsVersioned from .artifact import ( Artifact, get_run, populate_subsequent_run, save_schema_links, track_run_input, ) from .has_parents import view_lineage from .run import Run, TracksRun, TracksUpdates from .sqlrecord import ( BaseSQLRecord, IsLink, SQLRecord, _get_record_kwargs, init_self_from_db, update_attributes, ) if TYPE_CHECKING: from collections.abc import Iterable, Iterator import anndata as ad import pandas as pd from polars import LazyFrame as PolarsLazyFrame from pyarrow.dataset import Dataset as PyArrowDataset from ..core._mapped_collection import MappedCollection from ..core.storage import UPath from .block import CollectionBlock from .project import Project, Reference from .query_manager import RelatedManager from .query_set import QuerySet from .record import Record from .transform import Transform from .ulabel import ULabel def _load_concat_artifacts( artifacts: list[Artifact], join: Literal["inner", "outer"] = "outer", **kwargs ) -> pd.DataFrame | ad.AnnData: import anndata as ad import pandas as pd suffixes = {artifact.suffix for artifact in artifacts} if len(suffixes) != 1: raise ValueError( "Can only load collections where all artifacts have the same suffix" ) # because we're tracking data flow on the collection-level, here, we don't # want to track it on the artifact-level first_object = artifacts[0].load(is_run_input=False) is_dataframe = isinstance(first_object, pd.DataFrame) is_anndata = isinstance(first_object, ad.AnnData) if not is_dataframe and not is_anndata: raise ValueError(f"Unable to concatenate {suffixes.pop()} objects.") objects = [first_object] artifact_uids = [artifacts[0].uid] for artifact in artifacts[1:]: objects.append(artifact.load(is_run_input=False)) artifact_uids.append(artifact.uid) if is_dataframe: concat_object = pd.concat(objects, join=join, **kwargs) elif is_anndata: label = kwargs.pop("label", "artifact_uid") keys = kwargs.pop("keys", artifact_uids) concat_object = ad.concat(objects, 
join=join, label=label, keys=keys, **kwargs) return concat_object class Collection(SQLRecord, IsVersioned, TracksRun, TracksUpdates): """Versioned collections of artifacts. Args: artifacts: `Artifact | list[Artifact]` One or several artifacts. key: `str` A file-path like key, analogous to the `key` parameter of `Artifact` and `Transform`. description: `str | None = None` A description. meta: `Artifact | None = None` An artifact that defines metadata for the collection. reference: `str | None = None` A simple reference, e.g. an external ID or a URL. reference_type: `str | None = None` A way to indicate the type of the simple reference, e.g. `"url"`. run: `Run | None = None` The run that creates the collection. revises: `Collection | None = None` An old version of the collection. skip_hash_lookup: `bool = False` Skip the hash lookup so that a new collection is created even if a collection with the same hash already exists. See Also: :class:`~lamindb.Artifact` Examples: Create a collection from a list of :class:`~lamindb.Artifact` objects:: collection = ln.Collection([artifact1, artifact2], key="my_project/my_collection") Create a collection that groups a data & a metadata artifact (e.g., here :doc:`docs:rxrx`):: collection = ln.Collection(data_artifact, key="my_project/my_collection", meta=metadata_artifact) """ class Meta(SQLRecord.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" constraints = [ models.UniqueConstraint( fields=["key", "hash"], name="unique_collection_key_hash_not_null", ) ] _len_full_uid: int = 20 _len_stem_uid: int = 16 _name_field: str = "key" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=_len_full_uid, default=base62_20, ) """Universal id, valid across DB instances.""" key: str = CharField(db_index=True) """Name or path-like key.""" # below is the only case in which we use a TextField # for description; we do so because users had descriptions exceeding 255 chars # in their instances description: str | None = TextField(null=True) """A description or title.""" hash: str | None = CharField( max_length=HASH_LENGTH, db_index=True, null=True, ) """Hash of collection content.""" reference: str | None = CharField(max_length=255, db_index=True, null=True) """A reference like URL or external ID.""" # also for reference_type here, we allow an extra long max_length reference_type: str | None = CharField(max_length=25, db_index=True, null=True) """Type of reference, e.g., cellxgene Census collection_id.""" ulabels: RelatedManager[ULabel] = models.ManyToManyField( "ULabel", through="CollectionULabel", related_name="collections" ) """ULabels annotating the collection (see :class:`~lamindb.Feature`) ← :attr:`~lamindb.ULabel.collections`.""" run: Run | None = ForeignKey( Run, PROTECT, related_name="output_collections", null=True, default=None ) """:class:`~lamindb.Run` that created the `collection` ← :attr:`~lamindb.Run.output_collections`.""" input_of_runs: RelatedManager[Run] = models.ManyToManyField( Run, related_name="input_collections" ) """Runs that use this collection as an input ← :attr:`~lamindb.Run.input_collections`.""" recreating_runs: RelatedManager[Run] = models.ManyToManyField( "Run", related_name="recreated_collections", ) """Runs that re-created the record after initial creation ← :attr:`~lamindb.Run.recreated_collections`.""" artifacts: RelatedManager[Artifact] = models.ManyToManyField(
"Artifact", related_name="collections", through="CollectionArtifact" ) """Artifacts in collection ← :attr:`~lamindb.Artifact.collections`.""" meta_artifact: Artifact | None = OneToOneField( "Artifact", PROTECT, null=True, unique=True, related_name="_meta_of_collection", ) """An artifact that stores metadata that indexes a collection. It has a 1:1 correspondence with an artifact. If needed, you can access the collection from the artifact via a private field: `artifact._meta_of_collection`. """ linked_in_records: RelatedManager[Record] = models.ManyToManyField( "Record", through="RecordCollection", related_name="linked_collections" ) """This collection is linked in these records as a value ← :attr:`~lamindb.Record.linked_collections`.""" _actions: RelatedManager[Artifact] = models.ManyToManyField( Artifact, related_name="+" ) """Actions to attach for the UI.""" projects: RelatedManager[Project] """Linked projects ← :attr:`~lamindb.Project.collections`.""" references: RelatedManager[Reference] """Linked references ← :attr:`~lamindb.Reference.collections`.""" records: RelatedManager[Record] """Linked records ← :attr:`~lamindb.Record.collections`.""" ablocks: RelatedManager[CollectionBlock] """Attached blocks ← :attr:`~lamindb.CollectionBlock.collection`.""" @overload def __init__( self, artifacts: Artifact | list[Artifact], key: str, description: str | None = None, meta: Any | None = None, reference: str | None = None, reference_type: str | None = None, run: Run | None = None, revises: Collection | None = None, skip_hash_lookup: bool = False, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None # now we proceed with the user-facing constructor if len(args) > 1: raise ValueError("Only one non-keyword arg allowed: artifacts") artifacts: Artifact | list[Artifact] = ( kwargs.pop("artifacts") if len(args) == 0 else args[0] ) meta_artifact: Artifact | None = kwargs.pop("meta_artifact", None) key: str | None = kwargs.pop("key", None) description: str | None = kwargs.pop("description", None) reference: str | None = kwargs.pop("reference", None) reference_type: str | None = kwargs.pop("reference_type", None) run: Run | None = kwargs.pop("run", None) revises: Collection | None = kwargs.pop("revises", None) version_tag: str | None = kwargs.pop("version_tag", kwargs.pop("version", None)) skip_hash_lookup: bool = kwargs.pop("skip_hash_lookup", False) branch = kwargs.pop("branch", None) branch_id = kwargs.pop("branch_id", 1) space = kwargs.pop("space", None) space_id = kwargs.pop("space_id", 1) if not len(kwargs) == 0: valid_keywords = ", ".join( [val[0] for val in _get_record_kwargs(Collection)] ) raise FieldValidationError( f"Only {valid_keywords} can be passed, you passed: {kwargs}" ) if revises is None: revises = ( Collection.filter(key=key, is_latest=True) .order_by("-created_at") .first() ) provisional_uid, version_tag, key, description, revises = process_revises( revises, version_tag, key, description, Collection ) run = get_run(run) if isinstance(artifacts, Artifact): artifacts = [artifacts] else: if not hasattr(artifacts, "__getitem__"): raise ValueError("Artifact or list[Artifact] is allowed.") assert isinstance(artifacts[0], Artifact) # type: ignore # noqa: S101 hash = from_artifacts(artifacts) # type: ignore if meta_artifact is not None: if not isinstance(meta_artifact, Artifact): raise ValueError("meta_artifact has to be an Artifact") if 
isinstance(meta_artifact, Artifact): if meta_artifact._state.adding: raise ValueError( "Save meta_artifact artifact before creating collection!" ) # we ignore collections in trash containing the same hash if hash is not None and not skip_hash_lookup: # this purposefully leaves out the key that we have # in the hard database unique constraint # so that the user is able to find collections with the same hash across # keys # if this is not desired, set skip_hash_lookup=True existing_collection = Collection.objects.filter( ~Q(branch_id=-1), hash=hash, ).first() else: existing_collection = None if existing_collection is not None: logger.warning( f"returning collection with same hash: {existing_collection}; if you intended to query to track this collection as an input, use: ln.Collection.get()" ) init_self_from_db(self, existing_collection) update_attributes(self, {"description": description, "key": key}) populate_subsequent_run(self, run) else: _skip_validation = revises is not None and key == revises.key super().__init__( # type: ignore uid=provisional_uid, key=key, description=description, reference=reference, reference_type=reference_type, meta_artifact=meta_artifact, hash=hash, run=run, version_tag=version_tag, branch=branch, branch_id=branch_id, space=space, space_id=space_id, revises=revises, _skip_validation=_skip_validation, ) self._artifacts = artifacts if revises is not None and revises.uid != self.uid: track_run_input(revises, run=run) track_run_input(artifacts, run=run) @strict_classmethod def get( cls, idlike: int | str | None = None, *, is_run_input: bool | Run = False, **expressions, ) -> Artifact: """Get a single collection. Args: idlike: Either a uid stub, uid or an integer id. is_run_input: Whether to track this collection as run input. expressions: Fields and values passed as Django query expressions. Raises: :exc:`lamindb.errors.DoesNotExist`: In case no matching record is found. See Also: - Method in `SQLRecord` base class: :meth:`~lamindb.models.SQLRecord.get` Examples: :: collection = ln.Collection.get("okxPW6GIKBfRBE3B0000") collection = ln.Collection.get(key="scrna/collection1") """ from .query_set import QuerySet return QuerySet(model=cls).get(idlike, is_run_input=is_run_input, **expressions) def append(self, artifact: Artifact, run: Run | None = None) -> Collection: """Append an artifact to the collection. This does not modify the original collection in-place, but returns a new version of the original collection with the appended artifact. Args: artifact: An artifact to add to the collection. run: The run that creates the new version of the collection. Examples: :: collection_v1 = ln.Collection(artifact, key="My collection").save() collection_v2 = collection.append(another_artifact) # returns a new version of the collection collection_v2.save() # save the new version """ return Collection( # type: ignore self.artifacts.all().to_list() + [artifact], # key is automatically derived from revises.key description=self.description, revises=self, run=run, ) def open( self, engine: Literal["pyarrow", "polars"] = "pyarrow", is_run_input: bool | None = None, **kwargs, ) -> PyArrowDataset | Iterator[PolarsLazyFrame]: """Open a dataset for streaming. Works for `pyarrow` and `polars` compatible formats (`.parquet`, `.csv`, `.ipc` etc. files or directories with such files). Args: engine: Which module to use for lazy loading of a dataframe from `pyarrow` or `polars` compatible formats. is_run_input: Whether to track this artifact as run input. 
**kwargs: Keyword arguments for `pyarrow.dataset.dataset` or `polars.scan_*` functions. Notes: For more info, see guide: :doc:`/arrays`. """ if self._state.adding: artifacts = self._artifacts logger.warning("the collection isn't saved, consider calling `.save()`") else: artifacts = self.ordered_artifacts.all() paths = [artifact.path for artifact in artifacts] from ..core.storage._backed_access import _open_dataframe dataframe = _open_dataframe(paths, engine=engine, **kwargs) # track only if successful track_run_input(self, is_run_input) return dataframe def mapped( self, layers_keys: str | list[str] | None = None, obs_keys: str | list[str] | None = None, obsm_keys: str | list[str] | None = None, obs_filter: dict[str, str | list[str]] | None = None, join: Literal["inner", "outer"] | None = "inner", encode_labels: bool | list[str] = True, unknown_label: str | dict[str, str] | None = None, cache_categories: bool = True, parallel: bool = False, dtype: str | None = None, stream: bool = False, is_run_input: bool | None = None, ) -> MappedCollection: """Return a map-style dataset. Returns a PyTorch map-style dataset by virtually concatenating `AnnData` arrays. By default (`stream=False`) `AnnData` arrays are moved into a local cache first. `__getitem__` of the `MappedCollection` object takes a single integer index and returns a dictionary with the observation data sample for this index from the `AnnData` objects in the collection. The dictionary has keys for `layers_keys` (`.X` is in `"X"`), `obs_keys`, `obsm_keys` (under `f"obsm_{key}"`) and also `"_store_idx"` for the index of the `AnnData` object containing this observation sample. .. note:: For a guide, see :doc:`docs:scrna-mappedcollection`. This method currently only works for collections or query sets of `AnnData` artifacts. Args: layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list retrieves ``.X``. obs_keys: Keys from the ``.obs`` slots. obsm_keys: Keys from the ``.obsm`` slots. obs_filter: Select only observations with these values for the given obs columns. Should be a dictionary with obs column names as keys and filtering values (a string or a list of strings) as values. join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed, does not join. encode_labels: Encode labels into integers. Can be a list with elements from ``obs_keys``. unknown_label: Encode this label to -1. Can be a dictionary with keys from ``obs_keys`` if ``encode_labels=True`` or from ``encode_labels`` if it is a list. cache_categories: Enable caching categories of ``obs_keys`` for faster access. parallel: Enable sampling with multiple processes. dtype: Convert numpy arrays from ``.X``, ``.layers`` and ``.obsm`` to this dtype. stream: Whether to stream data from the array backend. is_run_input: Whether to track this collection as run input. Examples: >>> import lamindb as ln >>> from torch.utils.data import DataLoader >>> collection = ln.Collection.get(description="my collection") >>> mapped = collection.mapped(obs_keys=["cell_type", "batch"]) >>> dl = DataLoader(mapped, batch_size=128, shuffle=True) >>> # also works for query sets of artifacts, '...'
represents some filtering condition >>> # additional filtering on artifacts of the collection >>> mapped = collection.artifacts.all().filter(...).order_by("-created_at").mapped() >>> # or directly from a query set of artifacts >>> mapped = ln.Artifact.filter(..., otype="AnnData").order_by("-created_at").mapped() """ from ..core._mapped_collection import MappedCollection path_list = [] if self._state.adding: artifacts = self._artifacts logger.warning("the collection isn't saved, consider calling `.save()`") else: artifacts = self.ordered_artifacts.all() for artifact in artifacts: if ".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix: logger.warning(f"ignoring artifact with suffix {artifact.suffix}") continue elif not stream: path_list.append(artifact.cache()) else: path_list.append(artifact.path) ds = MappedCollection( path_list, layers_keys, obs_keys, obsm_keys, obs_filter, join, encode_labels, unknown_label, cache_categories, parallel, dtype, ) # track only if successful track_run_input(self, is_run_input) return ds def cache(self, is_run_input: bool | None = None) -> list[UPath]: """Download cloud artifacts in collection to local cache. Follows syncing logic: only downloads outdated artifacts. Returns ordered paths to locally cached on-disk artifacts via `.ordered_artifacts.all()`: Args: is_run_input: Whether to track this collection as run input. """ path_list = [] for artifact in self.ordered_artifacts.all(): # do not want to track data lineage on the artifact level path_list.append(artifact.cache(is_run_input=False)) track_run_input(self, is_run_input) return path_list def load( self, join: Literal["inner", "outer"] = "outer", is_run_input: bool | None = None, **kwargs, ) -> pd.DataFrame | ad.AnnData: """Cache and load to memory. Returns an in-memory concatenated `DataFrame` or `AnnData` object. """ # cannot call track_run_input here, see comment further down artifacts = self.ordered_artifacts.all() concat_object = _load_concat_artifacts(artifacts, join, **kwargs) # only call it here because there might be errors during load or concat track_run_input(self, is_run_input) return concat_object def save(self, using: str | None = None) -> Collection: """Save the collection and underlying artifacts to database & storage. Args: using: The database to which you want to save. Examples: >>> collection = ln.Collection("./myfile.csv", name="myfile") """ if self.meta_artifact is not None: self.meta_artifact.save() super().save() # we don't allow updating the collection of artifacts # if users want to update the set of artifacts, they # have to create a new collection if hasattr(self, "_artifacts"): links = [ CollectionArtifact(collection_id=self.id, artifact_id=artifact.id) # type: ignore for artifact in self._artifacts ] # the below seems to preserve the order of the list in the # auto-incrementing integer primary # merely using .artifacts.set(*...) doesn't achieve this # we need ignore_conflicts=True so that this won't error if links already exist CollectionArtifact.objects.bulk_create(links, ignore_conflicts=True) save_schema_links(self) if using is not None: logger.warning("using argument is ignored") return self def restore(self) -> None: """Restore collection record from trash. 
Examples: For any `Collection` object `collection`, call: >>> collection.restore() """ self.branch_id = 1 self.save() @property def transform(self) -> Transform | None: """Transform whose run created the collection.""" return self.run.transform if self.run is not None else None @property def name(self) -> str: """Name of the collection. Splits `key` on `/` and returns the last element. """ return self.key.split("/")[-1] @property def ordered_artifacts(self) -> QuerySet: """Ordered `QuerySet` of `.artifacts`. Accessing the many-to-many field `collection.artifacts` directly gives you non-deterministic order. Using the property `.ordered_artifacts` allows to iterate through a set that's ordered by the order of the list that created the collection. """ return self.artifacts.order_by("links_collection__id") @property def data_artifact(self) -> Artifact | None: """Access to a single data artifact. If the collection has a single data & metadata artifact, this allows access via:: collection.data_artifact # first & only element of collection.artifacts collection.meta_artifact # metadata """ return self.artifacts.first() # internal function, not exposed to user def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]: # assert all artifacts are already saved saved = not any(artifact._state.adding for artifact in artifacts) if not saved: raise ValueError("Not all artifacts are yet saved, please save them") # validate consistency of hashes - we do not allow duplicate hashes hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None] hashes_set = set(hashes) if len(hashes) != len(hashes_set): seen = set() non_unique = [x for x in hashes if x in seen or seen.add(x)] # type: ignore logger.warning( f"your collection contains artifacts with non-unique hashes: {non_unique}" ) hash = hash_set(hashes_set) return hash class CollectionArtifact(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) collection: Collection = ForeignKey( Collection, CASCADE, related_name="links_artifact" ) artifact: Artifact = ForeignKey(Artifact, PROTECT, related_name="links_collection") class Meta: app_label = "lamindb" unique_together = ("collection", "artifact") # mypy: ignore-errors Collection.view_lineage = view_lineage ================================================ FILE: lamindb/models/feature.py ================================================ from __future__ import annotations import importlib import warnings from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast, get_args, overload import numpy as np import pgtrigger from django.conf import settings as django_settings from django.db import connection, models from django.db.models import CASCADE, PROTECT from django.db.models.query_utils import DeferredAttribute from django.db.utils import IntegrityError as DjangoIntegrityError from lamin_utils import logger from lamindb_setup._init_instance import get_schema_module_name from lamindb_setup.core import deprecated from lamindb_setup.core.hashing import HASH_LENGTH, hash_dict, hash_string from lamindb_setup.errors import ( MODULE_WASNT_CONFIGURED_MESSAGE_TEMPLATE, ModuleWasntConfigured, ) from lamindb.base.fields import ( BooleanField, CharField, ForeignKey, JSONField, TextField, ) from lamindb.base.types import DtypeStr, FieldAttr from lamindb.errors import ( FieldValidationError, IntegrityError, InvalidArgument, ValidationError, ) from ..base.uids import base62_12 from ._relations import dict_module_name_to_model_name from 
.can_curate import CanCurate from .has_parents import _query_relatives from .query_set import QuerySet, SQLRecordList from .run import ( TracksRun, TracksUpdates, ) from .sqlrecord import BaseSQLRecord, HasType, Registry, SQLRecord, _get_record_kwargs if TYPE_CHECKING: from collections.abc import Iterable import pandas as pd from pandas.core.dtypes.base import ExtensionDtype from .artifact import Artifact from .block import FeatureBlock from .project import Project from .query_manager import RelatedManager from .record import Record from .run import Run from .schema import Schema from .ulabel import ULabel FEATURE_DTYPES = set(get_args(DtypeStr)) @dataclass(frozen=True) class FeaturePredicate: """Predicate generated by comparing a Feature to a value.""" feature: Feature comparator: str value: Any def __bool__(self) -> bool: raise TypeError( "Feature predicates cannot be used as booleans. " "Pass them into `.filter(...)`." ) def parse_dtype( dtype_str: str, check_exists: bool = False, old_format: bool = False ) -> list[dict[str, Any]]: """Parses feature data type string into a structured list of components.""" from .artifact import Artifact allowed_dtypes = FEATURE_DTYPES # Handle list[...] types if dtype_str.startswith("list[") and dtype_str.endswith("]"): inner_dtype_str = dtype_str[5:-1] # Remove "list[" and "]" # Recursively parse the inner type inner_result = parse_dtype(inner_dtype_str, old_format=old_format) # Add "list": True to each component for component in inner_result: if isinstance(component, dict): component["list"] = True # type: ignore return inner_result is_composed_cat = dtype_str.startswith("cat[") and dtype_str.endswith("]") result: list[dict[str, Any]] = [] # backward compatibility for bare "cat" dtype (deprecated) if dtype_str == "cat": return result if is_composed_cat: related_registries = dict_module_name_to_model_name(Artifact) registries_str = dtype_str.replace("cat[", "")[:-1] # strip last ] if registries_str != "": registry_str_list = registries_str.split("|") for cat_single_dtype_str in registry_str_list: single_result = parse_cat_dtype( cat_single_dtype_str, related_registries=related_registries, check_exists=check_exists, old_format=old_format, ) result.append(single_result) elif dtype_str not in allowed_dtypes: raise ValueError( f"dtype is '{dtype_str}' but has to be one of {FEATURE_DTYPES}!" ) return result def get_record_type_from_uid( registry: Registry, record_uid: str, ) -> SQLRecord: type_record: SQLRecord = registry.get(record_uid) if type_record.branch_id == -1: warning_msg = f"retrieving {registry.__name__} type '{type_record.name}' (uid='{record_uid}') from trash" logger.warning(warning_msg) if not type_record.is_type: raise InvalidArgument( f"The resolved {type_record.__class__.__name__} '{type_record.name}' (uid='{record_uid}') is not a type: is_type is False." ) return type_record def get_record_type_from_nested_subtypes( registry: Registry, subtypes_list: list[str], field_str: str ) -> SQLRecord: """Get a record type by querying nested subtypes using raw SQL. This function only works with Record or ULabel registries. 
""" table_name = registry._meta.db_table final_name = subtypes_list[-1] # Build the SQL query with nested joins # For subtypes_list = ["A", "B", "C"], we want: # - Record with name="C" # - Its type has name="B" # - That type's type has name="A" params: list[str | bool] if len(subtypes_list) > 1: # Build nested joins for parent types parent_types = list(reversed(subtypes_list[:-1])) joins = [] where_clauses = ["t0.name = %s"] # Final record name params = [final_name] for i, parent_type_name in enumerate(parent_types): alias = f"t{i + 1}" prev_alias = f"t{i}" joins.append( f"INNER JOIN {table_name} {alias} ON {prev_alias}.type_id = {alias}.id" ) where_clauses.append(f"{alias}.name = %s") where_clauses.append(f"{alias}.is_type = %s") params.extend([parent_type_name, True]) join_clause = " ".join(joins) where_clause = " AND ".join(where_clauses) query = f""" SELECT t0.* FROM {table_name} t0 {join_clause} WHERE {where_clause} LIMIT 1 """ else: # Single type, no parent - type must be NULL query = f""" SELECT * FROM {table_name} WHERE name = %s AND type_id IS NULL LIMIT 1 """ params = [final_name] try: with connection.cursor() as cursor: cursor.execute(query, params) columns = [col[0] for col in cursor.description] rows = cursor.fetchall() if not rows: raise IntegrityError( f"No {registry.__name__} type found matching subtypes {subtypes_list} for field `.{field_str}`" ) if len(rows) > 1: raise IntegrityError( f"Multiple {registry.__name__} types found matching subtypes {subtypes_list} for field `.{field_str}`" ) # Create a dictionary from the row data row_dict = dict(zip(columns, rows[0])) # Create a minimal mock object with only the fields we need # This avoids querying the database which may not have all columns during migrations # We create a simple object and set its class to the registry for proper error messages type_record: SQLRecord = object.__new__(registry) type_record.id = row_dict.get("id") type_record.uid = row_dict.get("uid") type_record.name = row_dict.get("name") type_record.is_type = row_dict.get("is_type", False) # Initialize _state attribute needed by Django models # Create a minimal state object with the required attributes state = type("ModelState", (), {"adding": False, "db": "default"})() type_record._state = state except IntegrityError: raise except Exception as e: raise IntegrityError( f"Error retrieving {registry.__name__} type with subtypes {subtypes_list} for field `.{field_str}`: {e}" ) from e if not type_record.is_type: raise InvalidArgument( f"The resolved {type_record.__class__.__name__} '{type_record.name}' for field `.{field_str}` is not a type: is_type is False." 
) return type_record def dtype_as_object(dtype_str: str, old_format: bool = False) -> type | None: def _dtype_as_object_simple(dtype_str: str) -> type | None: if dtype_str == "str": return str elif dtype_str == "url": return str elif dtype_str == "int": return int elif dtype_str in ("float", "num"): return float elif dtype_str == "bool": return bool elif dtype_str == "date": from datetime import date return date elif dtype_str == "datetime": from datetime import datetime return datetime elif dtype_str.startswith("dict"): return dict return None if dtype_str is None: return None parsed_dtypes = parse_dtype(dtype_str, check_exists=True, old_format=old_format) if len(parsed_dtypes) > 0: dtype_objects = [] for parsed_dtype in parsed_dtypes: if parsed_dtype.get("record_uid"): # return the subtype record for dtypes with record_uid dtype_object = get_record_type_from_uid( parsed_dtype["registry"], parsed_dtype["record_uid"], ) elif parsed_dtype.get("subtypes_list"): dtype_object = get_record_type_from_nested_subtypes( parsed_dtype["registry"], parsed_dtype["subtypes_list"], parsed_dtype["field"], ) else: # return field for dtypes without record_uid, e.g. bt.CellType.ontology_id dtype_object = parsed_dtype["field"] # for list, returns list[SQLRecord] dtype_objects.append( list[dtype_object] # type: ignore if "list" in parsed_dtype and parsed_dtype["list"] else dtype_object ) return dtype_objects if len(dtype_objects) > 1 else dtype_objects[0] # type: ignore elif dtype_str.startswith("list["): # for simple lists, returns list[python_type] dtype_simple_object = _dtype_as_object_simple( dtype_str.removeprefix("list[").removesuffix("]") ) return ( list[dtype_simple_object] if dtype_simple_object is not None else list # type: ignore ) else: return _dtype_as_object_simple(dtype_str) def parse_cat_dtype( dtype_str: str, related_registries: dict[str, SQLRecord] | None = None, is_itype: bool = False, check_exists: bool = False, old_format: bool = False, ) -> dict[str, Any]: """Parses a categorical dtype string into its components (registry, field, subtypes).""" from .artifact import Artifact assert isinstance(dtype_str, str) # noqa: S101 if related_registries is None: related_registries = dict_module_name_to_model_name(Artifact) # Parse the string considering nested brackets parsed = parse_nested_brackets(dtype_str, old_format=old_format) registry_str = parsed["registry"] filter_str = parsed["filter_str"] field_str = parsed["field"] if not is_itype: if registry_str not in related_registries: raise ValidationError( f"'{registry_str}' is an invalid dtype, has to be registry, e.g. ULabel or bionty.CellType" ) registry = related_registries[registry_str] else: if "." 
in registry_str: registry_str_split = registry_str.split(".") assert len(registry_str_split) == 2, registry_str # noqa: S101 module_name_attempt, class_name = registry_str_split module_name = get_schema_module_name( module_name_attempt, raise_import_error=False ) if module_name is None: raise ModuleWasntConfigured( MODULE_WASNT_CONFIGURED_MESSAGE_TEMPLATE.format( module_name_attempt, module_name_attempt ) ) else: module_name, class_name = "lamindb", registry_str module = importlib.import_module(module_name) registry = getattr(module, class_name) if field_str == "": field_str = registry._name_field if hasattr(registry, "_name_field") else "name" assert hasattr(registry, field_str), f"{registry} has no field {field_str}" record_uid = parsed.get("record_uid") subtypes_list = parsed.get("subtypes_list") # Handle old format (subtypes_list) or new format (record_uid) if subtypes_list and check_exists: # Old format: validate that the Record exists using nested subtypes # subtypes_list is guaranteed to be list[str] when present if isinstance(subtypes_list, list): get_record_type_from_nested_subtypes( registry, cast(list[str], subtypes_list), field_str ) elif record_uid and check_exists: get_record_type_from_uid(registry, record_uid) if filter_str != "": # TODO: validate or process filter string pass result = { "registry": registry, # should be typed as CanCurate "registry_str": registry_str, "filter_str": filter_str, "field_str": field_str, "field": getattr(registry, field_str), } # Add record_uid if it exists (new format) if record_uid: result["record_uid"] = record_uid # Add subtypes_list if it exists (old format) if subtypes_list: result["subtypes_list"] = subtypes_list return result def parse_nested_brackets(dtype_str: str, old_format: bool = False) -> dict[str, Any]: """Parse dtype string with potentially nested brackets. Examples: "A" -> {"registry": "A", "filter_str": "", "field": ""} "A.field" -> {"registry": "A", "filter_str": "", "field": "field"} "Record[abcdefg123456]" -> {"registry": "Record", "filter_str": "", "field": "", "record_uid": "abcdefg123456"} "Record[abcdefg123456].name" -> {"registry": "Record", "filter_str": "", "field": "name", "record_uid": "abcdefg123456"} "bionty.Gene.ensembl_gene_id[source__id='abcd']" -> {"registry": "bionty.Gene", "filter_str": "source__id='abcd'", "field": "ensembl_gene_id"} Args: dtype_str: The dtype string to parse Returns: Dictionary with parsed components """ if "[" not in dtype_str: # No brackets - handle simple cases like "A" or "A.field" if "." in dtype_str: parts = dtype_str.split(".") if len(parts) == 2 and parts[1][0].isupper(): # bionty.CellType return {"registry": dtype_str, "filter_str": "", "field": ""} elif len(parts) == 3: # bionty.CellType.name return { "registry": f"{parts[0]}.{parts[1]}", "filter_str": "", "field": parts[2], } else: # ULabel.name return {"registry": parts[0], "filter_str": "", "field": parts[1]} else: # Simple registry name return {"registry": dtype_str, "filter_str": "", "field": ""} # Find the first opening bracket first_bracket = dtype_str.index("[") # Handle case where registry_part contains a field (e.g., "bionty.Gene.ensembl_gene_id[filters]") registry_and_field = dtype_str[:first_bracket] if "." 
in registry_and_field: parts = registry_and_field.split(".") if len(parts) == 3: registry_part = f"{parts[0]}.{parts[1]}" pre_bracket_field = parts[2] else: registry_part = registry_and_field pre_bracket_field = "" else: registry_part = registry_and_field pre_bracket_field = "" # Find the matching closing bracket for the first opening bracket bracket_count = 0 closing_bracket_pos = -1 for i in range(first_bracket, len(dtype_str)): if dtype_str[i] == "[": bracket_count += 1 elif dtype_str[i] == "]": bracket_count -= 1 if bracket_count == 0: closing_bracket_pos = i break if closing_bracket_pos == -1: raise ValueError(f"Unmatched brackets in dtype string: {dtype_str}") # Extract content between brackets bracket_content = dtype_str[first_bracket + 1 : closing_bracket_pos] # Check for field after the closing bracket field_part = "" remainder = dtype_str[closing_bracket_pos + 1 :] if remainder.startswith("."): field_part = remainder[1:] # Remove the dot # Use pre_bracket_field if no post_bracket field if not field_part and pre_bracket_field: field_part = pre_bracket_field # Extract UID, subtypes_list, or filter from bracket content # For UID-based format: Record[uid] or ULabel[uid] -> record_uid # For old name-based format: Record[Name] or Record[Parent[Child]] -> subtypes_list # For filter format: registry.field[filter] -> filter_str record_uid = None subtypes_list = None filter_str = "" # If registry is Record or ULabel, bracket content could be UID or name(s) if registry_part in ("Record", "ULabel"): if bracket_content: if old_format: # Old format with nested brackets like Record[Parent[Child]] extracted = extract_subtypes_and_filter(bracket_content) subtypes_list = extracted["subtypes_list"] filter_str = extracted["filter_str"] else: record_uid = bracket_content else: # For other registries, bracket content is a filter filter_str = bracket_content if bracket_content else "" result = { "registry": registry_part, "filter_str": filter_str, "field": field_part, } # Add record_uid if it exists (new format) if record_uid: result["record_uid"] = record_uid # Add subtypes_list if it exists (old format) if subtypes_list: result["subtypes_list"] = subtypes_list return result def extract_subtypes_and_filter(subtype_str: str) -> dict[str, Any]: """Extract nested subtypes and optional filter from a nested subtype string. 
Examples: "B" -> {"subtypes_list": ["B"], "filter_str": ""} "B[C]" -> {"subtypes_list": ["B", "C"], "filter_str": ""} "B[C[filter='']]" -> {"subtypes_list": ["B", "C"], "filter_str": "filter=''"} "B[C[D]]" -> {"subtypes_list": ["B", "C", "D"], "filter_str": ""} "B[C[D[E]]]" -> {"subtypes_list": ["B", "C", "D", "E"], "filter_str": ""} "B[filter='value']" -> {"subtypes_list": ["B"], "filter_str": "filter='value'"} "Customer[UScustomer[region='US']]" -> {"subtypes_list": ["Customer", "UScustomer"], "filter_str": "region='US'"} Args: subtype_str: The subtype string with potential nesting Returns: Dictionary with subtypes_list and filter_str """ subtypes: list[str] = [] filter_str = "" current = subtype_str while current: if "[" not in current: # No more brackets if current and "=" not in current: # It's a subtype name subtypes.append(current) elif current and "=" in current: # It's a filter filter_str = current break # Find the first part before the bracket bracket_pos = current.index("[") part = current[:bracket_pos] # Add the part (it's a subtype name) if part: subtypes.append(part) # Find the matching closing bracket bracket_count = 0 closing_pos = -1 for i in range(bracket_pos, len(current)): if current[i] == "[": bracket_count += 1 elif current[i] == "]": bracket_count -= 1 if bracket_count == 0: closing_pos = i break if closing_pos == -1: break # Move to the content inside the brackets current = current[bracket_pos + 1 : closing_pos] return {"subtypes_list": subtypes, "filter_str": filter_str} def serialize_dtype( dtype: Registry | SQLRecord | FieldAttr | list[SQLRecord] | list[Registry] | list[str] | list[float] | str | type, is_itype: bool = False, ) -> str: """Converts a data type object into its string representation.""" from .record import Record from .ulabel import ULabel # Handle generic types like list[str], list[Registry], etc. if hasattr(dtype, "__origin__") and dtype.__origin__ is list: # Get the inner type from list[T] inner_type = dtype.__args__[0] if dtype.__args__ else None # type: ignore if inner_type is not None: # Recursively serialize the inner type inner_dtype_str = serialize_dtype(inner_type, is_itype=is_itype) return f"list[{inner_dtype_str}]" if ( not isinstance(dtype, list) and hasattr(dtype, "__name__") and dtype.__name__ in FEATURE_DTYPES ): dtype_str = dtype.__name__ elif dtype is dict: dtype_str = "dict" elif is_itype and isinstance(dtype, str): if dtype not in "Feature": parse_cat_dtype( dtype_str=dtype, is_itype=True ) # throws an error if invalid dtype_str = dtype else: from pandas.core.dtypes.base import ExtensionDtype if isinstance(dtype, (ExtensionDtype, np.dtype)): dtype_str = serialize_pandas_dtype(dtype) else: error_message = "dtype has to be a registry, a ulabel subtype, a registry field, or a list of registries or fields, not {}" if isinstance(dtype, (Registry, DeferredAttribute, ULabel, Record)): dtype = [dtype] elif not isinstance(dtype, list): raise ValueError(error_message.format(dtype)) dtype_str = "" for one_dtype in dtype: if not isinstance( one_dtype, (Registry, DeferredAttribute, ULabel, Record) ): raise ValueError(error_message.format(one_dtype)) if isinstance(one_dtype, Registry): dtype_str += one_dtype.__get_name_with_module__() + "|" elif isinstance(one_dtype, (ULabel, Record)): if one_dtype._state.adding: raise InvalidArgument( f"Cannot serialize unsaved objects. Save {one_dtype} via `.save()`." ) if not one_dtype.is_type: raise InvalidArgument( f"Cannot serialize non-type {one_dtype.__class__.__name__} '{one_dtype.name}'. 
Only types (is_type=True) are allowed in dtypes." ) # Use UID-based format: Record[uid] instead of Record[Parent[Child]] nested_string = f"[{one_dtype.uid}]" if isinstance(one_dtype, ULabel): dtype_str += f"ULabel{nested_string}" else: dtype_str += f"Record{nested_string}" else: name = one_dtype.field.name field_ext = f".{name}" if name != "name" else "" dtype_str += ( one_dtype.field.model.__get_name_with_module__() + field_ext + "|" ) dtype_str = dtype_str.rstrip("|") if not is_itype: dtype_str = f"cat[{dtype_str}]" return dtype_str def serialize_pandas_dtype(pandas_dtype: ExtensionDtype) -> str: """Convert pandas ExtensionDtype to simplified string representation.""" from pandas.api.types import CategoricalDtype, is_string_dtype if is_string_dtype(pandas_dtype): if not isinstance(pandas_dtype, CategoricalDtype): dtype = "str" else: dtype = "cat[ULabel]" # there are string-like categoricals and "pure" categoricals (pd.Categorical) elif isinstance(pandas_dtype, CategoricalDtype): dtype = "cat[ULabel]" else: # strip precision qualifiers dtype = "".join(dt for dt in pandas_dtype.name if not dt.isdigit()) if dtype == "uint": dtype = "int" if dtype.startswith("datetime"): dtype = dtype.split("[")[0] if dtype != "cat[ULabel]": assert dtype in FEATURE_DTYPES # noqa: S101 return dtype def convert_to_pandas_dtype(lamin_dtype: str) -> str | pd.CategoricalDtype: """Convert LaminDB simplified string representation back to pandas dtype.""" from pandas.api.types import CategoricalDtype dtype_map = { "str": "string", # nullable string dtype "url": "string", # URLs are validated as strings "int": "Int64", # Nullable integer to handle missing values "num": "float64", "float": "float64", "bool": "boolean", # Nullable boolean "datetime": "datetime64[ns]", "date": "object", # preserve Date objects "dict": "object", # dicts are stored as object dtype in pandas } if lamin_dtype in dtype_map: return dtype_map[lamin_dtype] elif lamin_dtype.startswith("cat"): return CategoricalDtype() elif lamin_dtype.startswith("list"): return "object" # lists are stored as object dtype in pandas return lamin_dtype def parse_filter_string(filter_str: str) -> dict[str, tuple[str, str | None, str]]: """Parse comma-separated Django filter expressions into structured components. Args: filter_str: Comma-separated filters like 'name=value, relation__field=value' Returns: Dict mapping original filter key to (relation_name, field_name, value) tuple. For direct fields: field_name is None. For relations: field_name contains the lookup field. """ filters = {} filter_parts = [part.strip() for part in filter_str.split(",")] for part in filter_parts: if "=" not in part: raise ValueError(f"Invalid filter expression: '{part}' (missing '=' sign)") key, value = part.split("=", 1) key = key.strip() value = value.strip().strip("'\"") if not key: raise ValueError(f"Invalid filter expression: '{part}' (empty key)") if not value: raise ValueError(f"Invalid filter expression: '{part}' (empty value)") if "__" in key: relation_name, field_name = key.split("__", 1) filters[key] = (relation_name, field_name, value) else: filters[key] = (key, None, value) return filters def resolve_relation_filters( parsed_filters: dict[str, tuple[str, str | None, str]], registry: SQLRecord ) -> dict[str, str | SQLRecord]: """Resolve relation filters actual model objects. 
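    A minimal sketch of the expected flow (registry and filter values are illustrative)::

        parsed = parse_filter_string("name=influenza, source__name=mondo-2024")
        # {'name': ('name', None, 'influenza'),
        #  'source__name': ('source', 'name', 'mondo-2024')}
        resolved = resolve_relation_filters(parsed, bt.Disease)
        # direct field filters pass through unchanged; relation filters are replaced
        # by the related record, e.g. {'name': 'influenza', 'source': <Source record>}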
Args: parsed_filters: Django filters like output from :func:`lamindb.models.feature.parse_filter_string` registry: Model class to resolve relationships against Returns: Dict with resolved objects for successful relations, original values for direct fields and failed resolutions. """ resolved = {} for filter_key, (relation_name, field_name, value) in parsed_filters.items(): if field_name is not None: # relation filter if hasattr(registry, relation_name): relation_field = getattr(registry, relation_name) if ( hasattr(relation_field, "field") and relation_field.field.is_relation ): related_model = relation_field.field.related_model related_obj = related_model.get(**{field_name: value}) resolved[relation_name] = related_obj else: resolved[filter_key] = value return resolved def migrate_dtype_to_uid_format(connection, input_field: str = "_dtype_str") -> None: """Update _dtype_str for nested Record/ULabel types to uid format. Converts old format (name-based) dtype strings to new UID-based format. This function is used in migrations to update existing feature records. Args: connection: Database connection (from schema_editor.connection) input_field: Field name to read from ("_dtype_str" or "dtype") Returns: None. Updates are performed directly in the database. """ # Patterns to look for old format (name-based) patterns = [ "cat[Record[", "cat[ULabel[", "list[cat[Record[", "list[cat[ULabel[", ] # Build SQL query to fetch features matching any pattern # Using OR conditions for each pattern pattern_conditions = " OR ".join( [f"{input_field} LIKE '{pattern}%'" for pattern in patterns] ) query = f""" SELECT id, uid, name, {input_field} FROM lamindb_feature WHERE {pattern_conditions} """ # Fetch matching features with connection.cursor() as cursor: cursor.execute(query) columns = [col[0] for col in cursor.description] features = [dict(zip(columns, row)) for row in cursor.fetchall()] # Convert each feature for feature in features: try: # Convert old format string to objects, then serialize to UID format dtype_objects = dtype_as_object(feature[input_field], old_format=True) new_dtype_str = serialize_dtype(dtype_objects) if new_dtype_str != feature[input_field]: # Update using raw SQL update_query = """ UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s """ with connection.cursor() as cursor: cursor.execute(update_query, [new_dtype_str, feature["id"]]) except Exception as e: # If conversion fails, keep the original value print( f"Warning: Could not convert dtype for feature {feature['name']} ({feature['uid']}) because of error: {e}" ) continue def process_init_feature_param(args, kwargs): # now we proceed with the user-facing constructor if len(args) != 0: raise ValueError("Only keyword args allowed") name: str = kwargs.pop("name", None) dtype: type | str | None = kwargs.pop("dtype", None) is_type: bool = kwargs.pop("is_type", False) type_: Feature | str | None = kwargs.pop("type", None) description: str | None = kwargs.pop("description", None) branch = kwargs.pop("branch", None) branch_id = kwargs.pop("branch_id", 1) space = kwargs.pop("space", None) space_id = kwargs.pop("space_id", 1) _skip_validation = kwargs.pop("_skip_validation", False) if kwargs: valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Feature)]) raise FieldValidationError(f"Only {valid_keywords} are valid keyword arguments") kwargs["name"] = name kwargs["type"] = type_ kwargs["is_type"] = is_type kwargs["branch"] = branch kwargs["branch_id"] = branch_id kwargs["space"] = space kwargs["space_id"] = space_id 
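    # re-attach the validated keyword arguments for the base SQLRecord constructor;
    # dtype is handled separately below and stored as the serialized string field `_dtype_str`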
kwargs["_skip_validation"] = _skip_validation kwargs["description"] = description # cast dtype if dtype is None and not is_type: raise ValidationError( f"Please pass dtype, one of {FEATURE_DTYPES} or a composed categorical dtype" ) dtype_str = None if dtype is not None: if not isinstance(dtype, str): dtype_str = serialize_dtype(dtype) elif dtype in {"num", "path", "url"}: dtype_str = dtype else: logger.warning( f"rather than passing a string '{dtype}' to dtype, consider passing a Python object" ) dtype_str = dtype parse_dtype(dtype_str, check_exists=True, old_format=True) if dtype_str.startswith( ("cat[Record[", "cat[ULabel[", "list[cat[Record[", "list[cat[ULabel[") ): # need to convert from old semantic format to new uid-based format dtype_str = serialize_dtype(dtype_as_object(dtype_str, old_format=True)) kwargs["_dtype_str"] = dtype_str return kwargs UPDATE_FEATURE_ON_NAME_CHANGE = """\ DECLARE old_renamed JSONB; new_renamed JSONB; ts TEXT; BEGIN -- Only proceed if name actually changed IF OLD.name IS DISTINCT FROM NEW.name THEN -- Update synonyms IF NEW.synonyms IS NULL OR NEW.synonyms = '' THEN NEW.synonyms := OLD.name; ELSIF position(OLD.name in NEW.synonyms) = 0 THEN NEW.synonyms := NEW.synonyms || '|' || OLD.name; END IF; -- Update _aux with rename history ts := TO_CHAR(NOW() AT TIME ZONE 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS"Z"'); -- Get existing renamed history or initialize empty object old_renamed := COALESCE((OLD._aux->>'renamed')::JSONB, '{}'::JSONB); -- Add old name with timestamp new_renamed := old_renamed || jsonb_build_object(ts, OLD.name); -- Update _aux with new renamed history IF NEW._aux IS NULL THEN NEW._aux := jsonb_build_object('renamed', new_renamed); ELSE NEW._aux := NEW._aux || jsonb_build_object('renamed', new_renamed); END IF; END IF; RETURN NEW; END; """ class Feature(SQLRecord, HasType, CanCurate, TracksRun, TracksUpdates): """Measurable properties such as dataframe columns or record fields. Features represent *what* is measured in a dataset—the variables or dimensions along which data is organized. They enable you to query datasets based on their structure and corresponding label annotations. Args: name: `str` Name of the feature, typically a column name. dtype: `type | ULabel | Record | DtypeStr | Registry | list[Registry] | FieldAttr` Types or `ULabel` or `Record` objects representing types. See :class:`~lamindb.base.types.DtypeStr`. type: `Feature | None = None` A feature type, see :attr:`~lamindb.Feature.type`. is_type: `bool = False` Whether this feature is a type, see :attr:`~lamindb.Feature.is_type`. unit: `str | None = None` Unit of measure, ideally SI (`"m"`, `"s"`, `"kg"`, etc.) or `"normalized"` etc. description: `str | None = None` A description. synonyms: `str | None = None` Bar-separated synonyms. nullable: `bool = True` Whether the feature can have null-like values (`None`, `pd.NA`, `NaN`, etc.), see :attr:`~lamindb.Feature.nullable`. default_value: `Any | None = None` Default value for the feature. coerce: `bool | None = None` When `True`, attempts to coerce values to the specified dtype during validation, see :attr:`~lamindb.Feature.coerce`. Defaults to `False` unless `is_type` is `True`. cat_filters: `dict[str, str | SQLRecord] | None = None` Subset a registry by additional filters to define valid categories. Note: For more control, you can use :mod:`bionty` registries to manage simple biological entities like genes, proteins & cell markers. Or you define custom registries to manage high-level derived features like gene sets. 
See Also: :meth:`~lamindb.Feature.from_dataframe` Create feature records from DataFrame. :attr:`~lamindb.Artifact.features` Feature manager of an artifact or collection. :class:`~lamindb.ULabel` Universal labels. :class:`~lamindb.Schema` Sets of features. Example: Features with simple data types:: ln.Feature(name="sample_note", dtype=str).save() ln.Feature(name="temperature_in_celsius", dtype=float).save() ln.Feature(name="read_count", dtype=int).save() A categorical feature measuring labels managed in the `ULabel` registry:: ln.Feature(name="sample", dtype=ln.ULabel).save() Restrict a categorical feature to a specific `ULabel` type:: perturbation = ln.ULabel(name="Perturbation", is_type=True).save() ln.Feature(name="perturbation", dtype=perturbation).save() Restrict a categorical feature to a specific `Record` type:: experiment = ln.Record(name="Experiment", is_type=True).save() ln.Feature(name="experiment", dtype=experiment).save() Restrict a categorical feature to the `bt.CellType` registry:: ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save() # expert annotation ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save() # model annotation .. admonition:: Categoricals define relationships. In LaminDB, **categoricals** define **relationships**. For example, with dtype set to a `ULabel` type, setting a feature value relates the object to a `ULabel` of that type. Scope a feature with a **feature type** to distinguish the same feature name across different contexts:: abc_feature_type = ln.Feature(name="ABC", is_type=True).save() # ABC could reference a schema, a project, a team, etc. ln.Feature(name="concentration_nM", dtype=float, type=abc_feature_type).save() xyz_feature_type = ln.Feature(name="XYZ", is_type=True).save() # XYZ could reference a schema, a project, a team, etc. ln.Feature(name="concentration_nM", dtype=float, type=xyz_feature_type).save() # calling .save() again with the same name and type returns the existing feature ln.Feature(name="concentration_nM", dtype=float, type=xyz_feature_type).save() Annotate an artifact with features (works identically for records and runs):: artifact.features.set_values({ "temperature_in_celsius": 37.5, "sample_note": "Control sample", }) Query artifacts/records/runs by features:: ln.Artifact.filter(features__name="temperature_in_celsius") # artifacts with this feature ln.Artifact.filter(temperature_in_celsius__gt=37) # artifacts where temperature > 37 Disambiguate duplicate feature names by querying with a `Feature` object:: feature = ln.Feature.get(name="my_ambig_name", type__name="my_feature_type") ln.Artifact.filter(feature == "hello") # instead of my_ambig_name="hello" A list dtype:: ln.Feature( name="cell_types", dtype=list[bt.CellType], # or list[str] for a list of strings ).save() A path feature:: ln.Feature( name="image_path", dtype="path", # will be validated as `str` ).save() Restrict categories via filters:: # restrict diseases to those matching a specific ontology version source = bt.Source.get(name="My ontology") # a registry for ontology versions ln.Feature( name="disease", dtype=bt.Disease, cat_filters={"source": source}, ).save() # restrict artifacts to those matching a specific schema schema = ln.Schema.get(name="my-schema") ln.Feature( name="valid_artifact", dtype=ln.Artifact, cat_filters={"schema": schema}, ).save() A feature accepting multiple categorical types - a union type:: ln.Feature( name="cell_types", dtype="cat[bionty.Tissue.ontology_id|bionty.CellType.ontology_id]" ).save() .. 
dropdown:: What is the difference between features and labels? 1. A feature qualifies what is measured, i.e., a numerical or categorical random variable 2. A label *is* a measured value of a categorical variable, i.e., a category Example: When annotating a dataset that measures expression of 30k genes, the gene identifiers serve as feature identifiers, and the features are expression measurements for these genes. When annotating a dataset whose experiment knocked out 3 specific genes, those genes serve as labels of the dataset. Re-shaping data can introduce ambiguity among features & labels. If this happened, ask yourself what the joint measurement was: a feature qualifies variables in a joint measurement. The canonical data matrix lists jointly measured variables in the columns. """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" if ( django_settings.DATABASES.get("default", {}).get("ENGINE") == "django.db.backends.postgresql" ): triggers = [ pgtrigger.Trigger( name="update_feature_on_name_change", operation=pgtrigger.Update, when=pgtrigger.Before, condition=pgtrigger.Condition("OLD.name IS DISTINCT FROM NEW.name"), func=UPDATE_FEATURE_ON_NAME_CHANGE, ), ] constraints = [ models.CheckConstraint( condition=models.Q(is_type=True) | models.Q(_dtype_str__isnull=False), name="feature_dtype_str_not_null_when_is_type_false", ), # also see raw SQL constraints for `is_type` and `type` FK validity in migrations ] # Keep Django model hash/equality semantics for model identity use-cases. __hash__ = SQLRecord.__hash__ _name_field: str = "name" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=12, default=base62_12 ) """Universal id, valid across DB instances.""" name: str = CharField(max_length=150, db_index=True) """Name of feature.""" _dtype_str: DtypeStr | str | None = CharField(db_index=True, null=True) """The string-serialized data type (:class:`~lamindb.base.types.DtypeStr`). Note that mutating this field currently does not trigger re-validation of existing values. """ type: Feature | None = ForeignKey( "self", PROTECT, null=True, related_name="features" ) """Type of feature (e.g., 'Readout', 'Metric', 'Metadata', 'ExpertAnnotation', 'ModelPrediction'). Allows to group features by type, e.g., all read outs, all metrics, etc. """ features: Feature """Features of this type (can only be non-empty if `is_type` is `True`).""" unit: str | None = CharField(max_length=30, db_index=True, null=True) """Unit of measure, ideally SI (`m`, `s`, `kg`, etc.) or 'normalized' etc. (optional).""" description: str | None = TextField(null=True) """A description.""" array_rank: int = models.SmallIntegerField(default=0, db_index=True) """Rank of feature. Number of indices of the array: 0 for scalar, 1 for vector, 2 for matrix. Is called `.ndim` in `numpy` and `pytorch` but shouldn't be confused with the dimension of the feature space. """ array_size: int = models.IntegerField(default=0, db_index=True) """Number of elements of the feature. Total number of elements (product of shape components) of the array. - A number or string (a scalar): 1 or `None` - A 50-dimensional embedding: 50 - A 25 x 25 image: 625 """ array_shape: list[int] | None = JSONField(default=None, db_default=None, null=True) """Shape of the feature. 
- A number or string (a scalar): [1] or `None` - A 50-dimensional embedding: [50] - A 25 x 25 image: [25, 25] Is stored as a list rather than a tuple because it's serialized as JSON. """ synonyms: str | None = TextField(null=True) """Bar-separated (|) synonyms (optional).""" default_value: Any | None = JSONField(null=True, default=None) """A default value that overwrites missing values during standardization.""" nullable: bool | None = BooleanField(null=True, default=None) """Whether the feature can have nullable values. None for type-like features.""" coerce: bool | None = BooleanField(null=True, default=None) """Whether dtypes should be coerced during validation. None for type-like features.""" # we define the below ManyToMany on the Feature model because it parallels # how other registries (like Gene, Protein, etc.) relate to Schema schemas: RelatedManager[Schema] = models.ManyToManyField( "Schema", through="SchemaFeature", related_name="features" ) """Schemas linked to this feature.""" values: RelatedManager[JsonValue] """Values for this feature.""" projects: RelatedManager[Project] """Annotating projects.""" ablocks: RelatedManager[FeatureBlock] """Attached blocks ← :attr:`~lamindb.FeatureBlock.feature`.""" @overload def __init__( self, name: str, dtype: DtypeStr | ULabel | Record | Registry | list[Registry] | FieldAttr, type: Feature | None = None, is_type: bool = False, unit: str | None = None, description: str | None = None, synonyms: str | None = None, nullable: bool | None = None, default_value: Any | None = None, coerce: bool | None = None, cat_filters: dict[str, str] | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None default_value = kwargs.pop("default_value", None) nullable = kwargs.pop("nullable", None) # Default nullable to True for non-type features is_type = kwargs.get("is_type", False) if nullable is None and not is_type: nullable = True cat_filters = kwargs.pop("cat_filters", None) if "coerce_dtype" in kwargs: warnings.warn( "`coerce_dtype` argument was renamed to `coerce` and will be removed in a future release.", DeprecationWarning, stacklevel=2, ) coerce = kwargs.pop("coerce_dtype") else: coerce = kwargs.pop("coerce", None) kwargs = process_init_feature_param(args, kwargs) super().__init__(*args, **kwargs) self.default_value = default_value self.nullable = nullable self.coerce = coerce dtype_str = kwargs.pop("_dtype_str", None) if dtype_str == "cat": warnings.warn( "dtype `cat` is deprecated and will be removed in the future - " "please use `ln.Record` or `ln.ULabel` instead", DeprecationWarning, stacklevel=2, ) if cat_filters: if "|" in dtype_str: raise ValidationError( f"cat_filters are incompatible with union dtypes: '{dtype_str}'" ) if "]]" in dtype_str: raise ValidationError( f"cat_filters are incompatible with nested dtypes: '{dtype_str}'" ) # Validate filter values and SQLRecord attributes for filter_key, filter_value in cat_filters.items(): if not filter_value or ( isinstance(filter_value, str) and not filter_value.strip() ): raise ValidationError(f"Empty value in filter {filter_key}") # Check SQLRecord attributes for relation lookups if isinstance(filter_value, SQLRecord) and "__" in filter_key: field_name = filter_key.split("__", 1)[1] if not hasattr(filter_value, field_name): raise ValidationError( f"SQLRecord {filter_value.__class__.__name__} has no attribute '{field_name}' in filter {filter_key}" ) # If a 
SQLRecord is passed, we access its uid to apply a standard filter cat_filters = { f"{key}__uid" if ( is_sqlrecord := isinstance(filter, SQLRecord) and hasattr(filter, "uid") ) else key: filter.uid if is_sqlrecord else filter for key, filter in cat_filters.items() } fill_in = ", ".join( f"{key}='{value}'" for (key, value) in cat_filters.items() ) dtype_str = dtype_str.replace("]", f"[{fill_in}]]") self._dtype_str = dtype_str if not self._state.adding: if self._dtype_str != dtype_str: raise ValidationError( f"Feature {self.name} already exists with dtype {self._dtype_str}, you passed {dtype_str}" ) def __eq__(self, other: object) -> bool: # Preserve model identity semantics only for Feature-to-Feature comparisons. if isinstance(other, Feature): return super().__eq__(other) # Runtime returns a predicate object for query composition. # Cast keeps mypy-compatible override with object.__eq__ -> bool. return cast(bool, FeaturePredicate(self, "", other)) def __ne__(self, other: object) -> bool: # Preserve model identity semantics only for Feature-to-Feature comparisons. if isinstance(other, Feature): return not super().__eq__(other) # Runtime returns a predicate object for query composition. # Cast keeps mypy-compatible override with object.__ne__ -> bool. return cast(bool, FeaturePredicate(self, "__ne", other)) def __gt__(self, value: Any) -> FeaturePredicate: return FeaturePredicate(self, "__gt", value) def __ge__(self, value: Any) -> FeaturePredicate: return FeaturePredicate(self, "__gte", value) def __lt__(self, value: Any) -> FeaturePredicate: return FeaturePredicate(self, "__lt", value) def __le__(self, value: Any) -> FeaturePredicate: return FeaturePredicate(self, "__lte", value) # manually sync this docstring across all other children of HasType def query_features(self) -> QuerySet: """Query features of sub types. While `.features` retrieves the features with the current type, this method also retrieves sub types and the features with sub types of the current type. """ return _query_relatives([self], "features") # type: ignore @classmethod def from_dataframe( cls, df: pd.DataFrame, field: FieldAttr | None = None, *, mute: bool = False ) -> SQLRecordList: """Create Feature records for dataframe columns. 
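        A usage sketch (column names are illustrative): categorical columns are typed as
        ``cat[ULabel]``, while the remaining columns get their dtype inferred from the
        pandas dtype (e.g. float columns become ``float``, plain string columns ``str``)::

            df = pd.DataFrame(
                {
                    "temperature_in_celsius": [37.2, 36.8],
                    "treatment": pd.Categorical(["drug_a", "drug_b"]),
                }
            )
            features = ln.Feature.from_dataframe(df)  # returns unsaved Feature records
            ln.save(features)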
Args: df: Source DataFrame to extract column information from field: FieldAttr for Feature model validation, defaults to Feature.name mute: Whether to mute Feature creation similar names found warnings """ from lamindb.models import ULabel field = Feature.name if field is None else field registry = field.field.model # type: ignore if registry != Feature: raise ValueError("field must be a Feature FieldAttr!") categoricals = categoricals_from_df(df) dtypes: dict[str, type | SQLRecord | FieldAttr] = {} for name, col in df.items(): if name in categoricals: dtypes[name] = ULabel else: dtype_str = serialize_pandas_dtype(col.dtype) dtypes[name] = dtype_as_object(dtype_str) if mute: original_verbosity = logger._verbosity logger.set_verbosity(0) try: features = [ Feature(name=name, dtype=dtype) for name, dtype in dtypes.items() ] # type: ignore assert len(features) == len(df.columns) # noqa: S101 return SQLRecordList(features) finally: if mute: logger.set_verbosity(original_verbosity) @classmethod @deprecated("from_dataframe") def from_df( cls, df: pd.DataFrame, field: FieldAttr | None = None, *, mute: bool = False ) -> SQLRecordList: return cls.from_dataframe(df, field, mute=mute) @classmethod def from_dict( cls, dictionary: dict[str, Any], field: FieldAttr | None = None, *, type: Feature | None = None, mute: bool = False, ) -> SQLRecordList: """Create Feature records for dictionary keys. Args: dictionary: Source dictionary to extract key information from field: FieldAttr for Feature model validation, defaults to `Feature.name` type: Feature type of all created features mute: Whether to mute dtype inference and feature creation warnings """ from lamindb.models._feature_manager import infer_convert_dtype_key_value field = Feature.name if field is None else field registry = field.field.model # type: ignore if registry != Feature: raise ValueError("field must be a Feature FieldAttr!") dtypes = {} for key, value in dictionary.items(): dtype, _, message = infer_convert_dtype_key_value(key, value, mute=mute) if dtype == "cat ? str": dtype = "str" elif dtype == "list[cat ? 
str]": dtype = "list[str]" dtypes[key] = dtype if mute: original_verbosity = logger._verbosity logger.set_verbosity(0) try: features = [ Feature(name=key, dtype=dtype, type=type) for key, dtype in dtypes.items() ] # type: ignore assert len(features) == len(dictionary) # noqa: S101 return SQLRecordList(features) finally: if mute: logger.set_verbosity(original_verbosity) def save(self, *args, **kwargs) -> Feature: """Save the feature to the instance.""" super().save(*args, **kwargs) return self def with_config(self, optional: bool | None = None) -> tuple[Feature, dict]: """Pass addtional configurations to the schema.""" if optional is not None: return self, {"optional": optional} return self, {} @property @deprecated("coerce") def coerce_dtype(self) -> bool | None: """Alias for coerce (backward compatibility).""" return self.coerce @coerce_dtype.setter def coerce_dtype(self, value: bool | None) -> None: self.coerce = value @property @deprecated("dtype_as_str") def dtype(self) -> str | None: """The `dtype` as a string.""" if self._dtype_str is None: return None if self._dtype_str.startswith( ("cat[Record[", "cat[ULabel[", "list[cat[Record[", "list[cat[ULabel[") ): if self._dtype_str.startswith("list["): dtype_str = self._dtype_str.replace("list[", "")[:-1] else: dtype_str = self._dtype_str record_object = dtype_as_object(dtype_str) nested_string = f"[{record_object.name}]" # type: ignore for t in record_object.query_types(): # type: ignore nested_string = f"[{t.name}{nested_string}]" return self._dtype_str.replace(f"[{record_object.uid}]", nested_string) # type: ignore else: return self._dtype_str @property def dtype_as_str(self) -> DtypeStr | str | None: """The `dtype` as a string. You can query by this property as if it was a string field. The query is delegated to the private `_dtype_str` field. Is `None` if `Feature` if `is_type=True`, otherwise a string. Examples: Query by `dtype_as_str`:: ln.Feature.filter(dtype_as_str="float").to_dataframe() Examples for `dtype_as_str`:: feature_float = ln.Feature(name="measurement", dtype=float).save() assert feature_float.dtype_as_str == "float" sample_type = bt.Record(name="Sample", is_type=True).save() feature_sample = ln.Feature(name="sample", dtype=sample_type).save() assert feature_sample.dtype_as_str == "cat[Record[12345678abcdeFGHI]] # uid of type record feature_list_float = ln.Feature(name="numbers", dtype=list[float]).save() assert feature_list_float.dtype_as_str == "list[float]" feature_ulabel = ln.Feature(name="sample", dtype=ln.ULabel).save() assert feature_ulabel.dtype_as_str == "cat[ULabel]" feature_record = ln.Feature(name="sample", dtype=bt.CellLine).save() assert feature_record.dtype_as_str == "cat[bionty.CellLine]" feature_list_record = ln.Feature(name="cell_types", dtype=list[bt.CellLine]).save() assert feature_list_record.dtype_as_str == "list[cat[bionty.CellLine]]" """ return self._dtype_str @property def dtype_as_object(self) -> type | SQLRecord | FieldAttr | None: # type: ignore """The `dtype` as an object. 
        Example:
            For simple dtypes, returns the built-in Python type::

                feature_float = ln.Feature(name="measurement", dtype=float).save()
                assert feature_float.dtype_as_object is float

            For features with `Record` or `ULabel` types, returns the `Record` or `ULabel` object::

                sample_type = ln.Record(name="Sample", is_type=True).save()
                feature_sample = ln.Feature(name="sample", dtype=sample_type).save()
                assert feature_sample.dtype_as_object == sample_type

            For features with `Registry` types, returns the `Registry` object or a field (`DeferredAttribute`) object::

                feature_cell_type = ln.Feature(name="cell_type_name", dtype=bt.CellType).save()
                assert feature_cell_type.dtype_as_object == bt.CellType

                feature_ontology_id = ln.Feature(name="ontology_id", dtype=bt.CellType.ontology_id).save()
                assert feature_ontology_id.dtype_as_object == bt.CellType.ontology_id
        """
        return dtype_as_object(self._dtype_str)

    # we'll enable this later
    # @property
    # def observational_unit(self) -> Literal["Artifact", "Observation"]:
    #     """Default observational unit on which the feature is measured.
    #     Currently, we only make a distinction between artifact-level and observation-level features.
    #     For example, a feature `"ml_split"` that stores `"test"` & `"train"` labels is typically defined on the artifact level.
    #     When accessing `artifact.features.get_values(["ml_split"])`, you expect a single value, either `"test"` or `"train"`.
    #     However, when accessing an artifact annotation with a feature that's defined on the observation-level, say `"cell_type"`, you expect a set of values. So,
    #     `artifact.features.get_values(["cell_type_from_expert"])` should return a set: `{"T cell", "B cell"}`.
    #     The value of `observational_unit` is currently auto-managed: if using `artifact.features.set_values()`,
    #     it will be set to `Artifact`. In a curator, the value depends on whether it's an artifact- or observation-level slot
    #     (e.g. `.uns` is artifact-level in `AnnData` whereas `.obs` is observation-level).
    #     Note: This attribute might in the future be used to distinguish different types of observational units (e.g. single cells vs. physical samples vs. study subjects etc.).
    #     """
    #     if self._expect_many:
    #         return "Observation"  # this here might be replaced with the specific observational unit
    #     else:
    #         return "Artifact"


class JsonValue(SQLRecord, TracksRun):
    """JSON values for annotating artifacts and runs.

    Categorical values are stored in their respective registries: :class:`~lamindb.ULabel`, :class:`~bionty.CellType`, etc.

    Unlike for `ULabel`, in `JsonValue`, values are grouped by features and not by an ontological hierarchy.
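    A usage sketch (feature name and value are illustrative)::

        settings = ln.Feature(name="run_settings", dtype=dict).save()
        value, existed = JsonValue.get_or_create(settings, {"learning_rate": 0.001, "epochs": 10})
        # `existed` is True if an equal (feature, value) pair was already stored;
        # equality is checked via a hash of the dict rather than the JSON payload itself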
""" # we do not have a unique constraint on feature & value because it leads to hashing errors # for large dictionaries: https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0000 # we do not hash values because we have `get_or_create` logic all over the place # and also for checking whether the (feature, value) combination exists # there does not seem an issue with querying for a dict-like value # https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0001 _name_field: str = "value" feature: Feature | None = ForeignKey( Feature, CASCADE, null=True, related_name="values", default=None ) """The dimension metadata.""" value: Any = models.JSONField() """The JSON-like value.""" hash: str = CharField(max_length=HASH_LENGTH, null=True, db_index=True) """Value hash.""" artifacts: Artifact """Artifacts annotated with this feature value.""" runs: Run """Runs annotated with this feature value.""" class Meta(BaseSQLRecord.Meta, TracksRun.Meta): app_label = "lamindb" unique_together = ("feature", "hash") @classmethod def get_or_create(cls, feature, value): # simple values: (int, float, str, bool, datetime) if not isinstance(value, dict): hash = hash_string(str(value)) else: hash = hash_dict(value) try: return ( cls.objects.create(feature=feature, value=value, hash=hash), False, ) except DjangoIntegrityError: return cls.objects.get(feature=feature, hash=hash), True def suggest_categorical_for_str_iterable( iterable: Iterable[str], key: str = None ) -> str: import pandas as pd c = pd.Categorical(iterable) message = "" if len(c.categories) < len(c): if key != "": key_note = f" for feature {key}" else: key_note = "" message = f"You have few permissible values{key_note}, consider dtype 'cat' instead of 'str'" return message def categoricals_from_df(df: pd.DataFrame) -> dict: """Returns categorical columns.""" from pandas.api.types import CategoricalDtype, is_string_dtype string_cols = [col for col in df.columns if is_string_dtype(df[col])] categoricals = { col: df[col] for col in df.columns if isinstance(df[col].dtype, CategoricalDtype) } for key in string_cols: message = suggest_categorical_for_str_iterable(df[key], key) if message: logger.warning(message) return categoricals ================================================ FILE: lamindb/models/has_parents.py ================================================ # ruff: noqa: TC004 from __future__ import annotations import builtins from typing import TYPE_CHECKING, Literal import lamindb_setup as ln_setup from lamin_utils import logger from ..errors import ValidationError from .query_set import SQLRecordList, get_default_branch_ids from .run import Run from .sqlrecord import HasType, format_field_value, get_name_field if TYPE_CHECKING: from graphviz import Digraph from lamindb.base.types import StrField from .artifact import Artifact from .collection import Collection from .query_set import BasicQuerySet, QuerySet from .sqlrecord import SQLRecord LAMIN_GREEN_LIGHTER = "#10b981" LAMIN_GREEN_DARKER = "#065f46" TRANSFORM_VIOLET = "#eff2ff" GREEN_FILL = "honeydew" is_run_from_ipython = getattr(builtins, "__IPYTHON__", False) # this is optimized to have fewer recursive calls # also len of QuerySet can be costly at times def _query_relatives( records: BasicQuerySet | list[HasParents], attr: Literal["children", "parents"] | str, ) -> QuerySet: branch_ids = get_default_branch_ids() if hasattr(records, "values_list"): model = records.model # type: ignore using_db = records.db # type: ignore frontier_ids = set(records.values_list("id", flat=True)) else: record 
= records[0] model = record.__class__ using_db = record._state.db # type: ignore frontier_ids = {r.id for r in records} # type: ignore if attr == "children": attr_filter = "parents__id__in" elif attr == "parents": attr_filter = "children__id__in" else: attr_filter = "type__id__in" seen_ids = set(frontier_ids) # copies results = set() while frontier_ids: relatives_qs = model.connect(using_db).filter( branch_id__in=branch_ids, **{attr_filter: frontier_ids} ) next_ids = set(relatives_qs.values_list("id", flat=True)) - seen_ids if not next_ids: break results.update(next_ids) seen_ids.update(next_ids) frontier_ids = next_ids return model.connect(using_db).filter(id__in=results) def keep_topmost_matches(records: list[HasType] | SQLRecordList) -> SQLRecordList: """Keep only the topmost (least specific) match.""" if not records: return SQLRecordList([]) # Group by name records_by_name: dict[str, list[HasType]] = {} for record in records: if record.name not in records_by_name: records_by_name[record.name] = [] records_by_name[record.name].append(record) # Fast path: single match per name result: SQLRecordList = SQLRecordList([]) needs_depth_computation = {} for name, name_records in records_by_name.items(): if len(name_records) == 1: result.append(name_records[0]) else: # Check if any have type_id=None (trivially topmost) root_records = [r for r in name_records if r.type_id is None] if len(root_records) == 1: result.append(root_records[0]) elif len(root_records) > 1: class_name = records[0].__class__.__name__ raise ValidationError( f"Ambiguous match for {class_name} '{name}': found {len(root_records)} " f"root-level {class_name.lower()}s" ) else: # All have type_id, need depth computation needs_depth_computation[name] = name_records # Only compute depths if necessary if needs_depth_computation: def get_depth(record): current_type = record.type depth = 1 while current_type.type_id is not None: current_type = current_type.type depth += 1 return depth for name, name_records in needs_depth_computation.items(): records_with_depth = [(r, get_depth(r)) for r in name_records] min_depth = min(depth for _, depth in records_with_depth) topmost = [r for r, depth in records_with_depth if depth == min_depth] class_name = records[0].__class__.__name__ if len(topmost) > 1: raise ValidationError( f"Ambiguous match for {class_name} '{name}': found {len(topmost)} {class_name.lower()}s " f"at depth {min_depth} (under types: {[r.type.name for r in topmost]})" ) result.append(topmost[0]) return result def _query_ancestors_of_fk(record: SQLRecord, attr: str) -> SQLRecordList: from .query_set import get_default_branch_ids branch_ids = get_default_branch_ids() ancestors = [] current = getattr(record, attr) while current is not None and current.branch_id in branch_ids: ancestors.append(current) current = getattr(current, attr) return SQLRecordList(ancestors) class HasParents: """Base class for hierarchical registries (ontologies).""" def view_parents( self, field: StrField | None = None, with_children: bool = False, distance: int = 5, ): """View parents in an ontology. Args: field: Field to display on graph with_children: Whether to also show children. distance: Maximum distance still shown. Ontological hierarchies: :class:`~lamindb.ULabel` (project & sub-project), :class:`~bionty.CellType` (cell type & subtype). 
Examples: >>> import bionty as bt >>> bt.Tissue.from_source(name="subsegmental bronchus").save() >>> record = bt.Tissue.get(name="respiratory tube") >>> record.view_parents() >>> tissue.view_parents(with_children=True) """ if field is None: field = get_name_field(self) if not isinstance(field, str): field = field.field.name return view_parents( record=self, # type: ignore field=field, with_parents=True, with_children=with_children, distance=distance, ) def view_children( self, field: StrField | None = None, distance: int = 5, ): """View children in an ontology. Args: field: Field to display on graph distance: Maximum distance still shown. Ontological hierarchies: :class:`~lamindb.ULabel` (project & sub-project), :class:`~bionty.CellType` (cell type & subtype). Examples: >>> import bionty as bt >>> bt.Tissue.from_source(name="subsegmental bronchus").save() >>> record = bt.Tissue.get(name="respiratory tube") >>> record.view_parents() >>> tissue.view_parents(with_children=True) """ if field is None: field = get_name_field(self) if not isinstance(field, str): field = field.field.name return view_parents( record=self, # type: ignore field=field, with_parents=False, with_children=True, distance=distance, ) def query_parents(self) -> QuerySet: """Query parents in an ontology.""" return _query_relatives([self], "parents") # type: ignore def query_children(self) -> QuerySet: """Query children in an ontology.""" return _query_relatives([self], "children") # type: ignore def view_digraph(u: Digraph): from graphviz.backend import ExecutableNotFound try: if is_run_from_ipython: from IPython import get_ipython from IPython.display import display # True if the code is running in a Jupyter Notebook or Lab environment if get_ipython().__class__.__name__ == "TerminalInteractiveShell": return u.view() else: # call u._repr_mimebundle_() manually that exception gets raised properly and not just printed by # call to display() display(u._repr_mimebundle_(), raw=True) else: return u.view() except (FileNotFoundError, RuntimeError, ExecutableNotFound): # pragma: no cover logger.error( "please install the graphviz executable on your system:\n - Ubuntu: `sudo" " apt-get install graphviz`\n - Windows:" " https://graphviz.org/download/#windows\n - Mac: `brew install graphviz`" ) def view_lineage( data: Artifact | Collection, with_children: bool = True, return_graph: bool = False ) -> Digraph | None: """View data lineage graph.""" if ln_setup.settings.instance.is_on_hub: instance_slug = ln_setup.settings.instance.slug ui_url = ln_setup.settings.instance.ui_url entity_slug = data.__class__.__name__.lower() logger.important( f"explore at: {ui_url}/{instance_slug}/{entity_slug}/{data.uid}" ) import graphviz df_values = _get_all_parent_runs(data) if with_children: df_values += _get_all_child_runs(data) df_edges = _df_edges_from_runs(df_values) def add_node( record: Run | Artifact | Collection, node_id: str, node_label: str, u: graphviz.Digraph, ): if isinstance(record, Run): fillcolor = TRANSFORM_VIOLET else: fillcolor = "white" u.node( node_id, label=node_label, shape="box", style="rounded,filled", fillcolor=fillcolor, ) u = graphviz.Digraph( f"{data._meta.model_name}_{data.uid}", node_attr={ "fillcolor": "white", "color": "darkgrey", "fontname": "Helvetica", "fontsize": "10", }, edge_attr={"arrowsize": "0.5"}, ) for _, row in df_edges.iterrows(): add_node(row["source_record"], row["source"], row["source_label"], u) if row["target_record"] not in df_edges["source_record"]: add_node(row["target_record"], row["target"], 
row["target_label"], u) u.edge(row["source"], row["target"], color="dimgrey") u.node( f"{data._meta.model_name}_{data.uid}", label=get_record_label(data), style="rounded,filled", fillcolor="white", shape="box", ) if return_graph: return u else: return view_digraph(u) def view_parents( record: SQLRecord, field: str, with_parents: bool = True, with_children: bool = False, distance: int = 100, attr_name: Literal["parents", "predecessors"] = "parents", ): """Graph of parents.""" if not hasattr(record, attr_name): raise NotImplementedError( f"Parents view is not supported for {record.__class__.__name__}!" ) import graphviz import pandas as pd df_edges = None df_edges_parents = None df_edges_children = None if with_parents: df_edges_parents = _df_edges_from_parents( record=record, field=field, distance=distance, attr_name=attr_name ) if with_children: df_edges_children = _df_edges_from_parents( record=record, field=field, distance=distance, children=True, attr_name=attr_name, ) # Rename the columns to swap source and target df_edges_children = df_edges_children.rename( columns={ "source": "temp_target", "source_label": "temp_target_label", "source_record": "temp_target_record", "target": "source", "target_label": "source_label", "target_record": "source_record", } ) df_edges_children = df_edges_children.rename( columns={ "temp_target": "target", "temp_target_label": "target_label", "temp_target_record": "target_record", } ) if df_edges_parents is not None and df_edges_children is not None: df_edges = pd.concat([df_edges_parents, df_edges_children]).drop_duplicates() elif df_edges_parents is not None: df_edges = df_edges_parents elif df_edges_children is not None: df_edges = df_edges_children else: return None u = graphviz.Digraph( record.uid, node_attr={ "color": LAMIN_GREEN_DARKER, "fillcolor": GREEN_FILL, "shape": "box", "style": "rounded,filled", "fontname": "Helvetica", "fontsize": "10", }, edge_attr={"arrowsize": "0.5"}, ) u.node( record.uid, label=(get_record_label(record)), fillcolor=LAMIN_GREEN_LIGHTER, ) if df_edges is not None: for _, row in df_edges.iterrows(): u.node(row["source"], label=row["source_label"]) u.node(row["target"], label=row["target_label"]) u.edge(row["source"], row["target"], color="dimgrey") view_digraph(u) def _get_parents( record: SQLRecord, field: str, distance: int, children: bool = False, attr_name: Literal["parents", "predecessors"] = "parents", ): """Recursively get parent records within a distance.""" if children: key = attr_name else: key = "children" if attr_name == "parents" else "successors" # type: ignore using_db = record._state.db model = record.__class__ condition = f"{key}__{field}" field_value = getattr(record, field) results = model.connect(using_db).filter(**{condition: field_value}) if distance < 2: return results d = 2 while d < distance: # this grows in the loop, # i.e. children__children__name -> children__children__children__name -> ... 
condition = f"{key}__{condition}" records = model.connect(using_db).filter(**{condition: field_value}) try: if not records.exists(): return results results = results | records d += 1 except Exception: # For OperationalError: # SQLite does not support joins containing more than 64 tables return results return results def _df_edges_from_parents( record: SQLRecord, field: str, distance: int, children: bool = False, attr_name: Literal["parents", "predecessors"] = "parents", ): """Construct a DataFrame of edges as the input of graphviz.Digraph.""" if attr_name == "parents": key = "children" if children else "parents" else: key = "successors" if children else "predecessors" parents = _get_parents( record=record, field=field, distance=distance, children=children, attr_name=attr_name, ) using_db = record._state.db all = record.__class__.objects.using(using_db) records = parents | all.filter(id=record.id) df = records.distinct().to_dataframe(include=[f"{key}__id"]) if f"{key}__id" not in df.columns: return None df_edges = df[[f"{key}__id"]] df_edges = df_edges.explode(f"{key}__id") df_edges.index.name = "target" df_edges = df_edges.reset_index() df_edges.dropna(axis=0, inplace=True) df_edges.rename(columns={f"{key}__id": "source"}, inplace=True) df_edges = df_edges.drop_duplicates() # colons messes with the node formatting: # https://graphviz.readthedocs.io/en/stable/node_ports.html df_edges["source_record"] = df_edges["source"].apply(lambda x: all.get(id=x)) df_edges["target_record"] = df_edges["target"].apply(lambda x: all.get(id=x)) if record.__class__.__name__ == "Transform": df_edges["source_label"] = df_edges["source_record"].apply(get_record_label) df_edges["target_label"] = df_edges["target_record"].apply(get_record_label) else: df_edges["source_label"] = df_edges["source_record"].apply( lambda x: get_record_label(x, field) ) df_edges["target_label"] = df_edges["target_record"].apply( lambda x: get_record_label(x, field) ) df_edges["source"] = df_edges["source_record"].apply(lambda x: x.uid) df_edges["target"] = df_edges["target_record"].apply(lambda x: x.uid) return df_edges def get_record_label(record: SQLRecord, field: str | None = None): from .artifact import Artifact from .collection import Collection from .transform import Transform if isinstance(record, (Artifact, Collection, Transform)): title = ( record.key.replace("&", "&") if record.key is not None else record.uid ) return rf"<{title}>" elif isinstance(record, Run): title = record.transform.key.replace("&", "&") if record.entrypoint is not None: title += f": {record.entrypoint}" return ( rf'<{title}
' rf"run at {format_field_value(record.started_at)}>" ) else: if field is None: field = get_name_field(record) title = record.__getattribute__(field) return rf"<{title}>" def _get_all_parent_runs(data: Artifact | Collection) -> list: """Get all input file/collection runs recursively.""" name = data._meta.model_name run_inputs_outputs = [] runs = [data.run] if data.run is not None else [] while len(runs) > 0: inputs = [] for r in runs: inputs_run = ( r.__getattribute__(f"input_{name}s") .all() .filter(branch_id__in=[0, 1]) .to_list() ) if name == "artifact": inputs_run += ( r.input_collections.all().filter(branch_id__in=[0, 1]).to_list() ) outputs_run = ( r.__getattribute__(f"output_{name}s") .all() .filter(branch_id__in=[0, 1]) .to_list() ) if name == "artifact": outputs_run += ( r.output_collections.all().filter(branch_id__in=[0, 1]).to_list() ) # if inputs are outputs artifacts are the same, will result infinite loop # so only show as outputs overlap = set(inputs_run).intersection(outputs_run) if overlap: logger.warning( f"The following artifacts are both inputs and outputs of Run(uid={r.uid}): {overlap}\n → Only showing as outputs." ) inputs_run = list(set(inputs_run) - overlap) if len(inputs_run) > 0: run_inputs_outputs += [(inputs_run, r)] if len(outputs_run) > 0: run_inputs_outputs += [(r, outputs_run)] inputs += inputs_run runs = [f.run for f in inputs if f.run is not None] return run_inputs_outputs def _get_all_child_runs(data: Artifact | Collection) -> list: """Get all output file/collection runs recursively.""" name = data._meta.model_name all_runs: set[Run] = set() run_inputs_outputs = [] if data.run is not None: runs = {f.run for f in data.run.__getattribute__(f"output_{name}s").all()} else: runs = set() if name == "artifact" and data.run is not None: runs.update( { f.run for f in data.run.output_collections.all().filter(branch_id__in=[0, 1]) } ) while runs.difference(all_runs): all_runs.update(runs) child_runs: set[Run] = set() for r in runs: inputs_run = ( r.__getattribute__(f"input_{name}s") .all() .filter(branch_id__in=[0, 1]) .to_list() ) if name == "artifact": inputs_run += ( r.input_collections.all().filter(branch_id__in=[0, 1]).to_list() ) run_inputs_outputs += [(inputs_run, r)] outputs_run = ( r.__getattribute__(f"output_{name}s") .all() .filter(branch_id__in=[0, 1]) .to_list() ) if name == "artifact": outputs_run += ( r.output_collections.all().filter(branch_id__in=[0, 1]).to_list() ) run_inputs_outputs += [(r, outputs_run)] child_runs.update( Run.filter( # type: ignore **{f"input_{name}s__uid__in": [i.uid for i in outputs_run]} ).to_list() ) # for artifacts, also include collections in the lineage if name == "artifact": child_runs.update( Run.filter( # type: ignore input_collections__uid__in=[i.uid for i in outputs_run] ).to_list() ) runs = child_runs return run_inputs_outputs def _df_edges_from_runs(df_values: list): import pandas as pd df = pd.DataFrame(df_values, columns=["source_record", "target_record"]) df = df.explode("source_record") df = df.explode("target_record") df = df.drop_duplicates().dropna() df["source"] = [f"{i._meta.model_name}_{i.uid}" for i in df["source_record"]] df["target"] = [f"{i._meta.model_name}_{i.uid}" for i in df["target_record"]] df["source_label"] = df["source_record"].apply(get_record_label) df["target_label"] = df["target_record"].apply(get_record_label) return df ================================================ FILE: lamindb/models/project.py ================================================ from __future__ import annotations from 
typing import TYPE_CHECKING, overload from django.core.validators import RegexValidator from django.db import models from django.db.models import CASCADE, PROTECT from lamindb.base.fields import ( BigIntegerField, CharField, DateField, DateTimeField, ForeignKey, TextField, URLField, ) from lamindb.base.users import current_user_id from ..base.uids import base62_12 from .artifact import Artifact from .can_curate import CanCurate from .collection import Collection from .feature import Feature from .has_parents import _query_relatives from .record import Record from .run import Run, TracksRun, TracksUpdates, User from .schema import Schema from .sqlrecord import BaseSQLRecord, HasType, IsLink, SQLRecord, ValidateFields from .transform import Transform from .ulabel import ULabel if TYPE_CHECKING: from datetime import date as DateType from datetime import datetime from .block import Block, ProjectBlock from .query_manager import RelatedManager from .query_set import QuerySet from .sqlrecord import Branch class Reference( SQLRecord, HasType, CanCurate, TracksRun, TracksUpdates, ValidateFields ): """References such as internal studies, papers, documents, or URLs. Example: Create a reference object:: reference = Reference( name="A Paper Title", abbr="APT", url="https://doi.org/10.1000/xyz123", pubmed_id=12345678, doi="10.1000/xyz123", description="Good paper.", text="Some text I want to be searchable.", date=date(2023, 11, 21), ).save() """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" # also see raw SQL constraints for `is_type` and `type` FK validity in migrations id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, max_length=12, db_index=True, default=base62_12 ) """Universal id, valid across DB instances.""" name: str = CharField(db_index=True) """Title or name of the reference document.""" description: str | None = TextField(null=True) """A description.""" type: Reference | None = ForeignKey( "self", PROTECT, null=True, related_name="references" ) """Type of reference (e.g., 'Study', 'Paper', 'Preprint') ← :attr:`~lamindb.Reference.references`. Allows to group reference by type, e.g., internal studies vs. all papers etc. 
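
    Example (a hedged sketch; the names are hypothetical)::

        paper = Reference(name="Paper", is_type=True).save()      # a reference type
        ref = Reference(name="A Paper Title", type=paper).save()  # a reference of that type
        paper.references.to_dataframe()                           # references of this type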
""" references: RelatedManager[Reference] """References of this type (can only be non-empty if `is_type` is `True`).""" abbr: str | None = CharField( max_length=32, db_index=True, null=True, ) """An abbreviation for the reference.""" url: str | None = URLField(null=True, db_index=True) """URL linking to the reference.""" pubmed_id: int | None = BigIntegerField(null=True, db_index=True) """A PudMmed ID.""" doi: str | None = CharField( null=True, db_index=True, validators=[ RegexValidator( regex=r"^(?:https?://(?:dx\.)?doi\.org/|doi:|DOI:)?10\.\d+/.*$", message="Must be a DOI (e.g., 10.1000/xyz123 or https://doi.org/10.1000/xyz123)", ) ], ) """Digital Object Identifier (DOI) for the reference.""" text: str | None = TextField(null=True) """Abstract or full text of the reference to make it searchable.""" date: DateType | None = DateField(null=True, default=None) """Date of creation or publication of the reference.""" artifacts: RelatedManager[Artifact] = models.ManyToManyField( Artifact, through="ArtifactReference", related_name="references" ) """Annotated artifacts ← :attr:`~lamindb.Artifact.references`.""" transforms: RelatedManager[Transform] = models.ManyToManyField( Transform, through="TransformReference", related_name="references" ) """Annotated transforms ← :attr:`~lamindb.Transform.references`.""" collections: RelatedManager[Collection] = models.ManyToManyField( Collection, through="CollectionReference", related_name="references" ) """Annotated collections ← :attr:`~lamindb.Collection.references`.""" linked_in_records: RelatedManager[Record] = models.ManyToManyField( Record, through="RecordReference", related_name="linked_references" ) """Linked in records ← :attr:`~lamindb.Record.linked_references`.""" records: RelatedManager[Record] = models.ManyToManyField( Record, through="ReferenceRecord", related_name="references" ) """Annotated records ← :attr:`~lamindb.Record.references`.""" projects: RelatedManager[Project] """Projects that annotate this reference ← :attr:`~lamindb.Project.references`.""" @overload def __init__( self, name: str, type: Reference | None = None, is_type: bool = False, abbr: str | None = None, url: str | None = None, pubmed_id: int | None = None, doi: str | None = None, description: str | None = None, text: str | None = None, date: DateType | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def query_references(self) -> QuerySet: """Query references of sub types. While `.references` retrieves the references with the current type, this method also retrieves sub types and the references with sub types of the current type. """ return _query_relatives([self], "references") # type: ignore class Project(SQLRecord, HasType, CanCurate, TracksRun, TracksUpdates, ValidateFields): """Projects to label artifacts, transforms, records, and runs. 
Example: Create a project and annotate an artifact with it:: project = Project( name="My Project Name", abbr="MPN", url="https://example.com/my_project", ).save() artifact.projects.add(project) # <-- labels the artifact with the project ln.track(project=project) # <-- automatically labels entities during the run """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" # also see raw SQL constraints for `is_type` and `type` FK validity in migrations id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, max_length=12, db_index=True, default=base62_12 ) """Universal id, valid across DB instances.""" name: str = CharField(db_index=True) """Title or name of the Project.""" description: str | None = TextField(null=True) """A description.""" type: Project | None = ForeignKey( "self", PROTECT, null=True, related_name="projects" ) """Type of project (e.g., 'Program', 'Project', 'GithubIssue', 'Task') ← :attr:`~lamindb.Project.projects`.""" projects: RelatedManager[Project] """Projects of this type (can only be non-empty if `is_type` is `True`).""" abbr: str | None = CharField(max_length=32, db_index=True, null=True) """An abbreviation.""" url: str | None = URLField(max_length=255, null=True, default=None) """A URL.""" start_date: DateType | None = DateField(null=True, default=None) """Date of start of the project.""" end_date: DateType | None = DateField(null=True, default=None) """Date of end of the project.""" parents: RelatedManager[Project] = models.ManyToManyField( "self", symmetrical=False, related_name="children" ) """Parent projects, the super-projects owning this project ← :attr:`~lamindb.Project.children`.""" children: RelatedManager[Project] """Child projects, the sub-projects owned by this project. Reverse accessor for `.parents`. """ predecessors: RelatedManager[Project] = models.ManyToManyField( "self", symmetrical=False, related_name="successors" ) """The preceding projects required by this project ← :attr:`~lamindb.Project.successors`.""" successors: RelatedManager[Project] """The succeeding projects requiring this project. Reverse accessor for `.predecessors`.
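
    Example (a hedged sketch; the names are hypothetical)::

        phase1 = Project(name="Phase 1").save()
        phase2 = Project(name="Phase 2").save()
        phase2.predecessors.add(phase1)
        phase1.successors.to_dataframe()  # now contains "Phase 2"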
""" artifacts: RelatedManager[Artifact] = models.ManyToManyField( Artifact, through="ArtifactProject", related_name="projects" ) """Annotated artifacts ← :attr:`~lamindb.Artifact.projects`.""" transforms: RelatedManager[Transform] = models.ManyToManyField( Transform, through="TransformProject", related_name="projects" ) """Annotated transforms ← :attr:`~lamindb.Transform.projects`.""" runs: RelatedManager[Run] = models.ManyToManyField( Run, through="RunProject", related_name="projects" ) """Annotated runs ← :attr:`~lamindb.Run.projects`.""" ulabels: RelatedManager[ULabel] = models.ManyToManyField( ULabel, through="ULabelProject", related_name="projects" ) """Annotated ulabels ← :attr:`~lamindb.ULabel.projects`.""" features: RelatedManager[Feature] = models.ManyToManyField( Feature, through="FeatureProject", related_name="projects" ) """Annotated features ← :attr:`~lamindb.Feature.projects`.""" schemas: RelatedManager[Schema] = models.ManyToManyField( Schema, through="SchemaProject", related_name="projects" ) """Annotated schemas ← :attr:`~lamindb.Schema.projects`.""" linked_in_records: RelatedManager[Record] = models.ManyToManyField( Record, through="RecordProject", related_name="linked_projects" ) """Linked in records ← :attr:`~lamindb.Record.linked_projects`.""" records: RelatedManager[Record] = models.ManyToManyField( Record, through="ProjectRecord", related_name="projects" ) """Annotated records ← :attr:`~lamindb.Record.projects`.""" collections: RelatedManager[Collection] = models.ManyToManyField( Collection, through="CollectionProject", related_name="projects" ) """Annotated collections ← :attr:`~lamindb.Collection.projects`.""" references: RelatedManager[Reference] = models.ManyToManyField( "Reference", related_name="projects" ) """Annotated references ← :attr:`~lamindb.Reference.projects`.""" blocks: RelatedManager[Block] = models.ManyToManyField( "Block", through="BlockProject", related_name="projects" ) """Annotated blocks ← :attr:`~lamindb.Block.projects`.""" users: RelatedManager[User] = models.ManyToManyField( "User", through="ProjectUser", related_name="projects", ) """Users participating in this project ← :attr:`~lamindb.ProjectUser.user`.""" branches: RelatedManager[Branch] """Annotated branches ← :attr:`~lamindb.BranchProject.project`.""" _status_code: int = models.SmallIntegerField(default=0, db_default=0, db_index=True) """Status code.""" ablocks: RelatedManager[ProjectBlock] """Attached blocks ← :attr:`~lamindb.ProjectBlock.project`.""" @overload def __init__( self, name: str, type: Project | None = None, is_type: bool = False, abbr: str | None = None, url: str | None = None, start_date: DateType | None = None, end_date: DateType | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def query_projects(self) -> QuerySet: """Query projects of sub types. While `.projects` retrieves the projects with the current type, this method also retrieves sub types and the projects with sub types of the current type. 
""" return _query_relatives([self], "projects") # type: ignore class ArtifactProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_artifact") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, default=None, related_name="links_artifactproject", ) class Meta: app_label = "lamindb" # can have the same label linked to the same artifact if the feature is different unique_together = ("artifact", "project", "feature") class RunProject(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) run: Run = ForeignKey(Run, CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_run") created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = ForeignKey( "lamindb.User", PROTECT, editable=False, default=current_user_id, related_name="+", ) """Creator of record.""" class Meta: app_label = "lamindb" unique_together = ("run", "project") class BranchProject(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) branch: Branch = ForeignKey("Branch", CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_branch") class Meta: app_label = "lamindb" unique_together = ("branch", "project") class ProjectUser(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) project: Project = ForeignKey(Project, CASCADE, related_name="links_user") user: User = ForeignKey("User", PROTECT, related_name="links_project") role: str = CharField(max_length=32, db_index=True) """Role (e.g. 
"responsible", "viewer").""" class Meta: app_label = "lamindb" unique_together = ("project", "user", "role") class TransformProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) transform: Transform = ForeignKey(Transform, CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_transform") class Meta: app_label = "lamindb" unique_together = ("transform", "project") class CollectionProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) collection: Collection = ForeignKey( Collection, CASCADE, related_name="links_project" ) project: Project = ForeignKey(Project, PROTECT, related_name="links_collection") class Meta: app_label = "lamindb" unique_together = ("collection", "project") class ULabelProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) ulabel: ULabel = ForeignKey(ULabel, CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_ulabel") class Meta: app_label = "lamindb" unique_together = ("ulabel", "project") class FeatureProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) feature: Feature = ForeignKey(Feature, CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_feature") class Meta: app_label = "lamindb" unique_together = ("feature", "project") class SchemaProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_schema") class Meta: app_label = "lamindb" unique_together = ("schema", "project") # for annotation of records with references, RecordReference is for storing reference values class ReferenceRecord(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) reference: Reference = ForeignKey(Reference, PROTECT, related_name="links_record") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, default=None, related_name="links_referencerecord", ) record: Record = ForeignKey(Record, CASCADE, related_name="links_reference") class Meta: app_label = "lamindb" unique_together = ("reference", "feature", "record") class RecordReference(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_reference") feature: Feature = ForeignKey( Feature, PROTECT, related_name="links_recordreference" ) value: Reference = ForeignKey(Reference, PROTECT, related_name="links_in_record") class Meta: app_label = "lamindb" unique_together = ("record", "feature", "value") # for annotation of records with projects, RecordProject is for storing project values class ProjectRecord(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) project: Project = ForeignKey(Project, PROTECT, related_name="links_record") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, default=None, related_name="links_projectrecord", ) record: Record = ForeignKey(Record, CASCADE, related_name="links_project") class Meta: app_label = "lamindb" unique_together = ("project", "feature", "record") class RecordProject(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_project") feature: Feature = ForeignKey(Feature, 
PROTECT, related_name="links_recordproject") value: Project = ForeignKey(Project, PROTECT, related_name="links_in_record") class Meta: app_label = "lamindb" unique_together = ("record", "feature", "value") class BlockProject(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) block = ForeignKey("Block", CASCADE, related_name="links_project") project: Project = ForeignKey(Project, PROTECT, related_name="links_block") class Meta: app_label = "lamindb" unique_together = ("block", "project") class ArtifactReference(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_reference") reference: Reference = ForeignKey(Reference, PROTECT, related_name="links_artifact") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, default=None, related_name="links_artifactreference", ) class Meta: app_label = "lamindb" unique_together = ("artifact", "reference", "feature") class TransformReference(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) transform: Transform = ForeignKey( Transform, CASCADE, related_name="links_reference" ) reference: Reference = ForeignKey( Reference, PROTECT, related_name="links_transform" ) class Meta: app_label = "lamindb" unique_together = ("transform", "reference") class CollectionReference(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) collection: Collection = ForeignKey( Collection, CASCADE, related_name="links_reference" ) reference: Reference = ForeignKey( Reference, PROTECT, related_name="links_collection" ) class Meta: app_label = "lamindb" unique_together = ("collection", "reference") ================================================ FILE: lamindb/models/query_manager.py ================================================ from __future__ import annotations import re from functools import reduce from typing import TYPE_CHECKING, Literal, NamedTuple from django.db.models import ( IntegerField, Manager, Q, QuerySet, TextField, Value, ) from django.db.models.functions import Cast, Coalesce from django.db.models.lookups import ( Contains, Exact, IContains, IExact, IRegex, IStartsWith, Regex, StartsWith, ) from lamin_utils._lookup import Lookup from lamindb_setup.core import deprecated from lamindb_setup.core._docs import doc_args if TYPE_CHECKING: from ..base.types import StrField def _search( cls, string: str, *, field: StrField | list[StrField] | None = None, limit: int | None = 20, case_sensitive: bool = False, truncate_string: bool = False, ) -> QuerySet: """Search. Args: string: The input string to match against the field ontology values. field: The field or fields to search. Search all string fields by default. limit: Maximum amount of top results to return. case_sensitive: Whether the match is case sensitive. Returns: A sorted `DataFrame` of search results with a score in column `score`. If `return_queryset` is `True`. `QuerySet`. See Also: :meth:`~lamindb.models.SQLRecord.filter` :meth:`~lamindb.models.SQLRecord.lookup` Examples: :: records = ln.ULabel.from_values(["Label1", "Label2", "Label3"]).save() ln.ULabel.search("Label2") """ if string is None: raise ValueError("Cannot search for None value! 
Please pass a valid string.") input_queryset = ( cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.objects.all() ) registry = input_queryset.model name_field = getattr(registry, "_name_field", "name") if field is None: fields = [ field.name for field in registry._meta.fields if field.get_internal_type() in {"CharField", "TextField"} ] else: if not isinstance(field, list): fields_input = [field] else: fields_input = field fields = [] for field in fields_input: if not isinstance(field, str): try: fields.append(field.field.name) except AttributeError as error: raise TypeError( "Please pass a SQLRecord string field, e.g., `CellType.name`!" ) from error else: fields.append(field) if truncate_string: if (len_string := len(string)) > 5: n_80_pct = int(len_string * 0.8) string = string[:n_80_pct] string = string.strip() string_escape = re.escape(string) exact_lookup = Exact if case_sensitive else IExact regex_lookup = Regex if case_sensitive else IRegex contains_lookup = Contains if case_sensitive else IContains ranks = [] contains_filters = [] for field in fields: field_expr = Coalesce( Cast(field, output_field=TextField()), Value(""), output_field=TextField(), ) # exact rank exact_expr = exact_lookup(field_expr, string) exact_rank = Cast(exact_expr, output_field=IntegerField()) * 200 ranks.append(exact_rank) # exact synonym synonym_expr = regex_lookup(field_expr, rf"(?:^|.*\|){string_escape}(?:\|.*|$)") synonym_rank = Cast(synonym_expr, output_field=IntegerField()) * 200 ranks.append(synonym_rank) # match as sub-phrase sub_expr = regex_lookup( field_expr, rf"(?:^|.*[ \|\.,;:]){string_escape}(?:[ \|\.,;:].*|$)" ) sub_rank = Cast(sub_expr, output_field=IntegerField()) * 10 ranks.append(sub_rank) # startswith and avoid matching string with " " on the right # mostly for truncated startswith_expr = regex_lookup( field_expr, rf"(?:^|.*\|){string_escape}[^ ]*(?:\|.*|$)" ) startswith_rank = Cast(startswith_expr, output_field=IntegerField()) * 8 ranks.append(startswith_rank) # match as sub-phrase from the left, mostly for truncated right_expr = regex_lookup(field_expr, rf"(?:^|.*[ \|]){string_escape}.*") right_rank = Cast(right_expr, output_field=IntegerField()) * 2 ranks.append(right_rank) # match as sub-phrase from the right left_expr = regex_lookup(field_expr, rf".*{string_escape}(?:$|[ \|\.,;:].*)") left_rank = Cast(left_expr, output_field=IntegerField()) * 2 ranks.append(left_rank) # simple contains filter contains_expr = contains_lookup(field_expr, string) contains_filter = Q(contains_expr) contains_filters.append(contains_filter) # also rank by contains contains_rank = Cast(contains_expr, output_field=IntegerField()) ranks.append(contains_rank) # additional rule for truncated strings # weight matches from the beginning of the string higher # sometimes whole words get truncated and startswith_expr is not enough if truncate_string and field == name_field: startswith_lookup = StartsWith if case_sensitive else IStartsWith name_startswith_expr = startswith_lookup(field_expr, string) name_startswith_rank = ( Cast(name_startswith_expr, output_field=IntegerField()) * 2 ) ranks.append(name_startswith_rank) ranked_queryset = ( input_queryset.filter(reduce(lambda a, b: a | b, contains_filters)) .alias(rank=sum(ranks)) .order_by("-rank") ) return ranked_queryset[:limit] def _lookup( cls, field: StrField | None = None, return_field: StrField | None = None, using_key: str | None = None, keep: Literal["first", "last", False] = "first", ) -> NamedTuple: """Return an auto-complete object for a field. 
Args: field: The field to look up the values for. Defaults to first string field. return_field: The field to return. If `None`, returns the whole record. keep: When multiple records are found for a lookup, how to return the records. - `"first"`: return the first record. - `"last"`: return the last record. - `False`: return all records. Returns: A `NamedTuple` of lookup information of the field values with a dictionary converter. See Also: :meth:`~lamindb.models.SQLRecord.search` Examples: Lookup via auto-complete on `.`:: import bionty as bt bt.Gene.from_source(symbol="ADGB-DT").save() lookup = bt.Gene.lookup() lookup.adgb_dt Look up via auto-complete in dictionary:: lookup_dict = lookup.dict() lookup_dict['ADGB-DT'] Look up via a specific field:: lookup_by_ensembl_id = bt.Gene.lookup(field="ensembl_gene_id") genes.ensg00000002745 Return a specific field value instead of the full record:: lookup_return_symbols = bt.Gene.lookup(field="ensembl_gene_id", return_field="symbol") """ from .sqlrecord import get_name_field queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.objects.all() field = get_name_field(registry=queryset.model, field=field) return Lookup( records=queryset, values=[i.get(field) for i in queryset.values()], tuple_name=cls.__class__.__name__, prefix="ln", keep=keep, ).lookup( return_field=( get_name_field(registry=queryset.model, field=return_field) if return_field is not None else None ) ) # this is the default (._default_manager and ._base_manager) for lamindb models class QueryManager(Manager): """Manage queries through fields. See Also: :class:`lamindb.models.QuerySet` `django Manager `__ Examples: Populate the `.parents` ManyToMany relationship (a `QueryManager`):: ln.ULabel.from_values(["Label1", "Label2", "Label3"]).save() labels = ln.ULabel.filter(name__icontains="label") label1 = ln.ULabel.get(name="Label1") label1.parents.set(labels) Convert all linked parents to a `DataFrame`:: label1.parents.to_dataframe() """ def to_list(self, field: str | None = None): """Populate a list.""" if field is None: return list(self.all()) else: return list(self.values_list(field, flat=True)) def to_dataframe(self, **kwargs): """Convert to DataFrame. For `**kwargs`, see :meth:`lamindb.models.QuerySet.to_dataframe`. """ return self.all().to_dataframe(**kwargs) @deprecated(new_name="to_dataframe") def df(self, **kwargs): return self.to_dataframe(**kwargs) @doc_args(_search.__doc__) def search(self, string: str, **kwargs): """{}""" # noqa: D415 return _search(cls=self.all(), string=string, **kwargs) @doc_args(_lookup.__doc__) def lookup(self, field: StrField | None = None, **kwargs) -> NamedTuple: """{}""" # noqa: D415 return _lookup(cls=self.all(), field=field, **kwargs) def get_queryset(self): from .query_set import BasicQuerySet # QueryManager returns BasicQuerySet because it is problematic to redefine .filter and .get # for a query set used by the default manager return BasicQuerySet(model=self.model, using=self._db, hints=self._hints) # below is just for typing / docs # Django achieves the same thing with a dynamically generated class class RelatedManager(QueryManager): """Manager for many-to-many and reverse foreign key relationships. Provides relationship manipulation methods. 
See Also: :class:`lamindb.models.QueryManager` Examples: Populate the `.parents` ManyToMany relationship (a `RelatedManager`):: ln.ULabel.from_values(["Label1", "Label2", "Label3"]).save() labels = ln.ULabel.filter(name__icontains="label") label1 = ln.ULabel.get(name="Label1") label1.parents.set(labels) Convert all linked parents to a `DataFrame`:: label1.parents.to_dataframe() Remove a parent label:: label1.parents.remove(label2) Clear all parent labels:: label1.parents.clear() """ def add(self, *objs, bulk: bool = True) -> None: """Add objects to the relationship.""" ... def set(self, objs, *, bulk: bool = True, clear: bool = False) -> None: """Set the relationship to the specified objects.""" ... def remove(self, *objs, bulk: bool = True) -> None: """Remove objects from the relationship.""" ... def clear(self) -> None: """Remove all objects from the relationship.""" ... ================================================ FILE: lamindb/models/query_set.py ================================================ from __future__ import annotations import ast import re import warnings from collections import UserList, defaultdict from collections.abc import Iterable from collections.abc import Iterable as IterableType from importlib import import_module from typing import TYPE_CHECKING, Any, Generic, NamedTuple, TypeVar, final import lamindb_setup as ln_setup from django.core.exceptions import FieldError from django.db import models from django.db.models import ( F, FilteredRelation, ForeignKey, ManyToManyField, Q, Subquery, ) from django.db.models.fields.related import ForeignObjectRel from lamin_utils import logger from lamindb_setup import settings as setup_settings from lamindb_setup.core import deprecated from lamindb_setup.core._docs import doc_args from ..base.types import BRANCH_STATUS_TO_CODE, RUN_STATUS_TO_CODE from ..errors import DoesNotExist, MultipleResultsFound from ._is_versioned import IsVersioned, _adjust_is_latest_when_deleting_is_versioned from .can_curate import CanCurate, _inspect, _standardize, _validate from .query_manager import _lookup, _search from .sqlrecord import Registry, SQLRecord if TYPE_CHECKING: import pandas as pd from bionty.models import ( CellLine, CellMarker, CellType, DevelopmentalStage, Disease, Ethnicity, ExperimentalFactor, Gene, Organism, Pathway, Phenotype, Protein, Tissue, ) from pertdb.models import ( Biologic, CombinationPerturbation, Compound, CompoundPerturbation, EnvironmentalPerturbation, GeneticPerturbation, PerturbationTarget, ) from lamindb.base.types import ListLike, StrField from lamindb.models import ( Artifact, Branch, Collection, Feature, Project, Record, Reference, Run, Schema, Space, Storage, Transform, ULabel, User, ) T = TypeVar("T") def get_keys_from_df(data: list, registry: SQLRecord) -> list[str]: if len(data) > 0: if isinstance(data[0], dict): keys = list(data[0].keys()) else: keys = list(data[0].__dict__.keys()) if "_state" in keys: keys.remove("_state") else: keys = [ field.name for field in registry._meta.fields if not isinstance(field, models.ForeignKey) ] keys += [ f"{field.name}_id" for field in registry._meta.fields if isinstance(field, models.ForeignKey) ] return keys def get_default_branch_ids(branch: Branch | None = None) -> list[int]: """Return branch IDs to include in default queries. By default, queries include records on the main branch (branch_id=1) but exclude trashed (branch_id=-1) and archived records (branch_id=0). This matches behavior of familiar tools like GitHub, Slack, and email clients. 
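For example, on the default setup this returns `[1]`, while on a branch with id `7` it returns `[7, 1]`.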
If a user switches to another branch via `lamin switch branch`, the main branch will still be included. Returns: List containing the default branch and current branch if different. """ if branch is None: branch_id = setup_settings.branch.id else: branch_id = branch.id branch_ids = [branch_id] if branch_id != 1: # add the main branch by default branch_ids.append(1) return branch_ids def one_helper( self: QuerySet | SQLRecordList, does_not_exist_msg: str | None = None, raise_doesnotexist: bool = True, not_exists: bool | None = None, raise_multipleresultsfound: bool = True, ): if not_exists is None: if isinstance(self, SQLRecordList): not_exists = len(self) == 0 else: not_exists = not self.exists() # type: ignore if not_exists: if raise_doesnotexist: raise DoesNotExist(does_not_exist_msg) else: return None elif len(self) > 1: if raise_multipleresultsfound: raise MultipleResultsFound(self) else: return self[0] else: return self[0] def get_backward_compat_filter_kwargs(queryset, expressions): from lamindb.models import ( Artifact, Branch, Feature, Project, Run, ) if issubclass(queryset.model, IsVersioned): name_mappings = { "version": "version_tag", } else: name_mappings = {} if queryset.model is Artifact: name_mappings.update( { "transform": "run__transform", "feature_sets": "schemas", } ) if queryset.model is Feature: name_mappings.update( { "dtype": "_dtype_str", "dtype_as_str": "_dtype_str", } ) if queryset.model in {Run, Branch, Project}: name_mappings.update( { "status": "_status_code", } ) # If no mappings to apply, return expressions as-is if not name_mappings: return expressions was_list = False if isinstance(expressions, list): was_list = True expressions = {field: True for field in expressions} mapped = {} status_mapping = None if queryset.model is Run: status_mapping = RUN_STATUS_TO_CODE elif queryset.model is Branch: status_mapping = BRANCH_STATUS_TO_CODE def _map_status_value(value): if status_mapping is None: return value if isinstance(value, str): if value not in status_mapping: expected = ", ".join(f"'{status}'" for status in status_mapping) raise ValueError( f"Invalid {queryset.model.__name__} status '{value}'. " f"Expected one of: {expected}." ) return status_mapping[value] if isinstance(value, IterableType) and not isinstance(value, str): return [ status_mapping[v] if isinstance(v, str) and v in status_mapping else v for v in value ] return value for field, value in expressions.items(): parts = field.split("__") if parts[0] in name_mappings: # Issue deprecation warnings if queryset.model is Artifact and parts[0] == "feature_sets": warnings.warn( "Querying Artifact by `feature_sets` is deprecated. Use `schemas` instead.", DeprecationWarning, stacklevel=4, ) elif queryset.model is Feature and parts[0] == "dtype": warnings.warn( "Querying Feature by `dtype` is deprecated. Use `dtype_as_str` instead. 
" "Notice the new dtype encoding format for Record and ULabel subtypes.", DeprecationWarning, stacklevel=4, ) new_field = name_mappings[parts[0]] + ( "__" + "__".join(parts[1:]) if len(parts) > 1 else "" ) mapped[new_field] = ( _map_status_value(value) if parts[0] == "status" else value ) else: mapped[field] = value return list(mapped.keys()) if was_list else mapped def process_expressions(queryset: QuerySet, queries: tuple, expressions: dict) -> dict: def _map_databases(value: Any, key: str, target_db: str) -> tuple[str, Any]: if isinstance(value, SQLRecord): if value._state.db != target_db: logger.warning( f"passing record from database {value._state.db} to query {target_db}, matching on uid '{value.uid}'" ) return f"{key}__uid", value.uid return key, value if ( key.endswith("__in") and isinstance(value, IterableType) and not isinstance(value, str) ): if any( isinstance(v, SQLRecord) and v._state.db != target_db for v in value ): logger.warning( f"passing records from another database to query {target_db}, matching on uids" ) return key.replace("__in", "__uid__in"), [ v.uid if isinstance(v, SQLRecord) else v for v in value ] return key, value return key, value branch_fields = {"branch", "branch_id"} branch_prefixes = ("branch__", "branch_id__") def queries_contain_branch(queries: tuple) -> bool: """Check if any Q object in queries references branch or branch_id.""" def check_q_object(q: Q) -> bool: # Q objects store their conditions in q.children for child in q.children: if isinstance(child, tuple) and len(child) == 2: # Normal condition: (key, value) key = child[0] if key in branch_fields or key.startswith(branch_prefixes): return True elif isinstance(child, Q): # Nested Q object if check_q_object(child): return True return False return any(check_q_object(q) for q in queries if isinstance(q, Q)) expressions = get_backward_compat_filter_kwargs( queryset, expressions, ) model_has_branch = any( field.name == "branch" for field in queryset.model._meta.concrete_fields ) if issubclass(queryset.model, SQLRecord) or model_has_branch: # branch_id is set to 1 unless expressions contains id, uid or hash id_uid_hash = {"id", "uid", "hash", "id__in", "uid__in", "hash__in"} if not any(expression in id_uid_hash for expression in expressions): expressions_have_branch = False for expression in expressions: if expression in branch_fields or expression.startswith( branch_prefixes ): expressions_have_branch = True break if not expressions_have_branch and not queries_contain_branch(queries): expressions["branch_id__in"] = get_default_branch_ids() else: # if branch_id is None, do not apply a filter # otherwise, it would mean filtering for NULL values, which doesn't make # sense for a non-NULLABLE column if "branch_id" in expressions and expressions["branch_id"] is None: expressions.pop("branch_id") if "branch" in expressions and expressions["branch"] is None: expressions.pop("branch") if queryset._db is not None: # only check for database mismatch if there is a defined database on the # queryset return dict( ( _map_databases(value, key, queryset._db) for key, value in expressions.items() ) ) else: return expressions def get( registry_or_queryset: Registry | BasicQuerySet, idlike: int | str | None = None, **expressions, ) -> SQLRecord: if isinstance(registry_or_queryset, BasicQuerySet): # not QuerySet but only BasicQuerySet assert not isinstance(registry_or_queryset, QuerySet) # noqa: S101 qs = registry_or_queryset registry = qs.model else: qs = BasicQuerySet(model=registry_or_queryset) registry = 
registry_or_queryset if isinstance(idlike, int): return qs.get(id=idlike) elif isinstance(idlike, str): NAME_FIELD = ( registry._name_field if hasattr(registry, "_name_field") else "name" ) DOESNOTEXIST_MSG = f"No record found with uid '{idlike}'. Did you forget a keyword as in {registry.__name__}.get({NAME_FIELD}='{idlike}')?" # this is the case in which the user passes an under-specified uid if issubclass(registry, IsVersioned) and len(idlike) <= registry._len_stem_uid: new_qs = qs.filter(uid__startswith=idlike, is_latest=True) not_exists = None if not new_qs.exists(): # also try is_latest is False due to nothing found new_qs = qs.filter(uid__startswith=idlike, is_latest=False) else: not_exists = False # it doesn't make sense to raise MultipleResultsFound when querying with an # underspecified uid return one_helper( new_qs, DOESNOTEXIST_MSG, not_exists=not_exists, raise_multipleresultsfound=False, ) else: qs = qs.filter(uid__startswith=idlike) return one_helper(qs, DOESNOTEXIST_MSG) else: assert idlike is None # noqa: S101 expressions = process_expressions(qs, [], expressions) # inject is_latest for consistency with idlike is_latest_was_not_in_expressions = "is_latest" not in expressions if issubclass(registry, IsVersioned) and is_latest_was_not_in_expressions: expressions["is_latest"] = True try: return qs.get(**expressions) except registry.DoesNotExist as e: # handle the case in which the is_latest injection led to a missed query if "is_latest" in expressions and is_latest_was_not_in_expressions: expressions.pop("is_latest") result = qs.filter(**expressions).order_by("-created_at").first() if result is not None: return result raise e class SQLRecordList(UserList, Generic[T]): """Is ordered, can't be queried, but has `.to_dataframe()`.""" def __init__(self, records: Iterable[T]): if isinstance(records, list): self.data = records # Direct assignment if already a list, no copy else: super().__init__(records) # Let UserList handle the conversion def to_dataframe(self) -> pd.DataFrame: import pandas as pd keys = get_keys_from_df(self.data, self.data[0].__class__) values = [record.__dict__ for record in self.data] return pd.DataFrame(values, columns=keys) @deprecated(new_name="to_dataframe") def df(self) -> pd.DataFrame: return self.to_dataframe() def to_list( self, field: str | None = None ) -> list[str]: # meaningful to be parallel with to_list() in QuerySet if field is None: return self.data return [getattr(record, field) for record in self.data] def one(self) -> T: """Exactly one result. 
Throws error if there are more or none.""" return one_helper(self) def save(self) -> SQLRecordList[T]: """Save all records to the database.""" from lamindb.models.save import save save(self) return self def get_basic_field_names( qs: QuerySet, include: list[str], features_input: bool | list[str] | str, ) -> list[str]: exclude_field_names = ["updated_at"] include_private_fields = False if "privates" in include: include_private_fields = True include.remove("privates") field_names = [ field.name for field in qs.model._meta.fields if ( not isinstance(field, models.ForeignKey) and field.name not in exclude_field_names and ( not field.name.startswith("_") or include_private_fields or (field.name == "_dtype_str" and qs.model.__name__ == "Feature") ) ) ] # TODO: harmonize with L1023 in sqlrecord.py for field_name in [ "version_tag", "is_latest", "is_locked", "is_type", "created_at", "updated_at", "created_on", ]: if field_name in field_names: field_names.append(field_names.pop(field_names.index(field_name))) field_names += [ f"{field.name}_id" for field in qs.model._meta.fields if isinstance(field, models.ForeignKey) ] # move uid to first position if present if "uid" in field_names: field_names.insert(0, field_names.pop(field_names.index("uid"))) # move primary key to second position if present pk = qs.model._meta.pk.name if qs.model._meta.pk else None if pk and pk in field_names: field_names.insert(1, field_names.pop(field_names.index(pk))) if ( include or features_input ): # if there is features_input, reduce fields to just the first 3 subset_field_names = field_names[:3] intersection = set(field_names) & set(include) subset_field_names += list(intersection) field_names = subset_field_names return field_names def get_feature_annotate_kwargs( registry: Registry, features: bool | list[str] | str | None, qs: QuerySet | None = None, ) -> tuple[dict[str, Any], QuerySet, dict[str, Any]]: from lamindb.models import ( Artifact, Feature, Record, RecordJson, Run, ULabel, ) from lamindb.models.feature import parse_dtype if registry not in {Artifact, Record, Run}: raise ValueError( f'include="features" is only applicable for Artifact, Record, and Run, not {registry.__name__}' ) feature_ids = [] if features == "queryset": ids_list = qs.values_list("id", flat=True) for obj in registry._meta.related_objects: related_name_attr = getattr(registry, obj.related_name, None) if related_name_attr is None or not hasattr(related_name_attr, "through"): continue link_model = related_name_attr.through if ( not hasattr(link_model, "feature") or link_model.__name__ == "Record_parents" ): continue filter_field = registry.__name__.lower() if not hasattr(link_model, filter_field): potential_fields = [] for field in link_model._meta.get_fields(): if field.is_relation and field.related_model is registry: potential_fields.append(field.name) if len(potential_fields) == 1: filter_field = potential_fields[0] else: continue links = link_model.objects.using(qs.db).filter( **{filter_field + "_id__in": ids_list} ) feature_ids_for_link_model = links.values_list("feature__id", flat=True) feature_ids += feature_ids_for_link_model if registry is Record: # this request is not strictly necessary, but it makes the resulting reshaped # dataframe consistent feature_ids += RecordJson.filter(record_id__in=ids_list).values_list( "feature__id", flat=True ) feature_ids = list(set(feature_ids)) # remove duplicates feature_qs = Feature.connect(None if qs is None else qs.db).filter( _dtype_str__isnull=False ) if isinstance(features, list): feature_qs = 
feature_qs.filter(name__in=features) if len(features) != feature_qs.count(): logger.warning( f"found features and passed features differ:\n - passed: {features}\n - found: {feature_qs.to_list('name')}" ) elif feature_ids: feature_qs = feature_qs.filter(id__in=feature_ids) else: feature_qs = feature_qs.filter( ~Q(_dtype_str__startswith="cat[") | Q(_dtype_str__startswith="cat[ULabel") | Q(_dtype_str__startswith="cat[Record") ) logger.important( f"queried for all categorical features of dtypes Record or ULabel and non-categorical features: ({len(feature_qs)}) {feature_qs.to_list('name')}" ) # Duplicate feature names map to ambiguous dataframe columns. We keep a single # feature per name for query annotation and warn loudly to surface this. feature_name_to_ids: dict[str, list[int]] = defaultdict(list) for feature in feature_qs.order_by("id"): feature_name_to_ids[feature.name].append(feature.id) duplicate_feature_names = { name: ids for name, ids in feature_name_to_ids.items() if len(ids) > 1 } if duplicate_feature_names: logger.warning( "detected duplicate feature names while building dataframe features; " "keeping the first feature per name by ascending id. " f"duplicates: {duplicate_feature_names}" ) unique_feature_ids = [ids[0] for ids in feature_name_to_ids.values()] feature_qs = feature_qs.filter(id__in=unique_feature_ids) # Get the categorical features cat_feature_types = { parse_dtype(feature._dtype_str)[0]["registry_str"] for feature in feature_qs if feature._dtype_str.startswith("cat[") or feature._dtype_str.startswith("list[cat[") } # fields to annotate cat_feature_fields = defaultdict(list) for feature in feature_qs: dtype_str = feature._dtype_str if dtype_str.startswith("cat[") or dtype_str.startswith("list[cat["): dtype_info = parse_dtype(dtype_str)[0] registry_str = dtype_info["registry_str"] field_name = dtype_info["field_str"] cat_feature_fields[registry_str].append(field_name) # Get relationships of labels and features link_models_on_models = { getattr( registry, obj.related_name ).through.__get_name_with_module__(): obj.related_model for obj in registry._meta.related_objects if obj.related_model.__get_name_with_module__() in cat_feature_types and hasattr(getattr(registry, obj.related_name), "through") and hasattr(getattr(registry, obj.related_name).through, "feature_id") } if registry is Artifact: link_models_on_models["ArtifactULabel"] = ULabel elif registry is Record: link_models_on_models["RecordRecord"] = Record link_attributes_on_models = { obj.related_name: link_models_on_models[ obj.related_model.__get_name_with_module__() ] for obj in registry._meta.related_objects if ( obj.related_model.__get_name_with_module__() in link_models_on_models and ( not obj.related_name.startswith("links_record") if registry is Record else True ) ) } # Prepare Django's annotate for features with filtering filtered_relations = {} annotate_kwargs = {} for link_attr, feature_type_model in link_attributes_on_models.items(): feature_type = feature_type_model.__get_name_with_module__() if link_attr == "links_project" and registry is Record: # we're only interested in _values_project when "annotating" records continue # Determine field name if registry in {Artifact, Run}: field_name = ( feature_type.split(".")[1] if "." 
in feature_type else feature_type ).lower() else: field_name = "value" # Determine if this value model needs branch filtering # Skip user relations (RecordUser, ArtifactUser don't have branch) should_filter_branch = link_attr not in {"values_user", "links_user"} # Create filtered relation for the value model value_relation_path = f"{link_attr}__{field_name}" filtered_value_relation_name = f"filtered_{link_attr}_{field_name}" if should_filter_branch: filtered_relations[filtered_value_relation_name] = FilteredRelation( value_relation_path, condition=Q( **{ f"{value_relation_path}__branch_id__in": get_default_branch_ids() } ), ) else: # No branch filtering needed filtered_relations[filtered_value_relation_name] = FilteredRelation( value_relation_path ) # Add annotation for feature name (feature doesn't have branch_id) annotate_kwargs[f"{link_attr}__feature__name"] = F( f"{link_attr}__feature__name" ) # Add annotations for categorical feature fields using the filtered relation for field in cat_feature_fields[feature_type]: annotate_kwargs[f"{link_attr}__{field_name}__{field}"] = F( f"{filtered_value_relation_name}__{field}" ) # Handle JSON values (no branch filtering needed) json_values_attribute = ( "json_values" if registry in {Artifact, Run} else "values_json" ) annotate_kwargs[f"{json_values_attribute}__feature__name"] = F( f"{json_values_attribute}__feature__name" ) annotate_kwargs[f"{json_values_attribute}__value"] = F( f"{json_values_attribute}__value" ) return annotate_kwargs, feature_qs, filtered_relations # https://claude.ai/share/16280046-6ae5-4f6a-99ac-dec01813dc3c def analyze_lookup_cardinality( model_class: SQLRecord, lookup_paths: list[str] | None ) -> dict[str, str]: """Analyze lookup cardinality. Analyzes Django model lookups to determine if they will result in one-to-one or one-to-many relationships when used in annotations. Args: model_class: The Django model class to analyze include: List of lookup paths (e.g. 
["created_by__name", "ulabels__name"]) Returns: Dictionary mapping lookup paths to either 'one' or 'many' """ result = {} # type: ignore if lookup_paths is None: return result for lookup_path in lookup_paths: parts = lookup_path.split("__") current_model = model_class is_many = False # Walk through each part of the lookup path for part in parts[:-1]: # Exclude the last part as it's an attribute field = None # Handle reverse relations for f in current_model._meta.get_fields(): if isinstance(f, ForeignObjectRel) and f.get_accessor_name() == part: field = f is_many = not f.one_to_one if hasattr(f, "field"): current_model = f.field.model break # Handle forward relations if field is None: field = current_model._meta.get_field(part) if isinstance(field, ManyToManyField): is_many = True current_model = field.remote_field.model elif isinstance(field, ForeignKey): current_model = field.remote_field.model result[lookup_path] = "many" if is_many else "one" return result def reorder_subset_columns_in_df( df: pd.DataFrame, column_order: list[str], position=3 ) -> pd.DataFrame: """Reorder subset of columns in dataframe to specified position.""" valid_columns = [col for col in column_order if col in df.columns] all_cols = df.columns.tolist() remaining_cols = [col for col in all_cols if col not in valid_columns] new_order = remaining_cols[:position] + valid_columns + remaining_cols[position:] return df[new_order] def encode_lamindb_fields_as_columns( registry: Registry, fields: str | list[str] ) -> str | dict[str, str]: """Encode laminDB specific fields in dataframe with __lamindb_{model_name}_{field_name}__. This is needed when reshaping dataframes with features to avoid conflicts between laminDB fields and feature names. """ def encode(field: str) -> str: return f"__lamindb_{registry._meta.model_name}_{field}__" registry_field_names = {field.name for field in registry._meta.concrete_fields} if isinstance(fields, str): return encode(fields) if fields in registry_field_names else fields return {field: encode(field) for field in fields if field in registry_field_names} # https://lamin.ai/laminlabs/lamindata/transform/BblTiuKxsb2g0003 # https://claude.ai/chat/6ea2498c-944d-4e7a-af08-29e5ddf637d2 def reshape_annotate_result( registry: Registry, df: pd.DataFrame, field_names: list[str], cols_from_include: dict[str, str] | None, feature_qs: QuerySet | None, ) -> pd.DataFrame: """Reshapes tidy table to wide format. 
Args: registry: The registry model (e.g., Artifact) df: Input dataframe with experimental data field_names: List of basic fields to include in result cols_from_include: Dict specifying additional columns to process with types ('one' or 'many'), e.g., {'ulabels__name': 'many', 'created_by__name': 'one'} feature_qs: QuerySet of features """ import pandas as pd from lamindb.models import Artifact, Run cols_from_include = cols_from_include or {} # Initialize result with basic fields (need a copy since we're modifying it) result = df[field_names].copy() pk_name = registry._meta.pk.name # ========== no features requested ========== if feature_qs is None or not feature_qs.exists(): if cols_from_include: result = process_cols_from_include(df, result, cols_from_include, pk_name) return result.drop_duplicates(subset=[pk_name]) # ========== process features ========== # Encode Django field names to avoid conflicts with feature names fields_map = encode_lamindb_fields_as_columns(registry, df.columns) df_encoded = df.rename(columns=fields_map) result_encoded = result.rename(columns=fields_map) pk_name_encoded = fields_map.get(pk_name) # type: ignore # --- Process JSON-stored feature values --- json_values_attribute = ( "json_values" if registry in {Artifact, Run} else "values_json" ) feature_name_col = f"{json_values_attribute}__feature__name" feature_value_col = f"{json_values_attribute}__value" if all(col in df_encoded.columns for col in [feature_name_col, feature_value_col]): # Separate dict and non-dict values for different aggregation strategies is_dict_or_list = df_encoded[feature_value_col].apply( lambda x: isinstance(x, (dict, list)) ) dict_or_list_df = df_encoded[is_dict_or_list] non_dict_or_list_df = df_encoded[~is_dict_or_list] # Aggregate: sets for non-dict values, first for dict values groupby_cols = [pk_name_encoded, feature_name_col] non_dict_or_list_features = non_dict_or_list_df.groupby(groupby_cols)[ feature_value_col ].agg(set) dict_or_list_features = dict_or_list_df.groupby(groupby_cols)[ feature_value_col ].agg("first") # Combine and pivot to wide format combined_features = pd.concat( [non_dict_or_list_features, dict_or_list_features] ) feature_values = combined_features.unstack().reset_index() if not feature_values.empty: result_encoded = result_encoded.join( feature_values.set_index(pk_name_encoded), on=pk_name_encoded, ) # --- Process categorical/linked features --- links_prefix = "links_" if registry in {Artifact, Run} else ("links_", "values_") links_features = [ col for col in df.columns if "feature__name" in col and col.startswith(links_prefix) ] if links_features: result_encoded = process_links_features( df_encoded, result_encoded, links_features, feature_qs, pk_name_encoded, ) # --- Apply type conversions based on feature metadata --- def extract_and_check_scalar(series: pd.Series) -> tuple[pd.Series, bool]: """Extract single elements and return if column is now scalar.""" has_multiple_values = False def extract_and_track(value): nonlocal has_multiple_values if not hasattr(value, "__len__") or isinstance(value, str): return value if len(value) != 1: has_multiple_values = True return value return next(iter(value)) extracted = series.apply(extract_and_track) is_scalar = not has_multiple_values return extracted, is_scalar for feature in feature_qs: if feature.name not in result_encoded.columns: continue result_encoded[feature.name], is_scalar = extract_and_check_scalar( result_encoded[feature.name] ) if is_scalar: dtype_str = feature._dtype_str if dtype_str.startswith("cat"): 
result_encoded[feature.name] = result_encoded[feature.name].astype( "category" ) if dtype_str == "datetime": # format and utc args are needed for mixed data # pandera expects timezone-naive datetime objects, and hence, # we need to localize with None result_encoded[feature.name] = pd.to_datetime( result_encoded[feature.name], format="ISO8601", utc=True ).dt.tz_localize(None) if dtype_str == "date": # see comments for datetime result_encoded[feature.name] = ( pd.to_datetime( result_encoded[feature.name], format="ISO8601", utc=True, ) .dt.tz_localize(None) .dt.date ) if dtype_str == "bool": result_encoded[feature.name] = result_encoded[feature.name].astype( "boolean" ) dtype_str = feature._dtype_str if dtype_str.startswith("list"): mask = result_encoded[feature.name].notna() result_encoded.loc[mask, feature.name] = result_encoded.loc[ mask, feature.name ].apply(lambda x: list(x) if isinstance(x, (set, list)) else [x]) if dtype_str == "dict": # this is the case when a dict is stored as a string; won't happen # within lamindb but might for external data if isinstance(result_encoded[feature.name].iloc[0], str): result_encoded[feature.name] = result_encoded[feature.name].apply( lambda x: ast.literal_eval(x) if isinstance(x, str) else x ) # --- Finalize result --- # Reorder columns to prioritize features result_encoded = reorder_subset_columns_in_df( result_encoded, feature_qs.to_list("name"), # type: ignore ) # Process additional included columns if cols_from_include: cols_from_include_encoded = { fields_map.get(k, k): v # type: ignore for k, v in cols_from_include.items() } result_encoded = process_cols_from_include( df_encoded, result_encoded, cols_from_include_encoded, pk_name_encoded ) # Decode field names back to original, except where conflicts exist # (e.g., if a feature is also named 'id', keep the encoded field name) decode_map = { encoded: original for original, encoded in fields_map.items() # type: ignore if original not in result_encoded.columns } return result_encoded.drop_duplicates(subset=[pk_name_encoded]).rename( columns=decode_map ) def process_links_features( df: pd.DataFrame, result: pd.DataFrame, feature_cols: list[str], feature_qs: QuerySet | None, pk_name: str = "id", ) -> pd.DataFrame: """Process links_XXX feature columns.""" import pandas as pd from lamindb.models.feature import parse_dtype # this loops over different entities that might be linked under a feature for feature_col in feature_cols: links_attribute = "links_" if feature_col.startswith("links_") else "values_" regex = f"{links_attribute}(.+?)__feature__name" prefix = re.match(regex, feature_col).group(1) value_cols = [ col for col in df.columns if col.startswith(f"{links_attribute}{prefix}__") and "feature__name" not in col ] if not value_cols: continue value_col = value_cols[0] feature_names = df[feature_col].unique() feature_names = feature_names[~pd.isna(feature_names)] for feature in feature_qs: if feature.name not in feature_names: continue if feature.name in result.columns: continue field_name = parse_dtype(feature._dtype_str)[0]["field_str"] value_col = [c for c in value_cols if c.endswith(f"__{field_name}")][0] mask = (df[feature_col] == feature.name) & df[value_col].notna() feature_values = df[mask].groupby(pk_name)[value_col].agg(set) result.insert(3, feature.name, result[pk_name].map(feature_values)) return result def process_cols_from_include( df: pd.DataFrame, result: pd.DataFrame, extra_columns: dict[str, str], pk_name: str = "id", ) -> pd.DataFrame: """Process additional columns based on their 
specified types.""" for col, col_type in extra_columns.items(): if col not in df.columns: continue if col in result.columns: continue values = df.groupby(pk_name)[col].agg(set if col_type == "many" else "first") result.insert(3, col, result[pk_name].map(values)) return result def _queryset_class_factory( registry: Registry, queryset_cls: type[models.QuerySet] ) -> type[models.QuerySet]: from lamindb.models import Artifact, ArtifactSet # If the model is Artifact, create a new class for BasicQuerySet or QuerySet that inherits from ArtifactSet. # This allows to add artifact specific functionality to all classes inheriting from BasicQuerySet. # Thus all query sets of artifacts (and only of artifacts) will have functions from ArtifactSet. if registry is Artifact and not issubclass(queryset_cls, ArtifactSet): new_cls = type( "Artifact" + queryset_cls.__name__, (queryset_cls, ArtifactSet), {} ) else: new_cls = queryset_cls return new_cls class BasicQuerySet(models.QuerySet): """Sets of records returned by queries. See Also: `django QuerySet `__ Examples: Any filter statement produces a query set:: queryset = Registry.filter(name__startswith="keyword") """ def __new__(cls, model=None, query=None, using=None, hints=None): # see comments in _queryset_class_factory return object.__new__(_queryset_class_factory(model, cls)) def _to_class( self, cls: type[models.QuerySet], copy: bool = True ) -> models.QuerySet: qs = self.all() if copy else self qs.__class__ = cls return qs def _to_basic(self, copy: bool = True) -> BasicQuerySet: cls = _queryset_class_factory(self.model, BasicQuerySet) return self._to_class(cls, copy) def _to_non_basic(self, copy: bool = True) -> QuerySet: cls = _queryset_class_factory(self.model, QuerySet) return self._to_class(cls, copy) @doc_args(SQLRecord.to_dataframe.__doc__) def to_dataframe( self, *, include: str | list[str] | None = None, features: str | list[str] | None = None, limit: int | None = 100, order_by: str | None = "-id", ) -> pd.DataFrame: """{}""" # noqa: D415 import pandas as pd if ( self.model.__name__ == "Artifact" and "kind" not in str(self.query.where) and self.query.low_mark == 0 # this should be 0, not None, it represent OFFSET = 0 and self.query.high_mark is None # this should be None, it represent _no_ LIMIT ): subset = self.exclude(**{"kind__startswith": "__lamindb"}) else: subset = self # check if queryset is already ordered is_ordered = bool(subset.query.order_by) # Only apply order_by if not already ordered and order_by is specified if not is_ordered and order_by is not None: subset = subset.order_by(order_by) is_truncated = False if limit is not None: # Fetch one extra row as a sentinel to detect truncation without count(). 
subset = subset[: limit + 1] if include is None: include_input = [] elif isinstance(include, str): include_input = [include] else: include_input = include if "features" in include_input: include_input.remove("features") if features is None: # indicate the default features with True # should refactor this in the future features = True # type: ignore features_input = [] if features is None else features include = get_backward_compat_filter_kwargs(subset, include_input) field_names = get_basic_field_names(subset, include_input, features_input) annotate_kwargs = {} filtered_relations = {} # type: ignore feature_qs = None if features: feature_annotate_kwargs, feature_qs, filtered_relations = ( get_feature_annotate_kwargs(subset.model, features, subset) ) annotate_kwargs.update(feature_annotate_kwargs) if include_input: include_input = include_input.copy()[::-1] # type: ignore include_kwargs = {s: F(s) for s in include_input if s not in field_names} annotate_kwargs.update(include_kwargs) if annotate_kwargs: id_subquery = subset.values("id") # for annotate, we want the queryset without filters so that joins don't affect the annotations query_set_without_filters = subset.model.objects.using(subset.db).filter( id__in=Subquery(id_subquery) ) if subset.query.order_by: # Apply the same ordering to the new queryset query_set_without_filters = query_set_without_filters.order_by( *subset.query.order_by ) if filtered_relations: query_set_without_filters = query_set_without_filters.annotate( **filtered_relations ) queryset = query_set_without_filters.annotate(**annotate_kwargs) else: queryset = subset # our main problem with this approach is that we lose ordering in categorical lists # we'd need to respect ordering through the primary key on the link table, but that's # another refactoring effort # we have the correct ordering in `features.get_values()`, though df = pd.DataFrame(queryset.values(*field_names, *list(annotate_kwargs.keys()))) if limit is not None and len(df) > limit: is_truncated = True df = df.iloc[:limit].copy() if len(df) == 0: df = pd.DataFrame({}, columns=field_names) return df cols_from_include = analyze_lookup_cardinality(self.model, include_input) # type: ignore df_reshaped = reshape_annotate_result( self.model, df, field_names, cols_from_include, feature_qs ) pk_name = self.model._meta.pk.name encoded_pk_name = encode_lamindb_fields_as_columns(self.model, pk_name) if encoded_pk_name in df_reshaped.columns: df_reshaped = df_reshaped.set_index(encoded_pk_name) else: pk_column_name = pk_name if pk_name in df.columns else f"{pk_name}_id" if pk_column_name in df_reshaped.columns: df_reshaped = df_reshaped.set_index(pk_column_name) # cast floats and ints where appropriate # this is currently needed because the UI writes into the JSON field through JS # and thus a `10` might be a float, not an int # note: also type casting within reshape_annotate_result if feature_qs is not None: for feature in feature_qs: if feature.name in df_reshaped.columns: current_dtype = df_reshaped[feature.name].dtype dtype_str = feature._dtype_str if dtype_str == "int" and not pd.api.types.is_integer_dtype( current_dtype ): df_reshaped[feature.name] = df_reshaped[feature.name].astype( "Int64" # nullable integer dtype ) elif dtype_str == "float" and not pd.api.types.is_float_dtype( current_dtype ): df_reshaped[feature.name] = df_reshaped[feature.name].astype( float ) if is_truncated: logger.warning( f"truncated query result to limit={limit} {self.model.__name__} objects" ) return df_reshaped 
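    # Illustrative usage sketch for `to_dataframe` (editor-added comment, not part of the
    # library code): it assumes an Artifact queryset and a feature named "experiment"
    # that exists in your instance. `include` annotates related fields, `features` pulls
    # feature values into columns, and rows beyond `limit` are truncated with a warning.
    #
    #     df = ln.Artifact.filter(suffix=".h5ad").to_dataframe(
    #         include=["created_by__handle"],
    #         features=["experiment"],
    #         limit=50,
    #         order_by="-created_at",
    #     )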
@deprecated(new_name="to_dataframe") def df( self, include: str | list[str] | None = None, features: bool | list[str] | str | None = None, ) -> pd.DataFrame: return self.to_dataframe(include=include, features=features) def describe(self, return_str: bool = False) -> str | None: """Describe the query set to learn about available fields.""" return self.model.describe(return_str=return_str) def delete(self, *args, permanent: bool | None = None, **kwargs): """Delete all records in the query set. Args: permanent: Whether to permanently delete the record (skips trash). Is only relevant for records that have the `branch` field. If `None`, uses soft delete for records that have the `branch` field, hard delete otherwise. Note: Calling `delete()` twice on the same queryset does NOT permanently delete in bulk operations. Use `permanent=True` for actual deletion. Examples: For a `QuerySet` object `qs`, call:: qs.delete() """ from lamindb.models import Artifact, Collection, Run, Storage, Transform if self.model is Run: if permanent is True: from .run import _permanent_delete_runs _permanent_delete_runs(self) return if permanent is not True: self.update(branch_id=-1) return if self.model is Transform: if permanent is True: from .transform import _permanent_delete_transforms _permanent_delete_transforms(self) return if permanent is not True: _adjust_is_latest_when_deleting_is_versioned(self) self.update(branch_id=-1, is_latest=False) return # Artifact, Collection: non-trivial delete behavior, handle in a loop if self.model in {Artifact, Collection}: for record in self: record.delete(*args, permanent=permanent, **kwargs) elif self.model is Storage: # storage does not have soft delete if permanent is False: raise ValueError( "Soft delete is not possible for Storage, " "use 'permanent=True' or 'permanent=None' for permanent deletion." ) for record in self: record.delete() else: if not permanent and hasattr(self.model, "branch_id"): logger.warning("moved records to trash (branch_id = -1)") self.update(branch_id=-1) else: if permanent is False: raise ValueError( f"Soft delete is not possible for {self.model.__name__}, " "use 'permanent=True' for permanent deletion." ) super().delete(*args, **kwargs) def to_list(self, field: str | None = None) -> list[SQLRecord] | list[str]: """Populate an (unordered) list with the results. Note that the order in this list is only meaningful if you ordered the underlying query set with `.order_by()`. Examples:: queryset.to_list() # list of records queryset.to_list("name") # list of values """ if field is None: return list(self) else: # list casting is necessary because values_list does not return a list return list(self.values_list(field, flat=True)) def first(self) -> SQLRecord | None: """If non-empty, the first result in the query set, otherwise ``None``. Examples:: queryset.first() """ if len(self) == 0: return None return self[0] def one(self) -> SQLRecord: """Exactly one result. Raises error if there are more or none.""" return one_helper(self) def one_or_none(self) -> SQLRecord | None: """At most one result. Returns it if there is one, otherwise returns ``None``. 
Examples:: ULabel.filter(name="benchmark").one_or_none() ULabel.filter(name="non existing label").one_or_none() """ return one_helper(self, raise_doesnotexist=False) @doc_args(_search.__doc__) def search(self, string: str, **kwargs): """{}""" # noqa: D415 return _search(cls=self, string=string, **kwargs) @doc_args(_lookup.__doc__) def lookup(self, field: StrField | None = None, **kwargs) -> NamedTuple: """{}""" # noqa: D415 return _lookup(cls=self, field=field, **kwargs) # ------------------------------------------------------------------------------------- # CanCurate # ------------------------------------------------------------------------------------- @doc_args(CanCurate.validate.__doc__) def validate(self, values: ListLike, field: str | StrField | None = None, **kwargs): """{}""" # noqa: D415 return _validate(cls=self, values=values, field=field, **kwargs) @doc_args(CanCurate.inspect.__doc__) def inspect(self, values: ListLike, field: str | StrField | None = None, **kwargs): """{}""" # noqa: D415 return _inspect(cls=self, values=values, field=field, **kwargs) @doc_args(CanCurate.standardize.__doc__) def standardize( self, values: Iterable, field: str | StrField | None = None, **kwargs ): """{}""" # noqa: D415 return _standardize(cls=self, values=values, field=field, **kwargs) # this differs from BasicQuerySet only in .filter and .get # QueryManager returns BasicQuerySet because it is problematic to redefine .filter and .get # for a query set used by the default manager class QuerySet(BasicQuerySet): """Sets of records returned by queries. Implements additional filtering capabilities. See Also: `django QuerySet `__ Examples: >>> ULabel(name="my label").save() >>> queryset = ULabel.filter(name="my label") >>> queryset # an instance of QuerySet """ def _handle_unknown_field(self, error: FieldError) -> None: """Suggest available fields if an unknown field was passed.""" if "Cannot resolve keyword" in str(error): field = str(error).split("'")[1] avail_fields = self.model.__get_available_fields__() fields = ", ".join(sorted(avail_fields)) raise FieldError( f"Unknown field '{field}'. Available fields: {fields}" ) from None raise error # pragma: no cover def get(self, idlike: int | str | None = None, **expressions) -> SQLRecord: """Query a single record. Raises error if there are more or none.""" is_run_input = expressions.pop("is_run_input", False) # artifacts_from_path and get accept only BasicQuerySet qs = self._to_class(BasicQuerySet, copy=True) if path := expressions.pop("path", None): from .artifact_set import ArtifactSet, artifacts_from_path if not isinstance(self, ArtifactSet): raise ValueError("Querying by path is only possible for artifacts.") qs = artifacts_from_path(qs, path) try: record = get(qs, idlike, **expressions) except ValueError as e: # Pass through original error for explicit id lookups if "Field 'id' expected a number" in str(e): if "id" in expressions: raise field = next(iter(expressions)) raise FieldError( f"Invalid lookup '{expressions[field]}' for {field}. Did you mean {field}__name?" 
) from None raise # pragma: no cover except FieldError as e: self._handle_unknown_field(e) raise # pragma: no cover if is_run_input is not False: # might be None or True or Run from .artifact import Artifact, track_run_input from .collection import Collection if isinstance(record, (Artifact, Collection)): track_run_input(record, is_run_input) return record def filter(self, *queries, **expressions) -> QuerySet: """Query a set of records.""" from lamindb.models import Artifact, Record, Run from .feature import FeaturePredicate feature_predicates = [q for q in queries if isinstance(q, FeaturePredicate)] queries = tuple(q for q in queries if not isinstance(q, FeaturePredicate)) registry = self.model is_status_filter_on_run = registry is Run and any( key.split("__")[0] == "status" for key in expressions ) can_filter_with_features = registry in { Artifact, Run, Record, } if ( not expressions.pop("_skip_filter_with_features", False) and can_filter_with_features and not is_status_filter_on_run ): from ._feature_manager import filter_with_features qs = filter_with_features(self, *queries, **expressions) else: # Suggest to use __name for related fields such as id when not passed for field, value in expressions.items(): if ( isinstance(value, str) and value.strip("-").isalpha() and "__" not in field and hasattr(registry, field) ): field_attr = getattr(registry, field) if hasattr(field_attr, "field") and field_attr.field.related_model: raise FieldError( f"Invalid lookup '{value}' for {field}. Did you mean {field}__name?" ) expressions = process_expressions(self, queries, expressions) # need to run a query if queries or expressions are not empty if queries or expressions: try: qs = super().filter(*queries, **expressions) except FieldError as e: self._handle_unknown_field(e) else: qs = self if feature_predicates: if not can_filter_with_features: raise FieldError( f"Feature predicates are only supported for Artifact, Run, and Record, not {registry.__name__}." ) from ._feature_manager import filter_with_feature_predicates # Run predicate translation on a BasicQuerySet clone. # - `copy=True` avoids mutating `qs.__class__` in place while we temporarily # switch query set type for this translation phase. # - We intentionally do not use `_skip_filter_with_features` here: that flag # guards the QuerySet.filter() feature dispatcher path, while this code # bypasses that dispatcher and executes predicate translation directly. qs = filter_with_feature_predicates( qs._to_class(BasicQuerySet, copy=True), feature_predicates )._to_class(type(qs), copy=False) return qs @final class NonInstantiableQuerySet: """Wrapper around QuerySet that prevents instantiation while preserving query methods.""" def __init__(self, qs: QuerySet, registry_name: str): self._qs = qs self._name = registry_name def __repr__(self) -> str: return f"" def __call__(self, *args, **kwargs): raise TypeError( f"Cannot instantiate {self._name} from DB. " f"Use {self._name}.filter(), {self._name}.get(), etc. to query records." ) def __getattr__(self, attr): return getattr(self._qs, attr) class ModuleNamespace: """Namespace for accessing registries from a specific schema module. Args: query_db: Parent DB instance. module_name: Name of the schema module (e.g., 'bionty', 'pertdb'). 
""" def __init__(self, query_db: DB, module_name: str): self._query_db = query_db self._module_name = module_name self._cache: dict[str, NonInstantiableQuerySet] = {} def __getattr__(self, name: str) -> NonInstantiableQuerySet: """Access a registry class from this schema module. Args: name: Registry class name (e.g., 'Gene', 'CellType'). Returns: QuerySet for the specified registry scoped to the parent instance. """ if name in self._cache: return self._cache[name] try: schema_module = import_module(self._module_name) if hasattr(schema_module, name): model_class = getattr(schema_module, name) queryset = model_class.connect(self._query_db._instance) wrapped = NonInstantiableQuerySet(queryset, name) self._cache[name] = wrapped return wrapped except (ImportError, AttributeError): pass raise AttributeError( f"Registry '{name}' not found in lamindb. Use .bt.{name} or .pertdb.{name} for schema-specific registries." ) def __dir__(self) -> list[str]: """Return list of available registries in this schema module.""" base_attrs = [attr for attr in object.__dir__(self) if not attr.startswith("_")] try: schema_module = import_module(self._module_name) if hasattr(schema_module, "__all__"): registries = set() for class_name in schema_module.__all__: model_class = getattr(schema_module, class_name, None) if model_class and hasattr(model_class, "connect"): registries.add(class_name) return sorted(set(base_attrs) | registries) except ImportError: pass return base_attrs class BiontyDB(ModuleNamespace): """Namespace for Bionty registries (Gene, CellType, Disease, etc.).""" Gene: QuerySet[Gene] # type: ignore[type-arg] Protein: QuerySet[Protein] # type: ignore[type-arg] CellType: QuerySet[CellType] # type: ignore[type-arg] Disease: QuerySet[Disease] # type: ignore[type-arg] Phenotype: QuerySet[Phenotype] # type: ignore[type-arg] Pathway: QuerySet[Pathway] # type: ignore[type-arg] Tissue: QuerySet[Tissue] # type: ignore[type-arg] CellLine: QuerySet[CellLine] # type: ignore[type-arg] CellMarker: QuerySet[CellMarker] # type: ignore[type-arg] Organism: QuerySet[Organism] # type: ignore[type-arg] ExperimentalFactor: QuerySet[ExperimentalFactor] # type: ignore[type-arg] DevelopmentalStage: QuerySet[DevelopmentalStage] # type: ignore[type-arg] Ethnicity: QuerySet[Ethnicity] # type: ignore[type-arg] class PertdbDB(ModuleNamespace): """Namespace for `PertDB` registries (Biologic, Compound, etc.).""" Biologic: QuerySet[Biologic] # type: ignore[type-arg] Compound: QuerySet[Compound] # type: ignore[type-arg] CompoundPerturbation: QuerySet[CompoundPerturbation] # type: ignore[type-arg] GeneticPerturbation: QuerySet[GeneticPerturbation] # type: ignore[type-arg] EnvironmentalPerturbation: QuerySet[EnvironmentalPerturbation] # type: ignore[type-arg] CombinationPerturbation: QuerySet[CombinationPerturbation] # type: ignore[type-arg] PerturbationTarget: QuerySet[PerturbationTarget] # type: ignore[type-arg] class DB: """Query any registry of any instance. Args: instance: Instance identifier in format "account/instance". 
Examples: Query objects from an instance:: db = ln.DB("laminlabs/cellxgene") Query artifacts and filter by `suffix`:: db.Artifact.filter(suffix=".h5ad").to_dataframe() Get a single artifact by uid:: artifact = db.Artifact.get("abcDEF123456") Query records and filter by name:: db.Record.filter(name__startswith="sample").to_dataframe() Get a cell type object:: t_cell = db.bionty.CellType.get(name="T cell") Create a lookup object to auto-complete all cell types in the database:: cell_types = db.bionty.CellType.lookup() Return a `DataFrame` with additional info:: db.Artifact.filter( suffix=".h5ad", description__contains="immune", size__gt=1e9, # size > 1GB cell_types__name__in=["B cell", "T cell"], ).order_by("created_at").to_dataframe( include=["cell_types__name", "created_by__handle"] # include additional info ).head() """ Artifact: QuerySet[Artifact] # type: ignore[type-arg] Collection: QuerySet[Collection] # type: ignore[type-arg] Transform: QuerySet[Transform] # type: ignore[type-arg] Run: QuerySet[Run] # type: ignore[type-arg] User: QuerySet[User] # type: ignore[type-arg] Storage: QuerySet[Storage] # type: ignore[type-arg] Feature: QuerySet[Feature] # type: ignore[type-arg] ULabel: QuerySet[ULabel] # type: ignore[type-arg] Record: QuerySet[Record] # type: ignore[type-arg] Schema: QuerySet[Schema] # type: ignore[type-arg] Project: QuerySet[Project] # type: ignore[type-arg] Reference: QuerySet[Reference] # type: ignore[type-arg] Branch: QuerySet[Branch] # type: ignore[type-arg] Space: QuerySet[Space] # type: ignore[type-arg] bionty: BiontyDB pertdb: PertdbDB def __init__(self, instance: str): self._instance = instance self._cache: dict[str, NonInstantiableQuerySet | BiontyDB | PertdbDB] = {} self._available_registries: set[str] | None = None owner, instance_name = ( ln_setup._connect_instance.get_owner_name_from_identifier(instance) ) instance_info = ln_setup._connect_instance._connect_instance( owner=owner, name=instance_name ) self._modules = ["lamindb"] + list(instance_info.modules) def __getattr__(self, name: str) -> NonInstantiableQuerySet | BiontyDB | PertdbDB: """Access a registry class or schema namespace for this database instance. Args: name: Registry class name (e.g., 'Artifact', 'Collection') or schema namespace ('bionty', 'pertdb'). Returns: QuerySet for the specified registry or schema namespace scoped to this instance. """ if name in self._cache: return self._cache[name] if name == "bionty": if "bionty" not in self._modules: raise AttributeError( f"Schema 'bionty' not available in instance '{self._instance}'." ) if "bionty" not in self._cache: namespace = BiontyDB(self, "bionty") self._cache["bionty"] = namespace return self._cache["bionty"] if name == "pertdb": if "pertdb" not in self._modules: raise AttributeError( f"Schema 'pertdb' not available in instance '{self._instance}'." ) if "pertdb" not in self._cache: namespace = PertdbDB(self, "pertdb") # type: ignore self._cache["pertdb"] = namespace return self._cache["pertdb"] try: lamindb_module = import_module("lamindb") if hasattr(lamindb_module, name): model_class = getattr(lamindb_module, name) queryset = model_class.connect(self._instance) wrapped = NonInstantiableQuerySet(queryset, name) self._cache[name] = wrapped return wrapped except (ImportError, AttributeError): pass raise AttributeError( f"Registry '{name}' not found in lamindb core registries. Use .bionty.{name} or .pertdb.{name} for schema-specific registries." 
) def __repr__(self) -> str: return f"DB('{self._instance}')" def __dir__(self) -> list[str]: """Return list of available registries and schema namespaces.""" base_attrs = [attr for attr in super().__dir__() if not attr.startswith("_")] lamindb_registries = set() try: lamindb_module = import_module("lamindb") if hasattr(lamindb_module, "__all__"): for class_name in lamindb_module.__all__: model_class = getattr(lamindb_module, class_name, None) if model_class and hasattr(model_class, "connect"): lamindb_registries.add(class_name) except ImportError: pass module_namespaces = set() if "bionty" in self._modules: module_namespaces.add("bionty") if "pertdb" in self._modules: module_namespaces.add("pertdb") return sorted(set(base_attrs) | lamindb_registries | module_namespaces) ================================================ FILE: lamindb/models/record.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, Any, overload import pgtrigger from django.conf import settings as django_settings from django.db import models from django.db.models import CASCADE, PROTECT from lamin_utils import logger from lamindb.base.fields import ( CharField, DateTimeField, ForeignKey, JSONField, TextField, ) from lamindb.base.utils import class_and_instance_method, strict_classmethod from lamindb.errors import FieldValidationError from ..base.uids import base62_16 from .artifact import Artifact from .can_curate import CanCurate from .collection import Collection from .feature import Feature, convert_to_pandas_dtype from .has_parents import HasParents, _query_relatives from .query_set import ( QuerySet, encode_lamindb_fields_as_columns, get_default_branch_ids, reorder_subset_columns_in_df, ) from .run import Run, TracksRun, TracksUpdates, User, current_run, current_user_id from .sqlrecord import BaseSQLRecord, HasType, IsLink, SQLRecord, _get_record_kwargs from .transform import Transform from .ulabel import ULabel if TYPE_CHECKING: from datetime import datetime import pandas as pd from ._feature_manager import FeatureManager from .block import RecordBlock from .project import Project, RecordProject, RecordReference, Reference from .query_manager import RelatedManager from .query_set import SQLRecordList from .schema import Schema # keep docstring in sync with test_record_docstring_examples in test_record_basics.py IMPORTS_UID = "W3WdiFRZTvTJajNp" SCHEMA_IMPORTS_UID = "DGZkj4yhGWMJE5fu" class RecordBatch: """DataFrame-backed batch created by :meth:`Record.from_dataframe`.""" def __init__( self, *, cls: type[Record], df: pd.DataFrame, resolved_type: Record, name_field: str, ) -> None: self._cls = cls self._df = df self._resolved_type = resolved_type self._name_field = name_field self._records: list[Record] | None = None def __len__(self) -> int: return len(self._df) @property def type(self) -> Record: return self._resolved_type def _build_records(self) -> list[Record]: import pandas as pd records: list[Record] = [] row_dicts = self._df.to_dict(orient="records") for row in row_dicts: if self._name_field in row: name = row.pop(self._name_field) elif "name" in row: name = row.pop("name") else: name = None if pd.api.types.is_scalar(name) and pd.isna(name): name = None features: dict[str, Any] = {} for key, value in row.items(): if pd.api.types.is_scalar(value) and pd.isna(value): continue features[key] = value record_kwargs: dict[str, Any] = {"type": self._resolved_type} if features: record_kwargs["features"] = features records.append(self._cls(name=name, 
**record_kwargs)) return records def save(self) -> SQLRecordList[Record]: """Persist all records and their feature values.""" from .query_set import SQLRecordList from .save import save as ln_save if self._records is None: self._records = self._build_records() ln_save(self._records) return SQLRecordList(self._records) class Record(SQLRecord, HasType, HasParents, CanCurate, TracksRun, TracksUpdates): """Flexible records with sheets & markdown pages. Useful for managing samples, donors, cells, compounds, sequences, and other custom entities with their features. If you just want a simple label, use :class:`~lamindb.ULabel`. Args: name: `str | None = None` A name. description: `str | None = None` A description. type: `Record | None = None` The type of this record. is_type: `bool = False` Whether this record is a type (a record that classifies other records). features: `dict[str | Feature, Any] | None = None` Lazy feature values to persist on `.save()` or `ln.save([...])`. schema: `Schema | None = None` A schema defining allowed features for records of this type. Only applicable when `is_type=True`. reference: `str | None = None` For instance, an external ID or a URL. reference_type: `str | None = None` For instance, `"url"`. See Also: :class:`~lamindb.Feature` Dimensions of measurement (e.g. column of a sheet, attribute of a record). :class:`~lamindb.ULabel` Like `Record`, just without the ability to store features. Examples: Create a **record** with a single feature:: # create a feature if you don't yet have one gc_content = ln.Feature(name="gc_content", dtype=float).save() # create a record to track a sample sample1 = ln.Record(name="Sample 1", features={"gc_content": 0.5}).save() # describe the record sample1.describe() Group several records under a **record type**, optionally constrained with a :class:`~lamindb.Schema`:: # create a flexible record type to track experiments experiment_type = ln.Record(name="Experiment", is_type=True).save() experiment1 = ln.Record(name="Experiment 1", type=experiment_type).save() # create a feature to link experiments experiment = ln.Feature(name="experiment", dtype=experiment_type).save() # create a record type to track samples -- constrain it with a schema schema = ln.Schema([experiment, gc_content.with_config(optional=True)], name="sample_schema").save() sample_sheet = ln.Record(name="Sample Sheet", is_type=True, schema=schema).save() # group the sample1 record under the sample sheet sample1.type = sample_sheet sample1.save() # reset the feature values for the record including the experiment sample1.features.set_values({ "gc_content": 0.5, "experiment": "Experiment 1", # automatically resolves by name, also accepts the experiment1 object }) Export all records under a type to a dataframe:: experiment_type.to_dataframe() #> __lamindb_record_name__ ... #> Experiment 1 ... #> Experiment 2 ... 
Import records from a dataframe :meth:`~lamindb.Record.from_dataframe`:: records = ln.Record.from_dataframe(df, type="my_df").save() # creates a type my_df with inferred schema If you try to set incomplete features in a record in a sheet, you'll get a validation error:: sample2 = ln.Record(name="Sample 2", type=sample_sheet).save() sample2.features.set_values({"gc_content": 0.6}) # raises ValidationError because experiment is missing Query records by features:: ln.Record.filter(gc_content=0.55) # exact match ln.Record.filter(gc_content__gt=0.5) # greater than ln.Record.filter(type=sample_sheet) # just the record on the sheet If your feature names are ambiguous, you can use a `Feature` object to disambiguate:: # to set feature values sample1.features.set_values({gc_content: 0.5}) # gc_content is the feature object # to query by feature values ln.Record.filter(gc_content == 0.5) # instead of gc_content=0.5 You can edit records like spreadsheets on the hub: .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/XSzhWUb0EoHOejiw0001.png :width: 800px Just like for :class:`~lamindb.ULabel`, you can also model **ontologies** through the `parents`/`children` attributes. .. dropdown:: What is the difference between `Record` and `SQLRecord`? The features of a `Record` are flexible: you can dynamically define features and add features to a record. The fields of a `SQLRecord` are static: you need to define them in code and then migrate the underlying database. See :class:`~lamindb.models.SQLRecord` or the glossary for more information: :term:`docs:record`. """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" if ( django_settings.DATABASES.get("default", {}).get("ENGINE") == "django.db.backends.postgresql" ): triggers = [ pgtrigger.Trigger( name="prevent_record_type_cycle", operation=pgtrigger.Update | pgtrigger.Insert, when=pgtrigger.Before, condition=pgtrigger.Condition("NEW.type_id IS NOT NULL"), func=""" -- Check for direct self-reference IF NEW.type_id = NEW.id THEN RAISE EXCEPTION 'Cannot set type: record cannot be its own type'; END IF; -- Check for cycles in the type chain IF EXISTS ( WITH RECURSIVE type_chain AS ( SELECT type_id, 1 as depth FROM lamindb_record WHERE id = NEW.type_id UNION ALL SELECT r.type_id, tc.depth + 1 FROM lamindb_record r INNER JOIN type_chain tc ON r.id = tc.type_id WHERE tc.depth < 100 ) SELECT 1 FROM type_chain WHERE type_id = NEW.id ) THEN RAISE EXCEPTION 'Cannot set type: would create a cycle'; END IF; RETURN NEW; """, ), ] # also see raw SQL constraints for `is_type` and `type` FK validity in migrations _name_field: str = "name" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=16, default=base62_16 ) """A universal random id, valid across DB instances.""" name: str = CharField(max_length=150, db_index=True, null=True) """Name or title of record (optional).""" type: Record | None = ForeignKey("self", PROTECT, null=True, related_name="records") """Type of record, e.g., `Sample`, `Donor`, `Cell`, `Compound`, `Sequence` ← :attr:`~lamindb.Record.records`. Allows to group records by type, e.g., all samples, all donors, all cells, all compounds, all sequences. 
""" records: RelatedManager[Record] """If a `type` (`is_type=True`), records of this `type`.""" description: str | None = TextField(null=True) """A description.""" reference: str | None = CharField(max_length=255, db_index=True, null=True) """A simple reference like a URL or external ID.""" reference_type: str | None = CharField(max_length=25, db_index=True, null=True) """Type of simple reference.""" extra_data: dict | None = models.JSONField(null=True) """Additional data in JSON format, not validated as features.""" schema: Schema | None = ForeignKey( "Schema", CASCADE, null=True, related_name="records" ) """A schema to enforce for a type ← :attr:`~lamindb.Schema.records`. This is analogous to the `schema` attribute of an `Artifact`. If `is_type` is `True`, the schema is used to enforce features for each record of this type. """ linked_records: RelatedManager[Record] = models.ManyToManyField( "Record", through="RecordRecord", symmetrical=False, related_name="linked_in_records", ) """Records linked in this record as a value ← :attr:`~lamindb.Record.linked_in_records`.""" linked_in_records: RelatedManager[Record] """Records linking this record as a value. Is reverse accessor for `linked_records`.""" parents: RelatedManager[Record] = models.ManyToManyField( "self", symmetrical=False, related_name="children" ) """Ontological parents of this record ← :attr:`~lamindb.Record.children`. You can build an ontology under a given `type`. For example, introduce a type `CellType` and model the hiearchy of cell types under it via `parents` and `children`. """ children: RelatedManager[Record] """Ontological children of this record. Is reverse accessor for `parents`.""" # this is handled manually here because we want to se the related_name attribute # (this doesn't happen via inheritance of TracksRun, everything else is the same) run: Run | None = ForeignKey( Run, PROTECT, related_name="output_records", null=True, default=current_run, editable=False, ) """Run that created the record ← :attr:`~lamindb.Run.output_records`.""" input_of_runs: RelatedManager[Run] = models.ManyToManyField( Run, related_name="input_records" ) """Runs that use this record as an input ← :attr:`~lamindb.Run.input_records`.""" artifacts: RelatedManager[Artifact] = models.ManyToManyField( Artifact, through="ArtifactRecord", related_name="records" ) """Artifacts annotated by this record ← :attr:`~lamindb.Artifact.records`.""" runs: RelatedManager[Run] = models.ManyToManyField( Run, through="RunRecord", related_name="records" ) """Runs annotated by this record ← :attr:`~lamindb.Run.records`.""" transforms: RelatedManager[Transform] = models.ManyToManyField( Transform, through="TransformRecord", related_name="records" ) """Transforms annotated by this record ← :attr:`~lamindb.Transform.records`.""" collections: RelatedManager[Collection] = models.ManyToManyField( Collection, through="CollectionRecord", related_name="records" ) """Collections annotated by this record ← :attr:`~lamindb.Collection.records`.""" projects: RelatedManager[Project] """Projects that annotate this record ← :attr:`~lamindb.Project.records`.""" references: RelatedManager[Reference] """References that annotate this record ← :attr:`~lamindb.Reference.records`.""" linked_transforms: RelatedManager[Transform] """Transforms linked in this record as values ← :attr:`~lamindb.Transform.linked_in_records`.""" linked_runs: RelatedManager[Run] """Runs linked in this record as values ← :attr:`~lamindb.Run.linked_in_records`.""" linked_ulabels: RelatedManager[ULabel] """ULabels 
linked in this record as values ← :attr:`~lamindb.ULabel.linked_in_records`.""" linked_artifacts: RelatedManager[Artifact] """Artifacts linked in this record as values ← :attr:`~lamindb.Artifact.linked_in_records`.""" linked_projects: RelatedManager[Project] """Projects linked in this record as values ← :attr:`~lamindb.Project.linked_in_records`.""" linked_references: RelatedManager[Reference] """References linked in this record as values ← :attr:`~lamindb.Reference.linked_in_records`.""" linked_collections: RelatedManager[Collection] """Collections linked in this record as values ← :attr:`~lamindb.Collection.linked_in_records`.""" linked_users: RelatedManager[User] """Users linked in this record as values ← :attr:`~lamindb.User.linked_in_records`.""" ablocks: RelatedManager[RecordBlock] """Attached blocks ← :attr:`~lamindb.RecordBlock.record`.""" values_json: RelatedManager[RecordJson] """JSON values `(record_id, feature_id, value)`.""" values_record: RelatedManager[RecordRecord] """Record values with their features `(record_id, feature_id, value_id)`.""" values_ulabel: RelatedManager[RecordULabel] """ULabel values with their features `(record_id, feature_id, value_id)`.""" values_user: RelatedManager[RecordUser] """User values with their features `(record_id, feature_id, value_id)`.""" values_transform: RelatedManager[RecordTransform] """Transform values with their features `(record_id, feature_id, value_id)`.""" values_run: RelatedManager[RecordRun] """Run values with their features `(record_id, feature_id, value_id)`.""" values_artifact: RelatedManager[RecordArtifact] """Artifact values with their features `(record_id, feature_id, value_id)`.""" values_collection: RelatedManager[RecordCollection] """Collection values with their features `(record_id, feature_id, value_id)`.""" values_reference: RelatedManager[RecordReference] """Reference values with their features `(record_id, feature_id, value_id)`.""" values_project: RelatedManager[RecordProject] """Project values with their features `(record_id, feature_id, value_id)`.""" @overload def __init__( self, name: str | None = None, type: Record | None = None, is_type: bool = False, features: dict[str | Feature, Any] | None = None, description: str | None = None, schema: Schema | None = None, reference: str | None = None, reference_type: str | None = None, ): ... @overload def __init__( self, *db_args, ): ... 
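    # The first overload above documents keyword-based construction (name, type, features, ...);
    # the `*db_args` overload covers re-instantiation from raw database column values, which the
    # implementation below detects via `len(args) == len(self._meta.concrete_fields)`.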
def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None if len(args) > 0: raise ValueError("Only one non-keyword arg allowed") name: str = kwargs.pop("name", None) type: str | None = kwargs.pop("type", None) is_type: bool = kwargs.pop("is_type", False) features: dict[str | Feature, Any] | None = kwargs.pop("features", None) description: str | None = kwargs.pop("description", None) schema: Schema | None = kwargs.pop("schema", None) reference: str | None = kwargs.pop("reference", None) reference_type: str | None = kwargs.pop("reference_type", None) branch = kwargs.pop("branch", None) branch_id = kwargs.pop("branch_id", 1) space = kwargs.pop("space", None) space_id = kwargs.pop("space_id", 1) _skip_validation = kwargs.pop("_skip_validation", False) _aux = kwargs.pop("_aux", None) if len(kwargs) > 0: valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Record)]) raise FieldValidationError( f"Only {valid_keywords} are valid keyword arguments" ) if schema and not is_type: logger.important("passing schema, treating as type") is_type = True if features is not None: self._features = features super().__init__( name=name, type=type, is_type=is_type, description=description, reference=reference, reference_type=reference_type, schema=schema, branch=branch, branch_id=branch_id, space=space, space_id=space_id, _skip_validation=_skip_validation, _aux=_aux, ) def save(self, *args, **kwargs) -> Record: super().save(*args, **kwargs) if hasattr(self, "_features"): pending_features = self._features self.features.add_values(pending_features) del self._features return self @strict_classmethod def from_dataframe( cls, df: pd.DataFrame, *, type: Record | str, name_field: str = "__lamindb_record_name__", ) -> RecordBatch: """Construct a dataframe-backed batch of records for bulk saving. Returns a :class:`RecordBatch`. Follow with `records.save()`. Args: df: A dataframe where rows represent records. type: Record type for all rows as either a `Record` object or a string. If passing a string, a new type with that name is created under `Imports` with an inferred schema from the dataframe. If that type name already exists, raise an error and pass an existing `Record` object for reuse. If the resolved type is a sheet (`type.schema is not None`), feature values are validated against that schema at save time. name_field: Column used for record names. Falls back to `name` if absent. If neither exists, records are created without names. Examples: Create a new type and import records:: records = ln.Record.from_dataframe(df, type="my_df").save() Import records into an existing type:: records = ln.Record.from_dataframe(df, type=sample_sheet).save() """ import pandas as pd from .schema import Schema if not isinstance(df, pd.DataFrame): raise TypeError("`df` needs to be a pandas DataFrame.") resolved_type: Record if isinstance(type, str): imports_type = cls.filter(uid=IMPORTS_UID).one_or_none() if imports_type is None: imports_type = cls(name="Imports", is_type=True) imports_type.uid = IMPORTS_UID imports_type = imports_type.save() existing_type = cls.filter( name=type, is_type=True, type=imports_type ).one_or_none() if existing_type is not None: raise ValueError( f"type '{type}' already exists under 'Imports', please pass it as a Record object to reuse." 
) imports_schema = Schema.filter(uid=SCHEMA_IMPORTS_UID).one_or_none() if imports_schema is None: imports_schema = Schema(name="Imports", is_type=True) imports_schema.uid = SCHEMA_IMPORTS_UID imports_schema = imports_schema.save() inferred_schema = Schema.from_dataframe(df, name=type) if inferred_schema is None: raise ValueError( "Could not infer a schema from dataframe columns. " "Ensure dataframe columns map to existing Features, or pass an existing Record type object." ) inferred_schema.type = imports_schema inferred_schema = inferred_schema.save() resolved_type = cls( name=type, is_type=True, type=imports_type, schema=inferred_schema, ).save() else: resolved_type = type if not resolved_type.is_type: raise ValueError("`type` needs to be a record type (`is_type=True`).") if resolved_type.name is None: raise ValueError("`type` needs to have a non-null `name`.") return RecordBatch( cls=cls, df=df, resolved_type=resolved_type, name_field=name_field, ) @property def features(self) -> FeatureManager: """Manage the linked feature values. For examples, see :class:`~lamindb.Record` or :class:`~lamindb.models.FeatureManager`. """ from ._feature_manager import FeatureManager return FeatureManager(self) @property def is_sheet(self) -> bool: """Check if record is a `sheet`, i.e., `self.is_type and self.schema is not None`.""" return self.schema is not None and self.is_type def query_parents(self) -> QuerySet: """Query all parents of a record recursively. While `.parents` retrieves the direct parents, this method retrieves all ancestors of the current record. """ return _query_relatives([self], "parents") # type: ignore def query_children(self) -> QuerySet: """Query all children of a record recursively. While `.children` retrieves the direct children, this method retrieves all descendants of a parent. """ return _query_relatives([self], "children") # type: ignore def query_records(self) -> QuerySet: """Query records of sub types. While `.records` retrieves the records with the current type, this method also retrieves sub types and the records with sub types of the current type. """ return _query_relatives([self], "records") # type: ignore def _set_export_run(self, is_run_input: bool | Run | None = None) -> None: from lamindb.core._context import context from lamindb.models import Run, Transform if isinstance(is_run_input, Run): run = is_run_input elif is_run_input in {True, None}: if context.run is None: transform, _ = Transform.objects.get_or_create( key="__lamindb_record_export__", kind="function" ) run = Run(transform).save() else: run = context.run else: run = None self._export_run = run @class_and_instance_method def to_dataframe( cls_or_self, recurse: bool = False, is_run_input: bool | Run | None = None, **kwargs, ) -> pd.DataFrame: """Export to a pandas DataFrame. This is roughly equivalent to:: ln.Record.filter(type=sample_type).to_dataframe(include="features") `to_dataframe()` ensures that the columns are ordered according to the schema of the type and encodes fields like `uid` and `name`. It will also track the record as an input to the current run. Args: recurse: Whether to include records of sub-types recursively. is_run_input: Whether to track the record as a run input. **kwargs: Keyword arguments passed to :meth:`~lamindb.models.QuerySet.to_dataframe`. 
""" import pandas as pd if isinstance(cls_or_self, type): return type(cls_or_self).to_dataframe(cls_or_self, **kwargs) # type: ignore if not cls_or_self.is_type: raise TypeError( "to_dataframe() can only be called on the class or on record type instance." ) self = cls_or_self assert self.is_type, "Only types can be exported as dataframes" # noqa: S101 branch_ids = get_default_branch_ids() qs = ( self.query_records() if recurse else self.records.filter(branch_id__in=branch_ids) ) logger.important(f"exporting {qs.count()} records of '{self.name}'") if "order_by" not in kwargs: kwargs["order_by"] = "id" df = qs.to_dataframe(features="queryset", limit=None, **kwargs) encoded_id = encode_lamindb_fields_as_columns(self.__class__, "id") encoded_uid = encode_lamindb_fields_as_columns(self.__class__, "uid") encoded_name = encode_lamindb_fields_as_columns(self.__class__, "name") # encode the django id, uid and name fields if df.index.name == "id": df.index.name = encoded_id if "uid" in df.columns and encoded_uid not in df.columns: df = df.rename(columns={"uid": encoded_uid}) if "name" in df.columns and encoded_name not in df.columns: df = df.rename(columns={"name": encoded_name}) if self.schema is not None: all_features = self.schema.members.all() desired_order = all_features.to_list("name") # only members is ordered! for feature in all_features: if feature.name not in df.columns: df[feature.name] = pd.Series( dtype=convert_to_pandas_dtype(feature._dtype_str) ) else: # sort alphabetically for now desired_order = df.columns[2:].tolist() desired_order.sort() df = reorder_subset_columns_in_df(df, desired_order, position=0) # type: ignore self._set_export_run(is_run_input=is_run_input) self._export_run.input_records.add(self) return df.sort_index() # order by id def to_artifact( self, key: str | None = None, suffix: str | None = None, is_run_input: bool | Run | None = None, **kwargs, ) -> Artifact: """Calls `to_dataframe()` to create an artifact. The format defaults to `.csv` unless the key specifies another format or suffix is passed. The `key` defaults to `sheet_exports/{self.name}{suffix}` unless a `key` is passed. Args: key: `str | None = None` The artifact key. suffix: `str | None = None` The suffix to append to the default key if no key is passed. is_run_input: Whether to track the record as a run input. **kwargs: Keyword arguments passed to :meth:`~lamindb.models.Record.to_dataframe`. """ assert self.is_type, "Only types can be exported as artifacts." assert key is None or suffix is None, "Only one of key or suffix can be passed." 
if key is None: suffix = ".csv" if suffix is None else suffix key = f"sheet_exports/{self.name}{suffix}" description = f": {self.description}" if self.description is not None else "" return Artifact.from_dataframe( self.to_dataframe(is_run_input=is_run_input, **kwargs), key=key, description=f"Export of sheet {self.uid}{description}", schema=self.schema, csv_kwargs={"index": False}, run=self._export_run, ).save() # for storing JSON values in records class RecordJson(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_json") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_recordjson") value: Any = JSONField(default=None, db_default=None) class Meta: app_label = "lamindb" # a list is modeled as a list in json, hence no multi-value association for the same feature unlike for # categorical/relational values unique_together = ("record", "feature") # for storing record-like values in records class RecordRecord(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_record") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_recordrecord") value: Record = ForeignKey(Record, PROTECT, related_name="links_record") class Meta: app_label = "lamindb" unique_together = ("record", "feature", "value") # for storing ulabel-like values in records class RecordULabel(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_ulabel") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_recordulabel") value: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_record") class Meta: # allows linking exactly one record to one ulabel per feature, because we likely don't want to have Many app_label = "lamindb" unique_together = ("record", "feature", "value") # for storing user-like values in records class RecordUser(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_user") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_recorduser") value: User = ForeignKey(User, PROTECT, related_name="links_record") class Meta: # allows linking exactly one record to one user per feature, because we likely don't want to have Many app_label = "lamindb" unique_together = ("record", "feature", "value") # for storing run-like values in records class RecordRun(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_run") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_recordrun") value: Run = ForeignKey(Run, PROTECT, related_name="links_in_record") class Meta: # allows linking several records to a single run for the same feature because we'll likely need this app_label = "lamindb" unique_together = ("record", "feature", "value") # for annotating runs with records class RunRecord(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) run: Run = ForeignKey(Run, CASCADE, related_name="links_record") record: Record = ForeignKey(Record, PROTECT, related_name="links_run") feature: Feature = ForeignKey( Feature, PROTECT, null=True, related_name="links_runrecord" ) created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) created_by: User = ForeignKey( "lamindb.User", PROTECT, 
default=current_user_id, related_name="+" ) class Meta: app_label = "lamindb" unique_together = ("run", "record", "feature") # for storing artifact-like values in records class RecordArtifact(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_artifact") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_recordartifact") value: Artifact = ForeignKey(Artifact, PROTECT, related_name="links_in_record") class Meta: app_label = "lamindb" unique_together = ("record", "feature", "value") # for annotating artifacts with records class ArtifactRecord(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_record") record: Record = ForeignKey(Record, PROTECT, related_name="links_artifact") feature: Feature = ForeignKey( Feature, PROTECT, null=True, related_name="links_artifactrecord" ) class Meta: app_label = "lamindb" unique_together = ("artifact", "record", "feature") # for storing collection-like values in records class RecordCollection(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_collection") feature: Feature = ForeignKey( Feature, PROTECT, related_name="links_recordcollection" ) value: Collection = ForeignKey(Collection, PROTECT, related_name="links_in_record") class Meta: app_label = "lamindb" unique_together = ("record", "feature", "value") # for annotating collections with records class CollectionRecord(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) collection: Collection = ForeignKey( Collection, CASCADE, related_name="links_record" ) record: Record = ForeignKey(Record, PROTECT, related_name="links_collection") feature: Feature = ForeignKey( Feature, PROTECT, null=True, related_name="links_collectionrecord" ) class Meta: app_label = "lamindb" unique_together = ("collection", "record", "feature") # for storing transform-like values in records class RecordTransform(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) record: Record = ForeignKey(Record, CASCADE, related_name="values_transform") feature: Feature = ForeignKey( Feature, PROTECT, related_name="links_recordtransform" ) value: Transform = ForeignKey(Transform, PROTECT, related_name="links_in_record") class Meta: app_label = "lamindb" unique_together = ("record", "feature", "value") # for annotating transforms with records class TransformRecord(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) transform: Transform = ForeignKey(Transform, CASCADE, related_name="links_record") record: Record = ForeignKey(Record, PROTECT, related_name="links_transform") feature: Feature = ForeignKey( Feature, PROTECT, null=True, related_name="links_transformrecord" ) created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now() ) created_by: User = ForeignKey( "lamindb.User", PROTECT, default=current_user_id, related_name="+" ) class Meta: app_label = "lamindb" unique_together = ("transform", "record", "feature") ================================================ FILE: lamindb/models/run.py ================================================ from __future__ import annotations import os import subprocess import sys from typing import TYPE_CHECKING, overload from django.db import models from django.db.models import ( CASCADE, PROTECT, Q, ) from lamin_utils 
import logger from lamindb_setup import _check_instance_setup from lamindb_setup import settings as setup_settings from lamindb.base.fields import ( BooleanField, CharField, DateTimeField, ForeignKey, TextField, ) from lamindb.base.users import current_user_id from lamindb.base.utils import strict_classmethod from ..base.types import RUN_CODE_TO_STATUS from ..base.uids import base62_16 from .can_curate import CanCurate from .query_set import BasicQuerySet, QuerySet from .sqlrecord import BaseSQLRecord, IsLink, SQLRecord if TYPE_CHECKING: from datetime import datetime from lamindb.base.types import RunStatus from ._feature_manager import FeatureManager from .artifact import Artifact from .block import RunBlock from .collection import Collection from .feature import Feature, JsonValue from .project import Project from .query_manager import RelatedManager from .record import Record from .transform import Transform from .ulabel import ULabel _TRACKING_READY: bool | None = None def current_run() -> Run | None: global _TRACKING_READY if not _TRACKING_READY: _TRACKING_READY = _check_instance_setup() if _TRACKING_READY: import lamindb # also see get_run() in core._data run = lamindb.core._functions.get_current_tracked_run() if run is None: run = lamindb.context.run return run else: return None class TracksRun(models.Model): """Base class tracking latest run, creating user, and `created_at` timestamp.""" class Meta: abstract = True created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = ForeignKey( "lamindb.User", PROTECT, editable=False, default=current_user_id, related_name="+", ) """Creator of record.""" run: Run | None = ForeignKey( "lamindb.Run", PROTECT, null=True, default=current_run, related_name="+" ) """Run that created record.""" class TracksUpdates(models.Model): """Base class tracking previous runs and `updated_at` timestamp.""" class Meta: abstract = True updated_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of last update to record.""" class User(BaseSQLRecord, CanCurate): """Users. Every :class:`~lamindb.models.SQLRecord` has a `created_by` field that links to the creating user. This registry is automatically populated with user identities from LaminHub in case the user authenticates. 
    Examples:

        Query a user by handle::

            user = ln.User.get(handle="testuser1")
    """

    class Meta:
        app_label = "lamindb"

    _name_field: str = "handle"

    id: int = models.AutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    uid: str = CharField(editable=False, unique=True, db_index=True, max_length=8)
    """Universal id, valid across DB instances."""
    handle: str = CharField(max_length=30, unique=True, db_index=True)
    """User handle, valid across DB instances (required)."""
    name: str | None = CharField(max_length=150, db_index=True, null=True)
    """Full name (optional)."""  # has to match hub specification, where it's also optional
    linked_in_records: RelatedManager[Record] = models.ManyToManyField(
        "Record", through="RecordUser", related_name="linked_users"
    )
    """This user is linked in these records as a value."""
    artifacts: RelatedManager[Artifact] = models.ManyToManyField(
        "Artifact",
        through="ArtifactUser",
        through_fields=("user", "artifact"),
        related_name="users",
    )
    """Artifacts annotated with this user."""
    created_artifacts: RelatedManager[Artifact]
    """Artifacts created by user."""
    created_transforms: RelatedManager[Transform]
    """Transforms created by user."""
    created_runs: RelatedManager[Run]
    """Runs created by user."""
    projects: RelatedManager[Project]
    """Projects this user is linked to (e.g. as member) ← :attr:`~lamindb.ProjectUser.project`."""
    created_at: datetime = DateTimeField(
        editable=False, db_default=models.functions.Now(), db_index=True
    )
    """Time of creation of object."""
    updated_at: datetime = DateTimeField(
        editable=False, db_default=models.functions.Now(), db_index=True
    )
    """Time of last update to object."""

    @overload
    def __init__(
        self,
        uid: str,
        handle: str,
        name: str | None,
    ): ...

    @overload
    def __init__(
        self,
        *db_args,
    ): ...

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)


class Run(SQLRecord, TracksUpdates):
    """Runs of transforms such as the executions of a script.

    Args:
        transform: :class:`~lamindb.Transform` A data transformation object.
        name: `str | None = None` A name.
        params: `dict | None = None` A dictionary of parameters.
        reference: `str | None = None` For instance, an external ID or URL.
        reference_type: `str | None = None` For instance, `redun_id`, `nextflow_id` or `url`.
        initiated_by_run: `Run | None = None` The `run` that triggers this `run`.

    See Also:
        :func:`~lamindb.track`
            Globally track a script or notebook run.
        :func:`~lamindb.step`
            Track a function execution with this decorator.

    Examples:

        Create a run record::

            ln.Transform(key="Cell Ranger", version="7.2.0", kind="pipeline").save()
            transform = ln.Transform.get(key="Cell Ranger", version="7.2.0")
            run = ln.Run(transform)

        Track a global run of a notebook or script::

            ln.track()
            ln.context.run  # global run object

        You can pass parameters to `Run(transform, params=params)` or add them later::

            run.params = {
                "learning_rate": 0.01,
                "input_dir": "s3://my-bucket/mydataset",
                "downsample": True,
                "preprocess_params": {
                    "normalization_type": "cool",
                    "subset_highlyvariable": True,
                },
            }
            run.save()

        In contrast to `.params`, features are indexed in the `Feature` registry and can
        reference relational categorical values.
If you want to link feature values, use:: run.features.set_values({ "experiment": "My experiment 1", }) Guide: :ref:`track-run-parameters` """ class Meta: app_label = "lamindb" _name_field: str = "started_at" id: int = models.BigAutoField(primary_key=True) """Internal id, valid only in one DB instance.""" # default uid was changed from base62_20 to base62_16 in 1.6.0 uid: str = CharField( editable=False, unique=True, db_index=True, max_length=20, default=base62_16 ) """Universal id, valid across DB instances.""" name: str | None = CharField(max_length=150, null=True, db_index=True) """An optional name for this run.""" description: str | None = TextField(null=True) """An optional description for this run.""" transform: Transform = ForeignKey("Transform", CASCADE, related_name="runs") """The transform that is being run ← :attr:`~lamindb.Transform.runs`.""" entrypoint: str | None = CharField(max_length=255, null=True, db_index=True) """The entrypoint of the transform. This could be a function name or the entry point of a CLI or workflow manager. """ started_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """The time this run started.""" finished_at: datetime | None = DateTimeField(db_index=True, null=True, default=None) """The time this run finished or aborted.""" # we don't want to make below a OneToOne because there could be the same trivial report # generated for many different runs report: Artifact | None = ForeignKey( "Artifact", PROTECT, null=True, related_name="_report_of", default=None ) """The report of this run such as an `.html` or `.txt` file.""" environment: Artifact | None = ForeignKey( "Artifact", PROTECT, null=True, related_name="_environment_of", default=None ) """The computational environment for this run. For instance, `Dockerfile`, `docker image`, `requirements.txt`, `environment.yml`, etc. """ plan: Artifact | None = ForeignKey( "Artifact", PROTECT, null=True, related_name="_plan_for_runs", default=None ) """The (agent) plan for this run. Also see: :attr:`~lamindb.Run.initiated_by_run`. """ input_records: RelatedManager[Record] """The collections serving as input for this run ← :attr:`~lamindb.Record.input_of_runs`.""" output_records: RelatedManager[Record] """The collections created in this run ← :attr:`~lamindb.Record.run`.""" input_artifacts: RelatedManager[Artifact] """The artifacts serving as input for this run ← :attr:`~lamindb.Artifact.input_of_runs`. """ output_artifacts: RelatedManager[Artifact] """The artifacts created in this run ← :attr:`~lamindb.Artifact.run`. This does **not** include recreated artifacts, which are tracked via :attr:`~lamindb.Run.recreated_artifacts`. If you want to query created + recreated artifacts, use :meth:`~lamindb.Run.query_output_artifacts` instead. """ recreated_artifacts: RelatedManager[Artifact] """The output artifacts that were recreated by this run ← :attr:`~lamindb.Artifact.recreating_runs`. Artifacts are *recreated* if they trigger a hash lookup match for an existing artifact. """ input_collections: RelatedManager[Collection] """The collections serving as input for this run ← :attr:`~lamindb.Collection.input_of_runs`.""" output_collections: RelatedManager[Collection] """The collections created in this run ← :attr:`~lamindb.Collection.run`.""" recreated_collections: RelatedManager[Collection] """The output collections that were recreated by this run ← :attr:`~lamindb.Collection.recreating_runs`. 
Collections are *recreated* if they trigger a hash lookup match for an existing collection. """ params: dict = models.JSONField(null=True) """Parameters (plain JSON values).""" json_values: RelatedManager[JsonValue] = models.ManyToManyField( "JsonValue", through="RunJsonValue", related_name="runs" ) """Feature-indexed JSON values ← :attr:`~lamindb.JsonValue.runs`.""" reference: str | None = CharField(max_length=255, db_index=True, null=True) """A reference like a URL or an external ID such as from a workflow manager.""" reference_type: str | None = CharField(max_length=25, db_index=True, null=True) """The type of the `reference` such as a workflow manager execution ID.""" cli_args: str | None = CharField(max_length=1024, null=True, default=None) """CLI arguments if the run was invoked from the command line.""" created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """The time of creation of this run.""" created_by: User = ForeignKey( "User", CASCADE, default=current_user_id, related_name="created_runs" ) """The creator of this run ← :attr:`~lamindb.User.created_runs`.""" ulabels: RelatedManager[ULabel] = models.ManyToManyField( "ULabel", through="RunULabel", related_name="runs" ) """The ulabels annotating this run ← :attr:`~lamindb.ULabel.runs`.""" initiated_by_run: Run | None = ForeignKey( "Run", CASCADE, null=True, related_name="initiated_runs", default=None ) """The run that initiated this run ← :attr:`~lamindb.Run.initiated_runs`.""" initiated_runs: RelatedManager[Run] """The runs that were initiated by this run.""" projects: RelatedManager[Project] """The projects annotating this run ← :attr:`~lamindb.Project.runs`.""" ablocks: RelatedManager[RunBlock] """Attached blocks ← :attr:`~lamindb.RunBlock.run`.""" records: RelatedManager[Record] """The records annotating this run ← :attr:`~lamindb.Record.runs`.""" linked_in_records: RelatedManager[Record] = models.ManyToManyField( "Record", through="RecordRun", related_name="linked_runs" ) """This run is linked in these records as a value ← :attr:`~lamindb.Record.linked_runs`.""" artifacts: RelatedManager[Artifact] = models.ManyToManyField( "Artifact", through="ArtifactRun", related_name="runs" ) """The artifacts annotated by this run ← :attr:`~lamindb.Artifact.runs`.""" linked_artifacts: RelatedManager[Artifact] = models.ManyToManyField( "Artifact", through="RunArtifact", related_name="linked_by_runs", ) """The artifacts linked by this run through the run's features ← :attr:`~lamindb.RunArtifact.artifact`.""" _is_consecutive: bool | None = BooleanField(null=True) """Indicates whether code was consecutively executed. Is relevant for notebooks.""" _status_code: int = models.SmallIntegerField( default=-3, db_default=-3, db_index=True, ) """Status code of the run. See the status property for mapping to string.""" @overload def __init__( self, transform: Transform, name: str | None = None, description: str | None = None, entrypoint: str | None = None, params: dict | None = None, reference: str | None = None, reference_type: str | None = None, initiated_by_run: Run | None = None, plan: Artifact | None = None, ): ... @overload def __init__( self, *db_args, ): ... 
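# The constructor below serves two code paths: when Django instantiates the
# record from the database, it passes one positional value per concrete field
# and we defer to the base constructor; otherwise we run the user-facing path
# that validates `transform` and the other keyword arguments.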
def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None # now we proceed with the user-facing constructor if len(args) > 1: raise ValueError("Only one non-keyword arg allowed: transform") transform: Transform = None if "transform" in kwargs or len(args) == 1: transform = kwargs.pop("transform") if len(args) == 0 else args[0] name: str | None = kwargs.pop("name", None) description: str | None = kwargs.pop("description", None) entrypoint: str | None = kwargs.pop("entrypoint", None) params: dict | None = kwargs.pop("params", None) reference: str | None = kwargs.pop("reference", None) reference_type: str | None = kwargs.pop("reference_type", None) initiated_by_run: Run | None = kwargs.pop("initiated_by_run", None) report: Artifact | None = kwargs.pop("report", None) plan: Artifact | None = kwargs.pop("plan", None) if transform is None: raise TypeError("Pass transform parameter") if transform._state.adding: raise ValueError("Please save transform record before creating a run") if not len(kwargs) == 0: raise ValueError( f"Only transform, name, description, params, reference, reference_type, initiated_by_run, plan can be passed, but you passed: {kwargs}" ) super().__init__( # type: ignore transform=transform, name=name, description=description, entrypoint=entrypoint, params=params, reference=reference, reference_type=reference_type, initiated_by_run=initiated_by_run, report=report, plan=plan, ) @property def status(self) -> RunStatus: """Run status. Get the status of the run: =========== ===== =========================== status code description =========== ===== =========================== `scheduled` -3 The run is scheduled. `restarted` -2 The run was restarted. `started` -1 The run has started. `completed` 0 The run completed successfully. `errored` 1 The run ended with an error. `aborted` 2 The run was aborted. =========== ===== =========================== The database stores the run status as an integer code in field `_status_code`. Example: See the status of a run:: run.status #> 'completed' Query by status:: ln.Run.filter(status="completed").to_dataframe() """ return RUN_CODE_TO_STATUS[self._status_code] @property def features(self) -> FeatureManager: """Manage annotations with features. For examples, see :class:`~lamindb.Run` or :class:`~lamindb.models.FeatureManager`. """ from ._feature_manager import FeatureManager return FeatureManager(self) def query_output_artifacts( self, include_recreated: bool = True ) -> QuerySet[Artifact]: """Query output artifacts including recreated ones. This runs the following query under the hood:: ln.Artifact.filter(ln.Q(run=self) | ln.Q(recreating_runs=self)).distinct() Args: include_recreated: If `True`, return both originally created and recreated artifacts. If `False`, return only originally created artifacts. Returns: A queryset of :class:`~lamindb.Artifact` objects. See Also: :attr:`~lamindb.Run.output_artifacts` `QuerySet` of originally created artifacts. :attr:`~lamindb.Run.recreated_artifacts` `QuerySet` of recreated artifacts. """ if not include_recreated: return self.output_artifacts.all() else: return self.output_artifacts.model.filter( Q(run=self) | Q(recreating_runs=self) ).distinct() @strict_classmethod def filter( cls, *queries, **expressions, ) -> QuerySet: """Query a set of artifacts. Args: *queries: `Q` expressions. **expressions: Params, fields, and values passed via the Django query syntax. 
See Also: - Guide: :doc:`docs:registries` Examples: Query by fields:: ln.Run.filter(key="examples/my_file.parquet") Query by params:: ln.Run.filter(hyperparam_x=100) """ # from Registry metaclass return type(cls).filter(cls, *queries, **expressions) def _permanent_delete_runs(runs: Run | QuerySet) -> None: """Execute bulk DELETE on runs and spawn artifact cleanup. Used by QuerySet and single-run paths.""" if isinstance(runs, Run): db = runs._state.db or "default" first_run_uid = runs.uid artifact_ids = [] if runs.environment_id: artifact_ids.append(runs.environment_id) if runs.report_id: artifact_ids.append(runs.report_id) super(BaseSQLRecord, runs).delete() else: db = runs.db or "default" rows = list(runs.values_list("uid", "report_id", "environment_id")) if rows: first_run_uid = rows[0][0] else: return artifact_ids = list({aid for r in rows for aid in r[1:3] if aid is not None}) super(BasicQuerySet, runs).delete() if artifact_ids: ids_str = ",".join(map(str, artifact_ids)) instance = db if db not in (None, "default") else setup_settings.instance.slug # spawn background subprocess to delete orphaned report/env artifacts cmd: list[str] = [ sys.executable, "-m", "lamindb.models._run_cleanup", "--instance", instance, "--ids", ids_str, "--run-uid", first_run_uid, ] proc = subprocess.Popen( cmd, start_new_session=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=os.environ, ) log_path = setup_settings.cache_dir / f"run_cleanup_logs_{first_run_uid}.txt" logger.important( f"spawned run cleanup subprocess (pid={proc.pid}): {log_path}\n {' '.join(cmd)}" ) class RunJsonValue(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) run: Run = ForeignKey(Run, CASCADE, related_name="links_jsonvalue") # we follow the lower() case convention rather than snake case for link models jsonvalue: JsonValue = ForeignKey("JsonValue", PROTECT, related_name="links_run") created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = ForeignKey( "lamindb.User", PROTECT, default=current_user_id, related_name="+" ) """Creator of record.""" class Meta: app_label = "lamindb" unique_together = ("run", "jsonvalue") # for storing artifact-like values in runs # compare RunRecord as opposed to RecordRun class RunArtifact(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) run: Run = ForeignKey(Run, CASCADE, related_name="values_artifact") artifact: Artifact = ForeignKey("Artifact", PROTECT, related_name="links_in_run") feature: Feature | None = ForeignKey( "Feature", PROTECT, null=True, related_name="links_runartifact", default=None ) class Meta: app_label = "lamindb" unique_together = ("run", "artifact", "feature") ================================================ FILE: lamindb/models/save.py ================================================ # ruff: noqa: TC004 from __future__ import annotations import os import shutil import traceback from collections import defaultdict from datetime import datetime from typing import TYPE_CHECKING from django.db import IntegrityError, transaction from django.utils.functional import partition from lamin_utils import logger from lamindb_setup.core.upath import LocalPathClasses, UPath from ..core._settings import settings from .sqlrecord import ( UNIQUE_FIELD_NAMES, SQLRecord, parse_violated_field_from_error_message, ) if TYPE_CHECKING: from collections.abc import Iterable from .artifact import Artifact def save( records: Iterable[SQLRecord], 
ignore_conflicts: bool | None = False, batch_size: int = 10000, ) -> None: """Bulk save records. Note: This is much faster than saving records using ``record.save()``. Warning: Bulk saving neither automatically creates related records nor updates existing records! Use ``record.save()`` for these use cases. Args: records: Multiple :class:`~lamindb.models.SQLRecord` objects. ignore_conflicts: If `True`, do not error if some records violate a unique or another constraint. However, it won't update the id fields of records in place. If you need records with ids, you need to query them from the database. batch_size: Number of records to process in each batch. Large batch sizes can improve performance but may lead to memory issues. Examples: Save a list of records: >>> labels = [ln.ULabel(f"Label {i}") for i in range(10)] >>> ln.save(labels) For a single record, use ``record.save()``: >>> transform = ln.Transform(key="My pipeline") >>> transform.save() Update a single existing record: >>> transform = ln.Transform.get("0Cb86EZj") >>> transform.description = "New description" >>> transform.save() """ from .artifact import Artifact if isinstance(records, SQLRecord): raise ValueError("Please use record.save() if saving a single record.") # previously, this was all set based, # but models without primary keys aren't hashable # we distinguish between artifacts and non-artifacts # for artifacts, we want to bulk-upload rather than upload one-by-one non_artifacts, artifacts = partition(lambda r: isinstance(r, Artifact), records) if non_artifacts: non_artifacts_old, non_artifacts_new = partition( lambda r: r._state.adding or r.pk is None, non_artifacts ) bulk_create( non_artifacts_new, ignore_conflicts=ignore_conflicts, batch_size=batch_size ) if non_artifacts_old: bulk_update(non_artifacts_old, batch_size=batch_size) non_artifacts_with_parents = [ r for r in non_artifacts_new if hasattr(r, "_parents") ] if len(non_artifacts_with_parents) > 0: # this can only happen within bionty right now!! # we might extend to core lamindb later from bionty.core import add_ontology add_ontology(non_artifacts_with_parents) records_with_lazy_features = [ record for record in non_artifacts if record.__class__.__name__ == "Record" and hasattr(record, "_features") ] if records_with_lazy_features: from ._feature_manager import bulk_set_features_in_records bulk_set_features_in_records(records_with_lazy_features) if artifacts: with transaction.atomic(): for record in artifacts: # will switch to True after the successful upload / saving if getattr(record, "_local_filepath", None) is not None and getattr( record, "_to_store", False ): record._storage_ongoing = True record._save_skip_storage() using_key = settings._using_key store_artifacts(artifacts, using_key=using_key) # this function returns None as potentially 10k records might be saved # refreshing all of them from the DB would mean a severe performance penalty # 2nd reason: consistency with Django Model.save(), which also returns None return None def bulk_create( records: Iterable[SQLRecord], ignore_conflicts: bool | None = False, batch_size: int = 10000, ): """Create records in batches for safety and performance. Args: records: Iterable of SQLRecord objects to create ignore_conflicts: Whether to ignore conflicts during creation batch_size: Number of records to process in each batch.
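    Example:

        A minimal sketch; the `ULabel` records here are hypothetical, and any
        unsaved `SQLRecord` objects work (they are grouped by registry and
        created in batches)::

            import lamindb as ln
            from lamindb.models.save import bulk_create

            labels = [ln.ULabel(name=f"label{i}") for i in range(3)]
            bulk_create(labels, ignore_conflicts=True, batch_size=10000)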
""" records_by_orm = defaultdict(list) for record in records: records_by_orm[record.__class__].append(record) for registry, records_list in records_by_orm.items(): total_records = len(records_list) model_name = registry.__name__ if total_records > batch_size: logger.important( f"starting creation of {total_records} {model_name} records in batches of {batch_size}" ) # Process records in batches for i in range(0, len(records_list), batch_size): batch = records_list[i : i + batch_size] batch_num = (i // batch_size) + 1 total_batches = (total_records + batch_size - 1) // batch_size if total_records > batch_size: logger.info( f"processing batch {batch_num}/{total_batches} for {model_name}: {len(batch)} records" ) try: registry.objects.bulk_create(batch, ignore_conflicts=ignore_conflicts) # handle unique constraint violations due to non-default branches except IntegrityError as e: error_msg = str(e) if any(field in error_msg for field in UNIQUE_FIELD_NAMES) and ( "UNIQUE constraint failed" in error_msg or "duplicate key value violates unique constraint" in error_msg ): unique_fields = parse_violated_field_from_error_message(error_msg) # Build tuples of unique field values for each record unique_field_values = [ tuple(getattr(r, field) for field in unique_fields) for r in batch ] # Build Q objects for multi-field lookup from django.db.models import Q q_objects = Q() for values in unique_field_values: field_kwargs = { unique_fields[i]: values[i] for i in range(len(unique_fields)) } q_objects |= Q(**field_kwargs) # Query against non-default branches pre_existing_records_not_main_branch = registry.objects.filter( q_objects ).exclude(branch_id=1) # Get the unique field value tuples that already exist pre_existing_value_tuples = { tuple(getattr(rec, field) for field in unique_fields) for rec in pre_existing_records_not_main_branch } # Records that can be saved normally (not in non-default branches) records_main_branch = [ r for r in batch if tuple(getattr(r, field) for field in unique_fields) not in pre_existing_value_tuples ] save(records_main_branch) # Now move the pre-existing records to the main branch if pre_existing_value_tuples: unique_fields_str = ", ".join(unique_fields) logger.warning( f"some {model_name} records with the same ({unique_fields_str}) already exist in non-default branches - moving them to the default branch" ) pre_existing_records_to_move = [ r for r in batch if tuple(getattr(r, field) for field in unique_fields) in pre_existing_value_tuples ] for record in pre_existing_records_to_move: record.save() else: raise e def bulk_update( records: Iterable[SQLRecord], ignore_conflicts: bool | None = False, batch_size: int = 10000, ): """Update records in batches for safety and performance. Args: records: Iterable of SQLRecord objects to update ignore_conflicts: Whether to ignore conflicts during update (currently unused but kept for consistency) batch_size: Number of records to process in each batch. If None, processes all at once. 
""" records_by_orm = defaultdict(list) for record in records: records_by_orm[record.__class__].append(record) for registry, records_list in records_by_orm.items(): total_records = len(records_list) model_name = registry.__name__ if total_records > batch_size: logger.warning( f"starting update for {total_records} {model_name} records in batches of {batch_size}" ) field_names = [ field.name for field in registry._meta.fields if (field.name != "created_at" and field.name != "id") ] # Process records in batches for i in range(0, len(records_list), batch_size): batch = records_list[i : i + batch_size] batch_num = (i // batch_size) + 1 total_batches = (total_records + batch_size - 1) // batch_size if total_records > batch_size: logger.info( f"processing batch {batch_num}/{total_batches} for {model_name}: {len(batch)} records" ) registry.objects.bulk_update(batch, field_names) # This is also used within Artifact.save() def check_and_attempt_upload( artifact: Artifact, using_key: str | None = None, access_token: str | None = None, print_progress: bool = True, **kwargs, ) -> Exception | None: # kwargs are propagated to .upload_from in the end # if Artifact object is either newly instantiated or replace() was called on # a local env it will have a _local_filepath and needs to be uploaded if getattr(artifact, "_local_filepath", None) is not None: try: storage_path, cache_path = upload_artifact( artifact, using_key, access_token=access_token, print_progress=print_progress, **kwargs, ) except Exception as exception: logger.warning(f"could not upload artifact: {artifact}") # clear dangling storages if we were actually uploading or saving if getattr(artifact, "_to_store", False): # avoid root-level import of core.storage module from ..core.storage import paths artifact._clear_storagekey = paths.auto_storage_key_from_artifact( artifact ) # type: ignore return exception # copies (if on-disk) or moves the temporary file (if in-memory) to the cache if os.getenv("LAMINDB_MULTI_INSTANCE") is None: # this happens only after the actual upload was performed # we avoid failing here in case any problems happen in copy_or_move_to_cache # because the cache copying or cleanup is not absolutely necessary try: copy_or_move_to_cache(artifact, storage_path, cache_path) except Exception as e: if not str(e).startswith( "[WinError 32] The process cannot access the file " "because it is being used by another process" ): # ignore WinError 32 error, this just means that the file is still open on save # it is saved at this point, so not a big deal if copy or move to cache fails # this mostly happens for run logs # just ignore without a warning logger.warning(f"A problem with cache on saving: {e}") # after successful upload, we should remove the attribute so that another call # call to save won't upload again, the user should call replace() then del artifact._local_filepath # returning None means proceed (either success or no action needed) return None def copy_or_move_to_cache( artifact: Artifact, storage_path: UPath, cache_path: UPath | None ): local_path = artifact._local_filepath # in-memory cases if local_path is None or not local_path.exists(): return None local_path = local_path.resolve() is_dir = local_path.is_dir() cache_dir = settings.cache_dir # just delete from the cache dir if storage_path is local if cache_path is None: if ( local_path.as_posix() != storage_path.as_posix() and cache_dir in local_path.parents ): if is_dir: shutil.rmtree(local_path) else: local_path.unlink() return None # non-local storage_path 
further if local_path != cache_path: if cache_path.exists(): logger.important_hint( f"replacing the existing cache path {cache_path.as_posix()}" ) if cache_path.is_dir(): shutil.rmtree(cache_path) else: cache_path.unlink() else: cache_path.parent.mkdir(parents=True, exist_ok=True) if cache_dir in local_path.parents: local_path.replace(cache_path) else: if is_dir: shutil.copytree(local_path, cache_path) else: shutil.copy(local_path, cache_path) # make sure that the cached version is older than the cloud one mts = datetime.now().timestamp() + 1.0 if is_dir: files = (file for file in cache_path.rglob("*") if file.is_file()) for file in files: os.utime(file, times=(mts, mts)) else: os.utime(cache_path, times=(mts, mts)) # This is also used within Artifact.save() def check_and_attempt_clearing( artifact: Artifact, raise_file_not_found_error: bool = True, using_key: str | None = None, ) -> Exception | None: # this is a clean-up operation after replace() was called # or if there was an exception during upload if hasattr(artifact, "_clear_storagekey"): try: if artifact._clear_storagekey is not None: # type: ignore # avoid root-level import of core.storage module from ..core.storage import paths delete_msg = paths.delete_storage_using_key( artifact, artifact._clear_storagekey, # type: ignore raise_file_not_found_error=raise_file_not_found_error, using_key=using_key, ) if delete_msg != "did-not-delete": logger.success( f"deleted stale object at storage key {artifact._clear_storagekey}" # type: ignore ) artifact._clear_storagekey = None # type: ignore except Exception as exception: return exception # returning None means proceed (either success or no action needed) return None def store_artifacts( artifacts: Iterable[Artifact], using_key: str | None = None ) -> None: """Upload artifacts in a list of database-committed artifacts to storage. If any upload fails, subsequent artifacts are cleaned up from the DB. 
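    Note:

        This assumes each artifact's metadata row was already committed via
        `_save_skip_storage()`, as done in `save()` above; if an upload fails,
        the rows of artifacts that were not stored yet are removed again via
        `_delete_skip_storage()`.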
""" from .artifact import Artifact exception: Exception | None = None # because uploads might fail, we need to maintain a new list of the succeeded uploads stored_artifacts = [] # upload new local artifacts for artifact in artifacts: # failure here sets ._clear_storagekey # for cleanup below exception = check_and_attempt_upload(artifact, using_key) if exception is not None: break stored_artifacts += [artifact] # update to show successful saving # only update if _storage_ongoing was set to True before # this should be a single transaction for the updates of all the artifacts # but then it would just abort all artifacts, even those successfully stored before # TODO: there should also be some kind of exception handling here # but this requires refactoring if artifact._storage_ongoing: artifact._storage_ongoing = False # each .save() is a separate transaction below super(Artifact, artifact).save() # if check_and_attempt_upload was successful # then this can have only ._clear_storagekey from .replace exception = check_and_attempt_clearing( artifact, raise_file_not_found_error=True, using_key=using_key ) if exception is not None: logger.warning(f"clean up of {artifact._clear_storagekey} failed") # type: ignore break if exception is not None: # clean up metadata for artifacts not uploaded to storage with transaction.atomic(): for artifact in artifacts: if artifact not in stored_artifacts: artifact._delete_skip_storage() # clean up storage after failure in check_and_attempt_upload exception_clear = check_and_attempt_clearing( artifact, raise_file_not_found_error=False, using_key=using_key ) if exception_clear is not None: logger.warning( f"clean up of {artifact._clear_storagekey} after the upload error failed" # type: ignore ) error_message = prepare_error_message(artifacts, stored_artifacts, exception) # this is bad because we're losing the original traceback # needs to be refactored - also, the orginal error should be raised raise RuntimeError(error_message) return None def prepare_error_message(records, stored_artifacts, exception) -> str: if len(stored_artifacts) == 0: error_message = ( "No entries were uploaded or committed" " to the database. 
See error message:\n\n" ) else: error_message = ( "The following entries have been" " successfully uploaded and committed to the database:\n" ) for record in stored_artifacts: error_message += ( f"- {', '.join(record.__repr__().split(', ')[:3]) + ', ...)'}\n" ) error_message += "\nSee error message:\n\n" error_message += f"{str(exception)}\n\n{traceback.format_exc()}" return error_message def upload_artifact( artifact, using_key: str | None = None, access_token: str | None = None, print_progress: bool = True, **kwargs, ) -> tuple[UPath, UPath | None]: """Store and add file and its linked entries.""" # kwargs are propagated to .upload_from in the end # can't currently use filepath_from_artifact here because it resolves to ._local_filepath # avoid root-level import of core.storage module from ..core.storage import paths storage_key = paths.auto_storage_key_from_artifact(artifact) storage_path, storage_settings = paths.attempt_accessing_path( artifact, storage_key, using_key=using_key, access_token=access_token ) if getattr(artifact, "_to_store", False): logger.save(f"storing artifact '{artifact.uid}' at '{storage_path}'") paths.store_file_or_folder( artifact._local_filepath, storage_path, print_progress=print_progress, **kwargs, ) if isinstance(storage_path, LocalPathClasses): cache_path = None else: cache_key = paths._cache_key_from_artifact_storage(artifact, storage_settings) cache_path = storage_settings.cloud_to_local_no_update( storage_path, cache_key=cache_key ) return storage_path, cache_path ================================================ FILE: lamindb/models/schema.py ================================================ from __future__ import annotations import warnings from typing import TYPE_CHECKING, Any, Type, overload import numpy as np from django.db import models from django.db.models import CASCADE, PROTECT, ManyToManyField, Q from lamin_utils import logger from lamindb_setup.core import deprecated from lamindb_setup.core.hashing import HASH_LENGTH, hash_string from lamindb.base.fields import ( BooleanField, CharField, ForeignKey, IntegerField, TextField, ) from lamindb.base.types import FieldAttr, ListLike from lamindb.base.uids import base62_16 from lamindb.base.utils import class_and_instance_method from lamindb.errors import FieldValidationError, InvalidArgument from lamindb.models.feature import parse_cat_dtype from ..errors import ValidationError from ._describe import describe_schema, format_rich_tree from ._relations import ( dict_related_model_to_related_name, get_related_name, ) from .can_curate import CanCurate from .feature import ( Feature, serialize_dtype, serialize_pandas_dtype, ) from .has_parents import _query_relatives from .query_set import QuerySet, SQLRecordList from .run import TracksRun, TracksUpdates from .sqlrecord import ( BaseSQLRecord, HasType, IsLink, Registry, SQLRecord, _get_record_kwargs, init_self_from_db, update_attributes, ) if TYPE_CHECKING: import pandas as pd from django.db.models.query_utils import DeferredAttribute from .artifact import Artifact from .block import SchemaBlock from .project import Project from .query_manager import RelatedManager from .record import Record NUMBER_TYPE = "num" DICT_KEYS_TYPE = type({}.keys()) # type: ignore def validate_features(features: list[SQLRecord]) -> SQLRecord: """Validate and return feature type.""" try: if len(features) == 0: raise ValueError("Provide list of features with at least one element") except TypeError: raise ValueError( "Please pass a ListLike of features, not a single feature" ) from 
None if not hasattr(features, "__getitem__"): raise TypeError("features has to be list-like") if not isinstance(features[0], SQLRecord): raise TypeError( "features has to store feature records! use .from_values() otherwise" ) feature_types = {feature.__class__ for feature in features} if len(feature_types) > 1: raise TypeError("schema can only contain a single type") for feature in features: if feature._state.adding: raise ValueError("Can only construct feature sets from validated features") return next(iter(feature_types)) # return value in set of cardinality 1 def get_features_config( features: list[SQLRecord] | tuple[SQLRecord, dict], ) -> tuple[list[SQLRecord], list[tuple[SQLRecord, dict]]]: """Get features and their config from the return of feature.with_config().""" features_list = [] configs = [] try: for feature in features: if isinstance(feature, tuple): features_list.append(feature[0]) configs.append(feature) # store the tuple in configs else: features_list.append(feature) return features_list, configs # type: ignore except TypeError: return features, configs # type: ignore class SchemaOptionals: """Manage and access optional features in a schema.""" def __init__(self, schema) -> None: self.schema = schema def get_uids(self) -> list[str]: """Get the uids of the optional features. Does **not** need an additional query to the database, while `get()` does. """ if ( self.schema._aux is not None and "af" in self.schema._aux and "1" in self.schema._aux["af"] ): return self.schema._aux["af"]["1"] else: return [] def get(self) -> QuerySet: """Get the optional features.""" uids = self.get_uids() if uids: return Feature.objects.filter(uid__in=uids).order_by("links_schema__id") else: return Feature.objects.none() # empty QuerySet def set(self, features: list[Feature]) -> None: """Set the optional features (overwrites whichever schemas are currently optional).""" if not isinstance(features, list) or not all( isinstance(f, Feature) for f in features ): raise TypeError("features must be a list of Feature records!") self.schema._aux = self.schema._aux or {} if len(features) > 0: self.schema._aux.setdefault("af", {})["1"] = [f.uid for f in features] def remove(self, features: Feature | list[Feature]) -> None: """Make one or multiple features required by removing them from the set of optional features.""" if not isinstance(features, list): features = [features] if not all(isinstance(f, Feature) for f in features): raise TypeError("features must be a list of Feature records!") if len(features) > 0: self.schema._aux = self.schema._aux or {} if "1" in self.schema._aux.get("af", {}): for feature in features: self.schema._aux["af"]["1"].remove(feature.uid) def add(self, features: Feature | list[Feature]) -> None: """Make one or multiple features optional by adding them to the set of optional features.""" self.schema._aux = self.schema._aux or {} if not isinstance(features, list): features = [features] if not all(isinstance(f, Feature) for f in features): raise TypeError("features must be a list of Feature records!") if len(features) > 0: if "1" not in self.schema._aux.setdefault("af", {}): self.set(features) else: self.schema._aux.setdefault("af", {})["1"].extend( [f.uid for f in features] ) KNOWN_SCHEMAS = { # by hash "kMi7B_N88uu-YnbTLDU-DA": "0000000000000000", # valid_features "1gocc_TJ1RU2bMwDRK-WUA": "0000000000000001", # valid_ensembl_gene_ids "UR_ozz2VI2sY8ckXop2RAg": "0000000000000002", # anndata_ensembl_gene_ids_and_valid_features_in_obs (itype='Composite') "aqGWHvyY49W_PHELUMiBMw": 
"0000000000000002", # anndata_ensembl_gene_ids_and_valid_features_in_obs (itype=None) } class Schema(SQLRecord, HasType, CanCurate, TracksRun, TracksUpdates): """Schemas of datasets such as column sets of dataframes. .. note:: To create a schema, at least one of the following parameters must be passed: - `features` - a list of `Feature` objects - `itype` - the identifier type, e.g., `Feature` or `bt.Gene.ensembl_gene_id` - `slots` - a dictionary mapping slots to :class:`~lamindb.Schema` objects, e.g., for an `AnnData`, `{"obs": Schema(...), "var.T": Schema(...)}` - `is_type=True` - a *schema type* to group schemas, e.g., "ProteinPanel" Args: features: `list[SQLRecord] | list[tuple[Feature, dict]] | None = None` Feature records, e.g., `[Feature(...), Feature(...)]` or features with their config, e.g., `[Feature(...).with_config(optional=True)]`. slots: `dict[str, Schema] | None = None` A dictionary mapping slot names to :class:`~lamindb.Schema` objects to create a _composite_ schema. name: `str | None = None` Name of the schema. description: `str | None = None` Description of the schema. itype: `str | None = None` Feature identifier type to validate against, e.g., `ln.Feature` or `bt.Gene.ensembl_gene_id`. Is automatically set to the type of the passed `features`. type: `Schema | None = None` Define schema types like `ln.Schema(name="ProteinPanel", is_type=True)`. is_type: `bool = False` Whether the schema is a type. index: `Feature | None = None` A `Feature` record to validate an index of a `DataFrame` and therefore also, e.g., `AnnData` obs and var indices. flexible: `bool | None = None` Whether to include any feature of the same `itype` during validation & annotation. If `features` is passed, defaults to `False` so that, e.g., additional columns of a `DataFrame` encountered during validation are disregarded. If `features` is not passed, defaults to `True`. otype: `str | None = None` An object type to define the structure of a composite schema, e.g., `"DataFrame"`, `"AnnData"`. dtype: `str | None = None` A `dtype` to assume for all features in the schema (e.g., "num", float, int). Defaults to `None` if `itype` is `Feature`. Otherwise to `"num"`, e.g., if `itype` is `bt.Gene.ensembl_gene_id`. minimal_set: `bool = True` Whether all passed features are required by default. See :attr:`~lamindb.Schema.optionals` for more-fine-grained control. maximal_set: `bool = False` Whether additional features are allowed. ordered_set: `bool = False` Whether features are required to be ordered. coerce: `bool | None = None` When True, attempts to coerce values to the specified dtype during validation, see :attr:`~lamindb.Schema.coerce`. n_members: `int | None = None` A manual way of specifying the number of features in the schema. Is inferred from `features` if passed. See Also: :meth:`~lamindb.Artifact.from_dataframe` Validate & annotate a `DataFrame` with a schema. :meth:`~lamindb.Artifact.from_anndata` Validate & annotate an `AnnData` with a schema. :meth:`~lamindb.Artifact.from_mudata` Validate & annotate an `MuData` with a schema. :meth:`~lamindb.Artifact.from_spatialdata` Validate & annotate a `SpatialData` with a schema. 
Examples: A schema with a single required feature:: import lamindb as ln schema = ln.Schema([ln.Feature(name="required_feature", dtype=str).save()]).save() A schema that constrains feature identifiers to be a valid feature names:: schema = ln.Schema(itype=ln.Feature) # uses Feature.name as identifier type Or valid Ensembl gene ids:: import bionty as bt schema = ln.Schema(itype=bt.Gene.ensembl_gene_id) A `flexible` schema that *requires* a single feature but *also* validates & annotates additional features with registered feature identifiers:: schema = ln.Schema( [ln.Feature(name="required_feature", dtype=str).save()], itype=ln.Feature, flexible=True, ).save() Create a schema type to group schemas:: protein_panel = ln.Schema(name="ProteinPanel", is_type=True).save() schema = ln.Schema(itype=bt.CellMarker, type=protein_panel).save() Validate the `index` of a `DataFrame`:: schema = ln.Schema( [ln.Feature(name="required_feature", dtype=str).save()], index=ln.Feature(name="sample", dtype=ln.ULabel).save(), ).save() Mark a feature as `optional`:: schema = ln.Schema([ ln.Feature(name="required_feature", dtype=str).save(), ln.Feature(name="feature2", dtype=int).save().with_config(optional=True), ]).save() Parse & validate feature identifier values:: schema = ln.Schema.from_values( adata.var["ensemble_id"], field=bt.Gene.ensembl_gene_id, organism="mouse", ).save() Create a schema from a `DataFrame`:: df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]}) schema = ln.Schema.from_dataframe(df) """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" # also see raw SQL constraints for `is_type` and `type` FK validity in migrations _name_field: str = "name" _aux_fields: dict[str, tuple[str, type]] = { "1": ("optionals", list[str]), "3": ("index_feature_uid", str), } id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField(max_length=16, unique=True, db_index=True, editable=False) """A universal id.""" name: str | None = CharField(max_length=150, null=True, db_index=True) """A name.""" description: str | None = TextField(null=True) """A description.""" n_members: int | None = IntegerField(null=True, default=None) """Number of features in the schema. None for type-like schemas.""" coerce: bool | None = BooleanField(null=True, default=None) """Whether dtypes should be coerced during validation. None for type-like schemas.""" flexible: bool | None = BooleanField(null=True, default=None) """Indicates how to handle validation and annotation in case features are not defined. Examples: Make a rigid schema flexible:: schema = ln.Schema.get(name="my_schema") schema.flexible = True schema.save() During schema creation:: # if you're not passing features but just defining the itype, defaults to flexible = True schema = ln.Schema(itype=ln.Feature).save() # schema.flexible is True # if you're passing features, defaults to flexible = False schema = ln.Schema( features=[ln.Feature(name="my_required_feature", dtype=int).save()], ) # schema.flexible is False # you can also validate & annotate features in addition to those that you're explicitly defining: schema = ln.Schema( features=[ln.Feature(name="my_required_feature", dtype=int).save()], flexible=True, ) # schema.flexible is True """ type: Schema | None = ForeignKey("self", PROTECT, null=True, related_name="schemas") """Type of schema. Allows to group schemas by type, e.g., all meassurements evaluating gene expression vs. 
protein expression vs. multi modal. You can define types via `ln.Schema(name="ProteinPanel", is_type=True)`. Here are a few more examples for type names: `'ExpressionPanel'`, `'ProteinPanel'`, `'Multimodal'`, `'Metadata'`, `'Embedding'`. """ schemas: RelatedManager[Schema] """Schemas of this type (can only be non-empty if `is_type` is `True`).""" itype: str | None = CharField( max_length=120, db_index=True, null=True, editable=False ) """A field of a registry that stores feature identifier types, e.g., `'Feature.name'` or `'bionty.Gene.ensembl_gene_id'`. Defaults to the default name field if a registry is passed (passing `Feature` would result in `Feature.name`). Depending on `itype`, `.members` stores, e.g., `Feature` or `bionty.Gene` records. """ otype: str | None = CharField(max_length=64, db_index=True, null=True) """Default Python object type, e.g., DataFrame, AnnData.""" _dtype_str: str | None = CharField(max_length=64, null=True, editable=False) """Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`. For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level. """ hash: str | None = CharField( max_length=HASH_LENGTH, db_index=True, null=True, editable=False ) """A hash of the set of feature identifiers. For a composite schema, the hash of hashes. """ minimal_set: bool = BooleanField(default=True, db_index=True, editable=False) """Whether all passed features are to be considered required by default (default `True`). Note that features that are explicitly marked as `optional` via `feature.with_config(optional=True)` are **not** required even if this `minimal_set` is true. """ ordered_set: bool = BooleanField(default=False, db_index=True, editable=False) """Whether features are required to be ordered (default `False`).""" maximal_set: bool = BooleanField(default=False, db_index=True, editable=False) """Whether all features present in the dataset must be in the schema (default `False`). If `False`, additional features are allowed to be present in the dataset. If `True`, no additional features are allowed to be present in the dataset. """ components: RelatedManager[Schema] = ManyToManyField( "self", through="SchemaComponent", symmetrical=False, related_name="composites" ) """Components of this schema ← :attr:`~lamindb.Schema.composites`.""" composites: RelatedManager[Schema] """The composite schemas that contains this schema as a component ← :attr:`~lamindb.Schema.components`. For example, an `AnnData` composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc. 
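    Example:

        A minimal sketch, assuming `obs_schema` and `var_schema` are saved
        :class:`~lamindb.Schema` objects::

            import lamindb as ln

            anndata_schema = ln.Schema(
                slots={"obs": obs_schema, "var.T": var_schema}, otype="AnnData"
            ).save()
            obs_schema.composites.all()  # contains anndata_schema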
""" features: RelatedManager[Feature] """The features contained in the schema ← :attr:`~lamindb.Feature.schemas`.""" artifacts: RelatedManager[Artifact] """The artifacts with an inferred schema that matches this schema ← :attr:`~lamindb.Artifact.schemas`.""" validated_artifacts: Artifact """The artifacts that were validated against this schema ← :attr:`~lamindb.Artifact.schema`.""" projects: RelatedManager[Project] """Linked projects ← :attr:`~lamindb.Project.schemas`.""" records: RelatedManager[Record] """Records that were annotated with this schema ← :attr:`~lamindb.Record.schema`.""" ablocks: RelatedManager[SchemaBlock] """Attached blocks ← :attr:`~lamindb.SchemaBlock.schema`.""" @overload def __init__( self, features: list[SQLRecord] | SQLRecordList | list[tuple[Feature, dict]] | None = None, *, slots: dict[str, Schema] | None = None, name: str | None = None, description: str | None = None, itype: str | Registry | FieldAttr | None = None, type: Schema | None = None, is_type: bool = False, index: Feature | None = None, flexible: bool | None = None, otype: str | None = None, dtype: str | Type[int | float | str] | None = None, # noqa minimal_set: bool = True, maximal_set: bool = False, ordered_set: bool = False, coerce: bool | None = None, n_members: int | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None if len(args) > 1: raise ValueError("Only one non-keyword arg allowed: features") features: list[SQLRecord] | None = ( args[0] if args else kwargs.pop("features", []) ) index: Feature | None = kwargs.pop("index", None) slots: dict[str, Schema] = kwargs.pop("slots", {}) name: str | None = kwargs.pop("name", None) description: str | None = kwargs.pop("description", None) itype: str | SQLRecord | DeferredAttribute | None = kwargs.pop("itype", None) flexible: bool | None = kwargs.pop("flexible", None) type: Feature | None = kwargs.pop("type", None) is_type: bool = kwargs.pop("is_type", False) otype: str | None = kwargs.pop("otype", None) dtype: str | None = kwargs.pop("dtype", None) minimal_set: bool = kwargs.pop("minimal_set", True) ordered_set: bool = kwargs.pop("ordered_set", False) maximal_set: bool = kwargs.pop("maximal_set", False) if "coerce_dtype" in kwargs: warnings.warn( "`coerce_dtype` argument was renamed to `coerce` and will be removed in a future release.", DeprecationWarning, stacklevel=2, ) coerce_dtype = kwargs.pop("coerce_dtype") else: coerce_dtype = kwargs.pop("coerce", None) using: str | None = kwargs.pop("using", None) if "n" in kwargs: warnings.warn( "`n` argument was renamed to `n_members` and will be removed in a future release.", DeprecationWarning, stacklevel=2, ) n_features = kwargs.pop("n") else: n_features = kwargs.pop("n_members", None) kwargs.pop("branch", None) kwargs.pop("branch_id", 1) kwargs.pop("space", None) kwargs.pop("space_id", 1) # backward compat if not slots: if "components" in kwargs: logger.warning( "`components` as a keyword argument is deprecated, please use `slots` instead" ) slots = kwargs.pop("components") if kwargs: valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Schema)]) raise FieldValidationError( f"Only {valid_keywords} are valid keyword arguments" ) ( features, validated_kwargs, optional_features, features_registry, flexible, ) = self._validate_kwargs_calculate_hash( features=features, index=index, slots=slots, name=name, description=description, itype=itype, 
flexible=flexible, type=type, is_type=is_type, otype=otype, dtype=dtype, minimal_set=minimal_set, ordered_set=ordered_set, maximal_set=maximal_set, coerce=coerce_dtype, n_features=n_features, ) if not features and not slots and not is_type and not itype: raise InvalidArgument( "Please pass features or slots or itype or set is_type=True" ) if not is_type: schema = ( Schema.objects.using(using) .filter( ~Q(branch_id=-1), hash=validated_kwargs["hash"], ) .one_or_none() ) if schema is not None: logger.important(f"returning schema with same hash: {schema}") init_self_from_db(self, schema) update_attributes(self, validated_kwargs) self.optionals.set(optional_features) return None self._slots: dict[str, Schema] = {} if features: self._features = (get_related_name(features_registry), features) # type: ignore if slots: for slot_key, component in slots.items(): if component._state.adding: raise InvalidArgument( f"schema for {slot_key} {component} must be saved before use" ) self._slots = slots if validated_kwargs["hash"] in KNOWN_SCHEMAS: validated_kwargs["uid"] = KNOWN_SCHEMAS[validated_kwargs["hash"]] else: validated_kwargs["uid"] = base62_16() super().__init__(**validated_kwargs) def query_schemas(self) -> QuerySet: """Query schemas of sub types. While `.schemas` retrieves the schemas with the current type, this method also retrieves sub types and the schemas with sub types of the current type. """ return _query_relatives([self], "schemas") # type: ignore def _validate_kwargs_calculate_hash( self, features: list[SQLRecord], index: Feature | None, slots: dict[str, Schema], name: str | None, description: str | None, itype: str | SQLRecord | DeferredAttribute | None, flexible: bool | None, type: Feature | None, is_type: bool, otype: str | None, dtype: str | None, minimal_set: bool, ordered_set: bool, maximal_set: bool, coerce: bool | None, n_features: int | None, optional_features_manual: list[Feature] | None = None, ) -> tuple[list[Feature], dict[str, Any], list[Feature], Registry, bool]: optional_features = [] features_registry: Registry = None if itype is not None: if itype != "Composite": itype = serialize_dtype(itype, is_itype=True) else: warnings.warn( "please do not pass the deprecated itype='Composite'", stacklevel=2 ) if index is not None: if not isinstance(index, Feature): raise TypeError("index must be a Feature") features.insert(0, index) if features: features, configs = get_features_config(features) features_registry = validate_features(features) itype_compare = features_registry.__get_name_with_module__() if itype is not None: assert itype.startswith(itype_compare), str(itype_compare) # noqa: S101 else: itype = itype_compare if n_features is not None: if n_features != len(features): logger.important(f"updating to n {len(features)} features") n_features = len(features) if features_registry == Feature: optional_features = [ config[0] for config in configs if config[1].get("optional") ] if optional_features: assert optional_features_manual is None # noqa: S101 if not optional_features and optional_features_manual is not None: optional_features = optional_features_manual # n_features stays None if no features passed (flexible schema) if dtype is None: dtype = None if itype is not None and itype == "Feature" else NUMBER_TYPE else: dtype = get_type_str(dtype) if slots: if otype is None: raise InvalidArgument("Please pass otype != None for composite schemas") flexible_default = n_features is None if flexible is None: flexible = flexible_default if itype is not None and not isinstance(itype, 
str): itype_str = serialize_dtype(itype, is_itype=True) else: itype_str = itype validated_kwargs = { "name": name, "description": description, "type": type, "is_type": is_type, "_dtype_str": dtype, "otype": otype, "n_members": n_features, "itype": itype_str, "minimal_set": minimal_set, "ordered_set": ordered_set, "maximal_set": maximal_set, "coerce": coerce if coerce else None, "flexible": flexible, } n_features_default = ( None # None means flexible schema (no fixed number of features) ) coerce_default = False aux_dict: dict[str, dict[str, bool | str | list[str]]] = {} # optional features (key "1") - remains in _aux if optional_features: aux_dict.setdefault("af", {})["1"] = [f.uid for f in optional_features] # index feature (key "3") - remains in _aux if index is not None: aux_dict.setdefault("af", {})["3"] = index.uid if aux_dict: validated_kwargs["_aux"] = aux_dict HASH_CODE = { "_dtype_str": "a", "itype": "b", "minimal_set": "c", "ordered_set": "d", "maximal_set": "e", "flexible": "f", "coerce_dtype": "g", "n": "h", "optional": "i", "features_hash": "j", "index": "k", "slots_hash": "l", } # we do not want pure informational annotations like otype, name, type, is_type, otype to be part of the hash hash_args = ["_dtype_str", "itype", "minimal_set", "ordered_set", "maximal_set"] list_for_hashing = [ f"{HASH_CODE[arg]}={validated_kwargs[arg]}" for arg in hash_args if validated_kwargs[arg] is not None ] # only include in hash if not default so that it's backward compatible with records for which flexible was never set if flexible != flexible_default: list_for_hashing.append(f"{HASH_CODE['flexible']}={flexible}") if coerce is not None and coerce != coerce_default: list_for_hashing.append(f"{HASH_CODE['coerce_dtype']}={coerce}") if n_features is not None and n_features != n_features_default: list_for_hashing.append(f"{HASH_CODE['n']}={n_features}") if index is not None: list_for_hashing.append(f"{HASH_CODE['index']}={index.uid}") if features: if optional_features: feature_list_for_hashing = [ feature.uid if feature not in set(optional_features) else f"{feature.uid}({HASH_CODE['optional']})" for feature in features ] else: feature_list_for_hashing = [feature.uid for feature in features] if not ordered_set: # order matters if ordered_set is True, if not sort feature_list_for_hashing = sorted(feature_list_for_hashing) features_hash = hash_string(":".join(feature_list_for_hashing)) list_for_hashing.append(f"{HASH_CODE['features_hash']}={features_hash}") if slots: slots_list_for_hashing = sorted( [f"{key}={component.hash}" for key, component in slots.items()] ) slots_hash = hash_string(":".join(slots_list_for_hashing)) list_for_hashing.append(f"{HASH_CODE['slots_hash']}={slots_hash}") if is_type: validated_kwargs["hash"] = None else: self._list_for_hashing = sorted(list_for_hashing) schema_hash = hash_string(":".join(self._list_for_hashing)) validated_kwargs["hash"] = schema_hash return ( features, validated_kwargs, optional_features, features_registry, flexible, ) @classmethod def from_values( # type: ignore cls, values: ListLike, field: FieldAttr = Feature.name, dtype: str | None = None, name: str | None = None, mute: bool = False, organism: SQLRecord | str | None = None, source: SQLRecord | None = None, raise_validation_error: bool = True, ) -> Schema: """Create feature set for validated features. Args: values: A list of values, like feature names or ids. field: The field of a reference registry to map values. dtype: The simple dtype. 
Defaults to `None` if reference registry is :class:`~lamindb.Feature`, defaults to `"float"` otherwise. name: A name. organism: An organism to resolve gene mapping. source: A public ontology to resolve feature identifier mapping. raise_validation_error: Whether to raise a validation error if some values are not valid. Raises: ValidationError: If some values are not valid. Example: :: import lamindb as ln import bionty as bt features = [ln.Feature(name=feat, dtype="str").save() for feat in ["feat11", "feat21"]] schema = ln.Schema.from_values(features) genes = ["ENSG00000139618", "ENSG00000198786"] schema = ln.Schema.from_values(features, bt.Gene.ensembl_gene_id, "float") """ if not isinstance(field, FieldAttr): raise TypeError( "Argument `field` must be a SQLRecord field, e.g., `Feature.name`" ) if len(values) == 0: raise ValueError("Provide a list of at least one value") if isinstance(values, DICT_KEYS_TYPE): values = list(values) registry = field.field.model if registry != Feature and dtype is None: dtype = NUMBER_TYPE logger.debug("setting feature set to 'number'") validated = registry.validate(values, field=field, mute=mute, organism=organism) values_array = np.array(values) validated_values = values_array[validated] if validated.sum() != len(values): not_validated_values = values_array[~validated] msg = ( f"These values could not be validated: {not_validated_values.tolist()}\n" f"If there are no typos, add them to their registry: {registry.__name__}" ) if raise_validation_error: raise ValidationError(msg) elif len(validated_values) == 0: return None # temporarily return None here validated_features = registry.from_values( validated_values, field=field, organism=organism, source=source, ) schema = Schema( features=validated_features, name=name, dtype=get_type_str(dtype), ) return schema @classmethod def from_dataframe( cls, df: pd.DataFrame, field: FieldAttr = Feature.name, name: str | None = None, mute: bool = False, organism: SQLRecord | str | None = None, source: SQLRecord | None = None, ) -> Schema | None: """Create schema for valid columns.""" registry = field.field.model validated = registry.validate( df.columns, field=field, mute=mute, organism=organism ) if validated.sum() == 0: if not mute: logger.warning("no validated features, skip creating schema") return None if registry == Feature: validated_features = Feature.from_values( # type: ignore df.columns, field=field, organism=organism ) schema = Schema( list(validated_features), name=name, dtype=None, otype="DataFrame" ) else: dtypes = [col.dtype for (_, col) in df.loc[:, validated].items()] if len(set(dtypes)) != 1: raise ValueError(f"data types are heterogeneous: {set(dtypes)}") dtype = serialize_pandas_dtype(dtypes[0]) validated_features = registry.from_values( df.columns[validated], field=field, organism=organism, source=source, ) schema = Schema( features=list(validated_features), name=name, dtype=get_type_str(dtype), ) return schema @classmethod @deprecated("from_dataframe") def from_df( cls, df: pd.DataFrame, field: FieldAttr = Feature.name, name: str | None = None, mute: bool = False, organism: SQLRecord | str | None = None, source: SQLRecord | None = None, ) -> Schema | None: return cls.from_dataframe(df, field, name, mute, organism, source) def save(self, *args, **kwargs) -> Schema: """Save schema.""" from .save import bulk_create features_to_delete = [] print_hash_mutation_warning = kwargs.pop("print_hash_mutation_warning", True) if self.pk is not None: existing_features = self.members.to_list() if 
self.members.exists() else [] if hasattr(self, "_features"): features = self._features[1] if features != existing_features: features_to_delete = [ f for f in existing_features if f not in features ] else: features = existing_features index_feature = self.index index_feature_id = None if index_feature is None else index_feature.id _, validated_kwargs, _, _, _ = self._validate_kwargs_calculate_hash( features=[ # type: ignore f for f in features if index_feature_id is None or f.id != index_feature_id ], index=index_feature, slots=self.slots, name=self.name, description=self.description, itype=self.itype, flexible=self.flexible, type=self.type, is_type=self.is_type, otype=self.otype, dtype=self.dtype, minimal_set=self.minimal_set, ordered_set=self.ordered_set, maximal_set=self.maximal_set, coerce=self.coerce, n_features=self.n_members, optional_features_manual=self.optionals.get(), ) if validated_kwargs["hash"] != self.hash: from .artifact import Artifact datasets = Artifact.filter(schema=self) if datasets.exists(): if features_to_delete: logger.warning( f"you're removing these features: {features_to_delete}" ) if print_hash_mutation_warning: logger.warning( f"you updated the schema hash and might invalidate datasets that were previously validated with this schema:\n{datasets.to_dataframe()}" ) self.hash = validated_kwargs["hash"] self.n_members = validated_kwargs["n_members"] super().save(*args, **kwargs) if hasattr(self, "_slots"): # analogous to save_schema_links in core._data.py # which is called to save feature sets in artifact.save() links = [] for slot, component in self._slots.items(): kwargs = { "composite_id": self.id, "component_id": component.id, "slot": slot, } links.append(Schema.components.through(**kwargs)) bulk_create(links, ignore_conflicts=True) delattr(self, "_slots") if hasattr(self, "_features"): assert self.n_members > 0 # noqa: S101 using: bool | None = kwargs.pop("using", None) related_name, records = self._features # self.related_name.set(features) does **not** preserve the order # but orders by the feature primary key # hence we need the following more complicated logic through_model = getattr(self, related_name).through if self.itype == "Composite": related_model_split = ["Feature"] else: related_model_split = parse_cat_dtype(self.itype, is_itype=True)[ "registry_str" ].split(".") if len(related_model_split) == 1: related_field = related_model_split[0].lower() else: related_field = related_model_split[1].lower() related_field_id = f"{related_field}_id" links = [ through_model(**{"schema_id": self.id, related_field_id: record.id}) for record in records ] through_model.objects.using(using).bulk_create(links, ignore_conflicts=True) getattr(self, related_name).remove(*features_to_delete) delattr(self, "_features") return self @property def members(self) -> QuerySet: """A queryset for the individual records in the feature set underlying the schema. Unlike the many-to-many fields `schema.features`, `schema.genes`, `schema.proteins`, `.members` 1. returns an ordered `QuerySet` if the schema is saved or a `SQLRecordList` if the schema is unsaved 2. doesn't require knowledge of the registry storing the feature identifiers (`ln.Feature`, `bt.Gene`, `bt.Protein`, etc.) 3. works for a dynamically created (unsaved) schema """ if self._state.adding: # this should return a queryset and not a list... 
# need to fix this return SQLRecordList(self._features[1]) # type: ignore if self.itype == "Composite" or self.is_type: return Feature.objects.none() related_name = self._get_related_name() if related_name is None: related_name = "features" related_manager = self.__getattribute__(related_name) through_model = related_manager.through using = self._state.db related_fk_name = next( field.name for field in through_model._meta.fields if isinstance(field, models.ForeignKey) and field.name != "schema" ) # Avoid the previous simple `order_by("links_schema__id")` on the related # manager: a member can be linked to many schemas, and reverse-join ordering # can become ambiguous across DB backends (SQLite vs Postgres). Instead, we # order through rows constrained to this schema and preserve that exact order. member_ids = list( through_model.objects.using(using) .filter(schema_id=self.id) .order_by("id") .values_list(f"{related_fk_name}_id", flat=True) ) if not member_ids: return related_manager.model.objects.using(using).none() preserved_order = models.Case( *[ models.When(id=member_id, then=models.Value(idx)) for idx, member_id in enumerate(member_ids) ], output_field=models.IntegerField(), ) # Order by ids from the through table constrained to this schema to avoid # ambiguous reverse-join ordering when a member is linked to many schemas. return ( related_manager.model.objects.using(using) .filter(id__in=member_ids) .order_by(preserved_order) ) @property def dtype(self) -> str | None: """The `dtype` for all features in the schema.""" return self._dtype_str @dtype.setter def dtype(self, value: str | None) -> None: self._dtype_str = value @property @deprecated("coerce") def coerce_dtype(self) -> bool | None: """Alias for coerce (backward compatibility).""" return self.coerce @coerce_dtype.setter def coerce_dtype(self, value: bool | None) -> None: self.coerce = value @property @deprecated("n_members") def n(self) -> int | None: """Alias for n_members (backward compatibility).""" return self.n_members @n.setter def n(self, value: int | None) -> None: self.n_members = value @property def index(self) -> None | Feature: """The feature configured to act as index. To unset it, set `schema.index` to `None`. """ if self._index_feature_uid is None: return None if hasattr(self, "_features"): _, features = self._features for feature in features: if feature.uid == self._index_feature_uid: return feature return self.features.get(uid=self._index_feature_uid) @index.setter def index(self, value: None | Feature) -> None: if value is None: current_index = self.index self.features.remove(current_index) self._index_feature_uid = value else: self.features.add(value) self._index_feature_uid = value.uid @property def _index_feature_uid(self) -> None | str: """The uid of the index feature.""" if self._aux is not None and "af" in self._aux and "3" in self._aux["af"]: return self._aux["af"]["3"] else: return None @_index_feature_uid.setter def _index_feature_uid(self, value: str | None) -> None: self._aux = self._aux or {} if value is None: self._aux.get("af", {}).pop("3") else: self._aux.setdefault("af", {})["3"] = value @property def slots(self) -> dict[str, Schema]: """Slots. 
Examples: :: # define composite schema anndata_schema = ln.Schema( name="mini_immuno_anndata_schema", otype="AnnData", slots={"obs": obs_schema, "var": var_schema}, ).save() # access slots anndata_schema.slots #> {'obs': , 'var': } """ if hasattr(self, "_slots"): return self._slots self._slots = { link.slot: link.component for link in self.components.through.filter(composite_id=self.id) } return self._slots @property def optionals(self) -> SchemaOptionals: """Manage optional features. Example: :: # a schema with optional "sample_name" schema_optional_sample_name = ln.Schema( features=[ ln.Feature(name="sample_id", dtype=str).save(), # required ln.Feature(name="sample_name", dtype=str).save().with_config(optional=True), # optional ], ).save() # raises ValidationError because the required column `sample_id` is missing ln.curators.DataFrameCurator( pd.DataFrame( { "sample_name": ["Sample 1", "Sample 2"], } ), schema=schema_optional_sample_name).validate() # passes because only the optional column `sample_name` is missing ln.curators.DataFrameCurator( pd.DataFrame( { "sample_id": ["sample1", "sample2"], } ), schema=schema_optional_sample_name).validate() """ return SchemaOptionals(self) def add_optional_features(self, features: list[Feature]) -> None: """Add optional features to the schema.""" self.features.add(*features) self.optionals.add(features) self.save(print_hash_mutation_warning=False) def remove_optional_features(self, features: list[Feature]) -> None: """Remove optional features from the schema.""" optional_features = self.optionals.get() for feature in features: assert feature in optional_features, f"Feature {feature} is not optional" self.features.remove(*features) self.optionals.remove(features) self.save(print_hash_mutation_warning=False) @class_and_instance_method def describe(cls_or_self, return_str: bool = False) -> None | str: """Describe schema.""" if isinstance(cls_or_self, type): return type(cls_or_self).describe(cls_or_self) # type: ignore if cls_or_self.pk is None: raise ValueError("Schema must be saved before describing") tree = describe_schema(cls_or_self) for slot, schema in cls_or_self.slots.items(): tree.add(describe_schema(schema, slot=slot)) return format_rich_tree(tree, return_str=return_str) def get_type_str(dtype: str | None) -> str | None: if dtype is not None: type_str = dtype.__name__ if not isinstance(dtype, str) else dtype # type: ignore else: type_str = None return type_str def _get_related_name(self: Schema) -> str | None: related_models = dict_related_model_to_related_name(self, instance=self._state.db) if self.itype: related_name = related_models.get( parse_cat_dtype(self.itype, is_itype=True)["registry_str"] ) return related_name return None class SchemaFeature(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_feature") feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_schema") class Meta: app_label = "lamindb" unique_together = ("schema", "feature") class ArtifactSchema(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey("Artifact", CASCADE, related_name="_links_schema") schema: Schema = ForeignKey(Schema, PROTECT, related_name="_links_artifact") slot: str | None = CharField(null=True) feature_ref_is_semantic: bool | None = BooleanField(null=True) class Meta: app_label = "lamindb" unique_together = (("artifact", "schema"), ("artifact", "slot")) class SchemaComponent(BaseSQLRecord, IsLink, TracksRun): id: int =
models.BigAutoField(primary_key=True) composite: Schema = ForeignKey(Schema, CASCADE, related_name="links_component") component: Schema = ForeignKey(Schema, PROTECT, related_name="links_composite") slot: str | None = CharField(null=True) class Meta: app_label = "lamindb" unique_together = (("composite", "slot", "component"), ("composite", "slot")) Schema._get_related_name = _get_related_name # PostgreSQL migration helpers for auxiliary fields # These are used by migrations to efficiently migrate data from _aux to Django fields def migrate_auxiliary_fields_postgres(schema_editor) -> None: """Migrate _aux['af'] fields to Django fields using PostgreSQL raw SQL. This efficiently migrates auxiliary fields for all affected models: **Artifact:** - _save_completed from _aux['af']['0'] **Run:** - cli_args from _aux['af']['0'] **Feature:** - default_value from _aux['af']['0'] - nullable from _aux['af']['1'] (default: True) - coerce from _aux['af']['2'] (default: False) - For type features (is_type=True), all values are set to NULL **Schema:** - coerce from _aux['af']['0'] - flexible from _aux['af']['2'] (or computed from n_members) - n_members (converted from negative to NULL) - For type schemas (is_type=True), all values are set to NULL - Keys '1' (optionals) and '3' (index_feature_uid) are preserved in _aux """ # Artifact: migrate _save_completed from _aux->'af'->'0' schema_editor.execute(""" UPDATE lamindb_artifact SET _save_completed = (_aux->'af'->>'0')::boolean, _aux = CASE WHEN _aux->'af' IS NOT NULL THEN CASE WHEN _aux - 'af' = '{}'::jsonb THEN NULL ELSE _aux - 'af' END ELSE _aux END WHERE _aux IS NOT NULL AND _aux->'af' IS NOT NULL """) # Run: migrate cli_args from _aux->'af'->'0' schema_editor.execute(""" UPDATE lamindb_run SET cli_args = _aux->'af'->>'0', _aux = CASE WHEN _aux - 'af' = '{}'::jsonb THEN NULL ELSE _aux - 'af' END WHERE _aux IS NOT NULL AND _aux ? 
'af' """) # Feature: migrate default_value, nullable, coerce # For type features: set all to NULL schema_editor.execute(""" UPDATE lamindb_feature SET default_value = NULL, nullable = NULL, coerce = NULL, _aux = CASE WHEN _aux->'af' IS NOT NULL THEN CASE WHEN _aux - 'af' = '{}'::jsonb THEN NULL ELSE _aux - 'af' END ELSE _aux END WHERE is_type = TRUE """) # For regular features: migrate values with defaults schema_editor.execute(""" UPDATE lamindb_feature SET default_value = _aux->'af'->'0', nullable = COALESCE((_aux->'af'->>'1')::boolean, TRUE), coerce = COALESCE((_aux->'af'->>'2')::boolean, FALSE), _aux = CASE WHEN _aux->'af' IS NOT NULL THEN CASE WHEN _aux - 'af' = '{}'::jsonb THEN NULL ELSE _aux - 'af' END ELSE _aux END WHERE is_type = FALSE OR is_type IS NULL """) # Schema: migrate coerce, flexible, n_members # For type schemas: set all to NULL schema_editor.execute(""" UPDATE lamindb_schema SET coerce = NULL, flexible = NULL, n_members = NULL, _aux = CASE WHEN _aux->'af' IS NOT NULL THEN CASE WHEN ((_aux->'af') #- ARRAY['0'] #- ARRAY['2']) = '{}'::jsonb THEN CASE WHEN (_aux #- ARRAY['af']) = '{}'::jsonb THEN NULL ELSE _aux #- ARRAY['af'] END ELSE jsonb_set(_aux #- ARRAY['af'], '{af}', (_aux->'af') #- ARRAY['0'] #- ARRAY['2']) END ELSE _aux END WHERE is_type = TRUE """) # For regular schemas: migrate values # Keep '1' (optionals) and '3' (index_feature_uid) in _aux schema_editor.execute(""" UPDATE lamindb_schema SET coerce = (_aux->'af'->>'0')::boolean, flexible = COALESCE( (_aux->'af'->>'2')::boolean, n_members IS NULL OR n_members < 0 ), n_members = CASE WHEN n_members < 0 THEN NULL ELSE n_members END, _aux = CASE WHEN _aux->'af' IS NOT NULL THEN CASE WHEN ((_aux->'af') #- ARRAY['0'] #- ARRAY['2']) = '{}'::jsonb THEN CASE WHEN (_aux #- ARRAY['af']) = '{}'::jsonb THEN NULL ELSE _aux #- ARRAY['af'] END ELSE jsonb_set( CASE WHEN (_aux #- ARRAY['af']) = '{}'::jsonb THEN '{}'::jsonb ELSE _aux #- ARRAY['af'] END, '{af}', (_aux->'af') #- ARRAY['0'] #- ARRAY['2'] ) END ELSE _aux END WHERE is_type = FALSE OR is_type IS NULL """) ================================================ FILE: lamindb/models/sqlrecord.py ================================================ from __future__ import annotations import builtins import gzip import inspect import os import re import shutil import sys from collections import defaultdict from itertools import chain from pathlib import Path from typing import ( TYPE_CHECKING, Any, Literal, NamedTuple, TypeVar, overload, ) import dj_database_url import lamindb_setup as ln_setup from django.core.exceptions import ValidationError as DjangoValidationError from django.db import IntegrityError, ProgrammingError, connections, models, transaction from django.db.models import CASCADE, DEFERRED, PROTECT, Field, Manager, QuerySet from django.db.models import ForeignKey as django_ForeignKey from django.db.models.base import ModelBase from django.db.models.fields.related import ( ManyToManyField, ManyToManyRel, ManyToOneRel, ) from django.db.models.functions import Lower from lamin_utils import colors, logger from lamindb_setup import settings as setup_settings from lamindb_setup._connect_instance import ( INSTANCE_NOT_FOUND_MESSAGE, InstanceNotFoundError, get_owner_name_from_identifier, load_instance_settings, update_db_using_local, ) from lamindb_setup.core._docs import doc_args from lamindb_setup.core._hub_core import connect_instance_hub from lamindb_setup.core._settings_store import instance_settings_file from lamindb_setup.core.django import DBToken, db_token_manager from 
upath import UPath from lamindb.base.users import current_user_id from lamindb.base.utils import class_and_instance_method, deprecated from ..base.fields import ( BooleanField, CharField, DateTimeField, ForeignKey, JSONField, TextField, ) from ..base.types import ( BRANCH_CODE_TO_STATUS, BRANCH_STATUS_TO_CODE, BranchStatus, FieldAttr, StrField, ) from ..base.uids import base62_12 from ..errors import ( FieldValidationError, NoWriteAccess, ValidationError, ) from ._is_versioned import IsVersioned, _adjust_is_latest_when_deleting_is_versioned from .query_manager import QueryManager, _lookup, _search if TYPE_CHECKING: from datetime import datetime import pandas as pd from .block import BranchBlock, SpaceBlock from .project import Project from .query_manager import RelatedManager from .query_set import SQLRecordList from .run import Run, User from .ulabel import ULabel T = TypeVar("T", bound="SQLRecord") IPYTHON = getattr(builtins, "__IPYTHON__", False) UNIQUE_FIELD_NAMES = { "root", "ontology_id", "uid", "scientific_name", "ensembl_gene_id", "uniprotkb_id", } BRANCH_SENSITIVE_BLOCK_MODEL_NAMES = frozenset( { "RecordBlock", "ArtifactBlock", "TransformBlock", "CollectionBlock", "RunBlock", "SchemaBlock", "FeatureBlock", "ProjectBlock", "ULabelBlock", "SpaceBlock", } ) def _is_branch_sensitive_model(model: type[BaseSQLRecord]) -> bool: return ( issubclass(model, SQLRecord) and model.__name__ not in {"Storage", "Source"} ) or model.__name__ in BRANCH_SENSITIVE_BLOCK_MODEL_NAMES # ------------------------------------------------------------------------------------- # A note on required fields at the SQLRecord level # # As Django does most of its validation on the Form-level, it doesn't offer functionality # for validating the integrity of an SQLRecord object upon instantation (similar to pydantic) # # For required fields, we define them as commonly done on the SQL level together # with a validator in SQLRecord (validate_required_fields) # # This goes against the Django convention, but goes with the SQLModel convention # (Optional fields can be null on the SQL level, non-optional fields cannot) # # Due to Django's convention where CharFieldAttr has pre-configured (null=False, default=""), marking # a required field necessitates passing `default=None`. Without the validator it would trigger # an error at the SQL-level, with it, it triggers it at instantiation # ------------------------------------------------------------------------------------- # A note on class and instance methods of core SQLRecord # # All of these are defined and tested within lamindb, in files starting with _{orm_name}.py # ------------------------------------------------------------------------------------- # A note on maximal lengths of char fields # # 100 characters: # "Raindrops pitter-pattered on the windowpane, blurring the" # "city lights outside, curled up with a mug." # A good maximal length for a name (title). # # 150 characters: We choose this for name maximal length because some users like long names. # # 255 characters: # "In creating a precise 255-character paragraph, one engages in" # "a dance of words, where clarity meets brevity. Every syllable counts," # "illustrating the skill in compact expression, ensuring the essence of the" # "message shines through within the exacting limit." class IsLink: pass class HasType(models.Model): """Mixin for registries that have a hierarchical `type` assigned. Such registries have a `.type` foreign key pointing to themselves. 
A `type` hence allows hierarchically grouping records under types. For instance, using the example of `ln.Record`:: experiment_type = ln.Record(name="Experiment", is_type=True).save() experiment1 = ln.Record(name="Experiment 1", type=experiment_type).save() experiment2 = ln.Record(name="Experiment 2", type=experiment_type).save() """ class Meta: abstract = True is_type: bool = BooleanField(default=False, db_default=False, db_index=True) """Indicates if record is a `type`. For example, if a record "Compound" is a `type`, the actual compounds "darerinib", "tramerinib", would be instances of that `type`. """ def query_types(self) -> SQLRecordList: """Query types of a record recursively. While `.type` retrieves the `type`, this method retrieves all super types of that `type`:: # Create type hierarchy type1 = model_class(name="Type1", is_type=True).save() type2 = model_class(name="Type2", is_type=True, type=type1).save() type3 = model_class(name="Type3", is_type=True, type=type2).save() # Create a record with type3 record = model_class(name=f"{model_name}3", type=type3).save() # Query super types super_types = record.query_types() assert super_types[0] == type3 assert super_types[1] == type2 assert super_types[2] == type1 """ from .has_parents import _query_ancestors_of_fk return _query_ancestors_of_fk(self, "type") # type: ignore def deferred_attribute__repr__(self): return f"FieldAttr({self.field.model.__name__}.{self.field.name})" def unique_constraint_error_in_error_message(error_msg: str) -> bool: """Check if the error message indicates a unique constraint violation.""" return ( "UNIQUE constraint failed" in error_msg # SQLite or "duplicate key value violates unique constraint" in error_msg # Postgre ) def parse_violated_field_from_error_message(error_msg: str) -> list[str] | None: # Even if the model has multiple fields with unique=True, # Django will only raise an IntegrityError for one field at a time # - whichever constraint is violated first during the database insert/update operation. if unique_constraint_error_in_error_message(error_msg): if "UNIQUE constraint failed" in error_msg: # sqlite constraint_field = ( error_msg.removeprefix("UNIQUE constraint failed: ") .split(", ")[0] .split(".")[-1] ) return [constraint_field] else: # postgres # Extract constraint name from double quotes constraint_name = error_msg.split('"')[1] # Check if it's a multi-column constraint (contains multiple field names) # Format: tablename_field1_field2_..._hash_uniq if "_uniq" in constraint_name: # Remove '_uniq' suffix first constraint_name = constraint_name.removesuffix("_uniq") # Remove hash (8 hex characters at the end) parts = constraint_name.split("_") if len(parts[-1]) == 8 and all( c in "0123456789abcdef" for c in parts[-1] ): constraint_name = "_".join(parts[:-1]) # Remove table name prefix (e.g., "bionty_ethnicity_") # Table name is typically the first 2 parts for app_model format parts = constraint_name.split("_") if len(parts) > 2: # Assume first 2 parts are table name (e.g., "bionty_ethnicity") field_string = "_".join(parts[2:]) else: field_string = constraint_name # Now parse the fields from DETAIL line # DETAIL: Key (name, ontology_id)=(South Asian, HANCESTRO:0006) already exists. 
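# A hypothetical worked example (the constraint string below is composed to match
# the format described above; it is illustrative, not from a real traceback):
#   duplicate key value violates unique constraint "bionty_ethnicity_name_ontology_id_8f3a1b2c_uniq"
#   DETAIL:  Key (name, ontology_id)=(South Asian, HANCESTRO:0006) already exists.
# The "_uniq" suffix, the 8-hex-character hash "8f3a1b2c", and the table prefix
# "bionty_ethnicity" are stripped, and the DETAIL line yields ["name", "ontology_id"].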
if "Key (" in error_msg: fields_part = error_msg.split("Key (")[1].split(")=")[0] fields = [f.strip() for f in fields_part.split(",")] return fields # Fallback if DETAIL line not available return [field_string] else: # Single field constraint (ends with _key) constraint_field = constraint_name.removesuffix("_key").split("_")[-1] return [constraint_field] return None FieldAttr.__repr__ = deferred_attribute__repr__ # type: ignore class ValidateFields: pass def is_approx_pascal_case(s: str) -> bool: """Check if the last component of a dotted string is in PascalCase. Args: s: The string to check """ if "[" in s: # this is because we allow types of form 'script[test_script.py]' return True last_component = s.split(".")[-1] return last_component[:1].isupper() and "_" not in last_component def init_self_from_db(self: SQLRecord, existing_record: SQLRecord): from .run import current_run new_args = [ getattr(existing_record, field.attname) for field in self._meta.concrete_fields ] super(self.__class__, self).__init__(*new_args) self._state.adding = False # mimic from_db self._state.db = "default" # if run was not set on the existing record, set it to the current_run if hasattr(self, "run_id") and self.run_id is None and current_run() is not None: logger.warning(f"run was not set on {self}, setting to current run") self.run = current_run() def update_attributes(record: SQLRecord, attributes: dict[str, str]): for key, value in attributes.items(): if getattr(record, key) != value and value is not None: if key not in {"uid", "_dtype_str", "otype", "hash"}: logger.warning(f"updated {key} from {getattr(record, key)} to {value}") setattr(record, key, value) else: hash_message = ( "recomputing on .save()" if key == "hash" else f"keeping {getattr(record, key)}" ) logger.debug( f"ignoring tentative value {value} for {key}, {hash_message}" ) def validate_literal_fields(record: SQLRecord, kwargs) -> None: """Validate all Literal type fields in a record. 
Args: record: record being validated Raises: ValidationError: If any field value is not in its Literal's allowed values """ if isinstance(record, IsLink): return None if record.__class__.__name__ in "Feature": return None from lamindb.base.types import ArtifactKind, Dtype, TransformKind types = { "TransformKind": TransformKind, "ArtifactKind": ArtifactKind, "Dtype": Dtype, } errors = {} annotations = getattr(record.__class__, "__annotations__", {}) for field_name, annotation in annotations.items(): if field_name not in kwargs or kwargs[field_name] is None: continue value = kwargs[field_name] if str(annotation) in types: annotation = types[annotation] if not hasattr(annotation, "__origin__"): continue literal_type = annotation if annotation.__origin__ is Literal else None if literal_type is None: continue valid_values = set(literal_type.__args__) if value not in valid_values: errors[field_name] = ( f"{field_name}: {colors.yellow(value)} is not a valid value" f"\n → Valid values are: {colors.green(', '.join(sorted(valid_values)))}" ) if errors: message = "\n " for _, error in errors.items(): message += error + "\n " raise FieldValidationError(message) def validate_fields(record: SQLRecord, kwargs): from lamindb.models import ( Artifact, Collection, Feature, Run, Schema, Transform, ULabel, ) # validate required fields # a "required field" is a Django field that has `null=False, default=None` required_fields = { k.name for k in record._meta.fields if not k.null and k.default is None } required_fields_not_passed = {k: None for k in required_fields if k not in kwargs} kwargs.update(required_fields_not_passed) missing_fields = [ k for k, v in kwargs.items() if v is None and k in required_fields ] if missing_fields: raise FieldValidationError(f"{missing_fields} are required.") # ensure the exact length of the internal uid for core entities if "uid" in kwargs and record.__class__ in { Artifact, Collection, Transform, Run, ULabel, Feature, Schema, }: uid_max_length = record.__class__._meta.get_field( "uid" ).max_length # triggers FieldDoesNotExist if len(kwargs["uid"]) != uid_max_length: # triggers KeyError if not ( record.__class__ is Schema and len(kwargs["uid"]) == 16 ): # no error for schema raise ValidationError( f"`uid` must be exactly {uid_max_length} characters long, got {len(kwargs['uid'])}." ) # validate is_type if "is_type" in kwargs and "name" in kwargs and kwargs["is_type"]: is_approx_pascal_case(kwargs["name"]) if ( "type" in kwargs and isinstance(kwargs["type"], HasType) and not kwargs["type"].is_type ): object_name = record.__class__.__name__.lower() raise ValueError( f"You can only assign a {object_name} with `is_type=True` as `type` to another {object_name}, but this doesn't have it: {kwargs['type']}" ) # validate literals validate_literal_fields(record, kwargs) def suggest_records_with_similar_names( record: SQLRecord, name_field: str, kwargs ) -> SQLRecord | None: """Returns a record if found exact match, otherwise None. Logs similar matches if found. """ if kwargs.get(name_field) is None or not isinstance(kwargs.get(name_field), str): return None # need to perform an additional request to find the exact match # previously, this was inferred from the truncated/fuzzy search below # but this isn't reliable: https://laminlabs.slack.com/archives/C04FPE8V01W/p1737812808563409 # the below needs to be .first() because there might be multiple records with the same # name field in case the record is versioned (e.g. 
for Transform key) if isinstance(record, HasType): if kwargs.get("type", None) is None: subset = record.__class__.filter(type__isnull=True) else: subset = record.__class__.filter(type=kwargs["type"]) else: subset = record.__class__ exact_match = subset.filter(**{name_field: kwargs[name_field]}).first() if exact_match is not None: return exact_match queryset = _search( subset, kwargs[name_field], field=name_field, truncate_string=True, limit=3, ) if not queryset.exists(): # empty queryset return None s, it, nots, record_text = ( ("", "it", "s", "a record") if len(queryset) == 1 else ("s", "one of them", "", "records") ) similar_names = ", ".join(f"'{getattr(record, name_field)}'" for record in queryset) msg = f"you are trying to create a record with name='{kwargs[name_field]}' but {record_text} with similar {name_field}{s} exist{nots}: {similar_names}. Did you mean to load {it}?" logger.warning(f"{msg}") return None def delete_record(record: BaseSQLRecord, is_soft: bool = True): def delete(): if is_soft: record.branch_id = -1 record.save() return None else: return super(BaseSQLRecord, record).delete() # deal with versioned records # if _overwrite_versions = True, there is only a single version and # no need to set the new latest version because all versions are deleted # when deleting the latest version if ( isinstance(record, IsVersioned) and record.is_latest and not getattr(record, "_overwrite_versions", False) ): promoted = _adjust_is_latest_when_deleting_is_versioned(record) if promoted: if is_soft: record.is_latest = False with transaction.atomic(): result = delete() return result # deal with all other cases of the nested if condition now return delete() RECORD_REGISTRY_EXAMPLE = """Example:: from lamindb import SQLRecord, fields # sub-classing `SQLRecord` creates a new registry class Experiment(SQLRecord): name: str = fields.CharField() # instantiating `Experiment` creates a record `experiment` experiment = Experiment(name="my experiment") # you can save the record to the database experiment.save() # `Experiment` refers to the registry, which you can query df = Experiment.filter(name__startswith="my ").to_dataframe() """ def _synchronize_clone(storage_root: str) -> str | None: """Synchronizes a clone to the local SQLite path. Args: storage_root: The storage root path of the (target) instance """ cloud_db_path = UPath(storage_root) / ".lamindb" / "lamin.db" local_sqlite_path = ln_setup.settings.cache_dir / cloud_db_path.path.lstrip("/") local_sqlite_path.parent.mkdir(parents=True, exist_ok=True) cloud_db_path_gz = UPath(str(cloud_db_path) + ".gz", anon=True) local_sqlite_path_gz = Path(str(local_sqlite_path) + ".gz") try: if cloud_db_path_gz.synchronize_to( local_sqlite_path_gz, error_no_origin=True, print_progress=True ): with ( gzip.open(local_sqlite_path_gz, "rb") as f_in, open(local_sqlite_path, "wb") as f_out, ): shutil.copyfileobj(f_in, f_out) return f"sqlite:///{local_sqlite_path}" except (FileNotFoundError, PermissionError): logger.debug("Clone not found. Falling back to normal access...") return None # this is the metaclass for SQLRecord @doc_args(RECORD_REGISTRY_EXAMPLE) class Registry(ModelBase): """Metaclass for :class:`~lamindb.models.SQLRecord`. Each `Registry` *object* is a `SQLRecord` *class* and corresponds to a table in the metadata SQL database. You work with `Registry` objects whenever you use *class methods* of `SQLRecord`. You call any subclass of `SQLRecord` a "registry" and their objects "records". A `SQLRecord` object corresponds to a row in the SQL table. 
If you want to create a new registry, you sub-class `SQLRecord`. {} Note: `Registry` inherits from Django's `ModelBase`. """ _available_fields: set[str] = None def __new__(cls, name, bases, attrs, **kwargs): new_class = super().__new__(cls, name, bases, attrs, **kwargs) return new_class # below creates a sensible auto-complete behavior that differs across the # class and instance level in Jupyter Editors it doesn't have any effect for # static type analyzer like pylance used in VSCode def __dir__(cls): # this is needed to bring auto-complete on the class-level back # https://laminlabs.slack.com/archives/C04FPE8V01W/p1717535625268849 # Filter class attributes, excluding instance methods exclude_instance_methods = "sphinx" not in sys.modules # https://laminlabs.slack.com/archives/C04FPE8V01W/p1721134595920959 def include_attribute(attr_name, attr_value): if attr_name.startswith("__"): return False if exclude_instance_methods and callable(attr_value): return isinstance(attr_value, (classmethod, staticmethod, type)) return True # check also inherited attributes if hasattr(cls, "mro"): attrs = chain(*(c.__dict__.items() for c in cls.mro())) else: attrs = cls.__dict__.items() result = [] for attr_name, attr_value in attrs: if attr_name not in result and include_attribute(attr_name, attr_value): result.append(attr_name) # Add non-dunder attributes from Registry for attr in dir(Registry): if not attr.startswith("__") and attr not in result: result.append(attr) return result def describe(cls, return_str: bool = False) -> str | None: """Describe the fields of the registry.""" from ._describe import strip_ansi_from_string as _strip_ansi repr_str = f"{colors.green(cls.__name__)}\n" info = SQLRecordInfo(cls) repr_str += info.get_simple_fields(return_str=True) repr_str += info.get_relational_fields(return_str=True) repr_str = repr_str.rstrip("\n") if return_str: return _strip_ansi(repr_str) else: print(repr_str) return None @doc_args(_lookup.__doc__) def lookup( cls, field: StrField | None = None, return_field: StrField | None = None, keep: Literal["first", "last", False] = "first", ) -> NamedTuple: """{}""" # noqa: D415 return _lookup(cls=cls, field=field, return_field=return_field, keep=keep) def filter(cls, *queries, **expressions) -> QuerySet: """Query records. Args: queries: One or multiple `Q` objects. expressions: Fields and values passed as Django query expressions. See Also: - Guide: :doc:`docs:registries` - Django documentation: `Queries `__ Examples: >>> ln.Project(name="my label").save() >>> ln.Project.filter(name__startswith="my").to_dataframe() """ from .query_set import QuerySet _using_key = None if "_using_key" in expressions: _using_key = expressions.pop("_using_key") return QuerySet(model=cls, using=_using_key).filter(*queries, **expressions) def get( cls: type[T], idlike: int | str | None = None, **expressions, ) -> T: """Get a single record. Args: idlike: Either a uid stub, uid or an integer id. expressions: Fields and values passed as Django query expressions. Raises: :exc:`lamindb.errors.ObjectDoesNotExist`: In case no matching record is found. 
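        For example, a minimal sketch of handling a missing record (the registry
        and name below are illustrative)::

            from lamindb.errors import ObjectDoesNotExist

            try:
                record = ln.Record.get(name="nonexistent-label")
            except ObjectDoesNotExist:
                record = None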
See Also: - Guide: :doc:`registries` - Django documentation: `Queries `__ Examples: :: record = ln.Record.get("FvtpPJLJ") record = ln.Record.get(name="my-label") """ from .query_set import QuerySet return QuerySet(model=cls).get(idlike, **expressions) def to_dataframe( cls, *, include: str | list[str] | None = None, features: str | list[str] | None = None, limit: int | None = 100, order_by: str | None = "-id", ) -> pd.DataFrame: """Evaluate and convert to `pd.DataFrame`. By default, this returns up to 100 rows for a fast overview. Pass `limit=None` to fetch all matching records. By default, maps simple fields and foreign keys onto `DataFrame` columns. Guide: :doc:`docs:registries` Args: include: Related data to include as columns. Takes strings of form `"records__name"`, `"cell_types__name"`, etc. or a list of such strings. For `Artifact`, `Record`, and `Run`, can also pass `"features"` to include features with data types pointing to entities in the core schema. If `"privates"`, includes private fields (fields starting with `_`). features: Configure the features to include. Can be a feature name or a list of such names. If `"queryset"`, infers the features used within the current queryset. Only available for `Artifact`, `Record`, and `Run`. limit: Maximum number of rows to display. Defaults to 100. If `None`, includes all results. order_by: Field name to order the records by. Prefix with '-' for descending order. Defaults to '-id' to get the most recent records. This argument is ignored if the queryset is already ordered or if the specified field does not exist. Examples: Include the name of the creator:: ln.Record.to_dataframe(include="created_by__name") Include features:: ln.Artifact.to_dataframe(include="features") Include selected features:: ln.Artifact.to_dataframe(features=["cell_type_by_expert", "cell_type_by_model"]) """ return cls.filter().to_dataframe( include=include, features=features, order_by=order_by, limit=limit ) @deprecated(new_name="to_dataframe") def df( cls, *, include: str | list[str] | None = None, features: str | list[str] | None = None, limit: int | None = 100, order_by: str | None = "-id", ) -> pd.DataFrame: return cls.to_dataframe( include=include, features=features, limit=limit, order_by=order_by ) @doc_args(_search.__doc__) def search( cls, string: str, *, field: StrField | None = None, limit: int | None = 20, case_sensitive: bool = False, ) -> QuerySet: """{}""" # noqa: D415 return _search( cls=cls, string=string, field=field, limit=limit, case_sensitive=case_sensitive, ) @deprecated(new_name="connect") def using( cls, instance: str | None, ) -> QuerySet: return cls.connect( instance=instance, ) def connect( cls, instance: str | None, ) -> QuerySet: """Query a non-default LaminDB instance. Args: instance: An instance identifier of form "account_handle/instance_name".
Examples: :: ln.Record.connect("account_handle/instance_name").search("label7", field="name") """ from .query_set import QuerySet # we're in the default instance if instance is None or instance == "default": return QuerySet(model=cls, using=None) # connection already established if instance in connections: return QuerySet(model=cls, using=instance) owner, name = get_owner_name_from_identifier(instance) current_instance_owner_name: list[str] = setup_settings.instance.slug.split("/") # move on to different instances cache_using_filepath = ( setup_settings.cache_dir / f"instance--{owner}--{name}--uid.txt" ) settings_file = instance_settings_file(name, owner) if not settings_file.exists(): result = connect_instance_hub(owner=owner, name=name) if isinstance(result, str): message = INSTANCE_NOT_FOUND_MESSAGE.format( owner=owner, name=name, hub_result=result ) raise InstanceNotFoundError(message) iresult, storage = result # this can happen if querying via an old instance name if [iresult.get("owner"), iresult["name"]] == current_instance_owner_name: return QuerySet(model=cls, using=None) # do not use {} syntax below, it gives rise to a dict if the schema modules # are empty and then triggers a TypeError in missing_members = source_modules - target_modules source_modules = set( # noqa [mod for mod in iresult["schema_str"].split(",") if mod != ""] ) # Try to connect to a clone if targeting a public instance but fall back to normal access if access failed db = None if ( "_public" in iresult["db_user_name"] and "postgresql" in iresult["db_scheme"] ): db = _synchronize_clone(storage["root"]) if db is None: if [ iresult.get("owner"), iresult["name"], ] == current_instance_owner_name: return QuerySet(model=cls, using=None) db = update_db_using_local(iresult, settings_file) is_fine_grained_access = ( iresult["fine_grained_access"] and iresult["db_permissions"] == "jwt" ) else: is_fine_grained_access = False cache_using_filepath.write_text( f"{iresult['lnid']}\n{iresult['schema_str']}", encoding="utf-8" ) # access_db can take both: the dict from connect_instance_hub and isettings into_db_token = iresult else: isettings = load_instance_settings(settings_file) source_modules = isettings.modules db = None if "public" in isettings.db and isettings.dialect == "postgresql": db = _synchronize_clone(isettings.storage.root_as_str) # Try to connect to a clone if targeting a public instance but fall back to normal access if access failed if db is None: if [isettings.owner, isettings.name] == current_instance_owner_name: return QuerySet(model=cls, using=None) db = isettings.db is_fine_grained_access = ( isettings._fine_grained_access and isettings._db_permissions == "jwt" ) else: is_fine_grained_access = False cache_using_filepath.write_text( f"{isettings.uid}\n{','.join(source_modules)}", encoding="utf-8" ) # access_db can take both: the dict from connect_instance_hub and isettings into_db_token = isettings target_modules = setup_settings.instance.modules if missing_members := source_modules - target_modules: logger.info( f"in transfer, source lamindb instance has additional modules: {', '.join(missing_members)}" ) add_db_connection(db, instance) if is_fine_grained_access: db_token = DBToken(into_db_token) db_token_manager.set(db_token, instance) return QuerySet(model=cls, using=instance) def __get_module_name__(cls) -> str: schema_module_name = cls.__module__.split(".")[0] module_name = schema_module_name.replace("lnschema_", "") if module_name == "lamindb": module_name = "core" return module_name def 
__get_name_with_module__(cls) -> str: module_name = cls.__get_module_name__() if module_name == "core": module_prefix = "" else: module_prefix = f"{module_name}." return f"{module_prefix}{cls.__name__}" def __get_available_fields__(cls) -> set[str]: if cls._available_fields is None: available_fields = set() for field in cls._meta.get_fields(): if not (field_name := field.name).startswith(("_", "links_")): available_fields.add(field_name) if isinstance(field, django_ForeignKey): available_fields.add(field_name + "_id") if cls.__name__ == "Artifact": available_fields.add("transform") available_fields.add("feature_sets") # backward compat with lamindb v1 cls._available_fields = available_fields return cls._available_fields class BaseSQLRecord(models.Model, metaclass=Registry): """Base SQL metadata record. It provides methods to `SQLRecord` and all its subclasses, but doesn't come with the additional `branch` and `space` fields. """ objects = QueryManager() class Meta: abstract = True base_manager_name = "objects" # fields to track for changes # if not None, will be tracked in self._original_values as {field_name: value} # use _id fields for foreign keys _TRACK_FIELDS: tuple[str, ...] | None = None def __init__(self, *args, **kwargs): skip_validation = kwargs.pop("_skip_validation", False) if not args: if not os.getenv("LAMINDB_MULTI_INSTANCE") == "true": if ( issubclass(self.__class__, SQLRecord) and self.__class__.__name__ != "Storage" # do not save bionty entities in restricted spaces by default and self.__class__.__module__ != "bionty.models" ): from lamindb import context as run_context if run_context.space is not None: current_space = run_context.space elif setup_settings.space is not None: current_space = setup_settings.space if current_space is not None: if "space_id" in kwargs: # space_id takes precedence over space # https://claude.ai/share/f045e5dc-0143-4bc5-b8a4-38309229f75e if kwargs["space_id"] == 1: # ignore default space kwargs.pop("space_id") kwargs["space"] = current_space elif "space" in kwargs: if kwargs["space"] is None: kwargs["space"] = current_space else: kwargs["space"] = current_space if _is_branch_sensitive_model(self.__class__): from lamindb import context as run_context if run_context.branch is not None: current_branch = run_context.branch elif setup_settings.branch is not None: current_branch = setup_settings.branch if current_branch is not None: # branch_id takes precedence over branch # https://claude.ai/share/f045e5dc-0143-4bc5-b8a4-38309229f75e if "branch_id" in kwargs: if kwargs["branch_id"] == 1: # ignore default branch kwargs.pop("branch_id") kwargs["branch"] = current_branch elif "branch" in kwargs: if kwargs["branch"] is None: kwargs["branch"] = current_branch else: kwargs["branch"] = current_branch kwargs["created_on"] = kwargs["branch"] if skip_validation: super().__init__(**kwargs) else: from ..core._settings import settings from .can_curate import CanCurate from .collection import Collection from .transform import Transform validate_fields(self, kwargs) # do not search for names if an id is passed; this is important # e.g. 
when synching ids from the notebook store to lamindb has_consciously_provided_uid = False if "_has_consciously_provided_uid" in kwargs: has_consciously_provided_uid = kwargs.pop( "_has_consciously_provided_uid" ) if ( isinstance(self, (CanCurate, Collection, Transform)) and settings.creation.search_names and not has_consciously_provided_uid ): name_field = getattr(self, "_name_field", "name") exact_match = suggest_records_with_similar_names( self, name_field, kwargs ) if exact_match is not None: if "version_tag" in kwargs: if kwargs.get("version_tag") is not None: version_comment = " and version" existing_record = self.__class__.filter( **{ name_field: kwargs[name_field], "version_tag": kwargs.get("version_tag"), } ).one_or_none() else: # for a versioned record, an exact name match is not a criterion # for retrieving a record in case `version` isn't passed - # we'd always pull out many records with exactly the same name existing_record = None else: version_comment = "" existing_record = exact_match if existing_record is not None: logger.important( f"returning {self.__class__.__name__.lower()} with same" f" {name_field}{version_comment}: '{kwargs[name_field]}'" ) init_self_from_db(self, existing_record) update_attributes(self, kwargs) # track original values after replacing with the existing record self._populate_tracked_fields() return None super().__init__(**kwargs) if isinstance(self, ValidateFields): # this will trigger validation against django validators try: if hasattr(self, "clean_fields"): self.clean_fields() else: self._Model__clean_fields() except DjangoValidationError as e: message = _format_django_validation_error(self, e) raise FieldValidationError(message) from e elif len(args) != len(self._meta.concrete_fields): raise FieldValidationError( f"Use keyword arguments instead of positional arguments, e.g.: {self.__class__.__name__}(name='...')." ) else: super().__init__(*args) # track original values of fields that are tracked for changes self._populate_tracked_fields() # TODO: refactor to use _TRACK_FIELDS track_current_name_value(self) # used in __init__ # populates the _original_values dictionary with the original values of the tracked fields def _populate_tracked_fields(self): if (track_fields := self._TRACK_FIELDS) is not None: concrete_attnames = {f.attname for f in self._meta.concrete_fields} self._original_values = {} for field_name in track_fields: if field_name not in concrete_attnames: raise FieldValidationError( f"_TRACK_FIELDS contains invalid field for {self.__class__.__name__}: {field_name}" ) # deferred model loading (e.g. .only("id") or certain fetching methods during deletion) # can omit tracked fields from __dict__; # use .get(..., DEFERRED) to avoid KeyError and to show that the field is not loaded yet. 
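# A hypothetical sketch of how the snapshot taken below is consumed later
# (the subclass and field are illustrative, not an actual registry configuration):
#
#     class MyRecord(SQLRecord):
#         _TRACK_FIELDS = ("name",)
#
#     record = MyRecord.get(name="old")
#     record.name = "new"
#     record._field_changed("name")   # True: current value differs from the snapshot
#
# Unsaved records and deferred (not-yet-loaded) fields report False.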
self._original_values[field_name] = self.__dict__.get( field_name, DEFERRED ) else: self._original_values = None def _field_changed(self, field_name: str, check_is_saved: bool = True) -> bool: """Check if the field has changed since the record was saved.""" # use _id fields for foreign keys in field_name if check_is_saved and self._state.adding: return False # check if the field is tracked for changes track_fields = self._TRACK_FIELDS assert track_fields is not None, ( "_TRACK_FIELDS must be set for the record to track changes" ) assert field_name in track_fields, ( f"Field {field_name} is not tracked for changes" ) # check if the field has changed since the record was created original_value = self._original_values.get(field_name, DEFERRED) if original_value is DEFERRED: return False current_value = self.__dict__.get(field_name, DEFERRED) if current_value is DEFERRED: return False return original_value != current_value def save(self: T, *args, **kwargs) -> T: """Save. Always saves to the default database. """ using_key = None if "using" in kwargs: using_key = kwargs["using"] transfer_config = kwargs.pop("transfer", None) db = self._state.db pk_on_db = self.pk artifacts: list = [] if self.__class__.__name__ == "Collection" and self.id is not None: # when creating a new collection without being able to access artifacts artifacts = self.ordered_artifacts.to_list() pre_existing_record = None # consider records that are being transferred from other databases transfer_logs: dict[str, list[str]] = { "mapped": [], "transferred": [], "run": None, } if db is not None and db != "default" and using_key is None: if isinstance(self, IsVersioned): if not self.is_latest: raise NotImplementedError( "You are attempting to transfer a record that's not the latest in its version history. This is currently not supported." ) pre_existing_record = transfer_to_default_db( self, using_key, transfer_logs=transfer_logs ) self._revises: IsVersioned if pre_existing_record is not None: init_self_from_db(self, pre_existing_record) else: # TODO: refactor to use _TRACK_FIELDS check_name_change(self) try: # save versioned record in presence of self._revises if isinstance(self, IsVersioned) and self._revises is not None: revises = self._revises with transaction.atomic(): # For branch-aware models (SQLRecord), keep source-branch latest # intact and only demote within the same branch. For other # versioned models (e.g. blocks), keep previous behavior. 
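# A hypothetical illustration of the rule applied below: if v1 (is_latest=True on
# branch "main") is revised by v2 saved on the same branch, v1 is demoted and v2
# becomes the latest version; if v2 is instead created on a contribution branch,
# v1 keeps is_latest=True on "main" and the two are reconciled on merge
# (see the Branch registry documented further down in this module).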
should_demote = True if hasattr(revises, "branch_id") and hasattr(self, "branch_id"): should_demote = revises.branch_id == self.branch_id if should_demote: assert revises.is_latest # noqa: S101 revises.is_latest = False revises._revises = None # ensure we don't start a recursion revises.save() super().save(*args, **kwargs) # type: ignore self._revises = None # save unversioned record else: super().save(*args, **kwargs) except (IntegrityError, ProgrammingError) as e: error_msg = str(e) # error for hash/uid duplication if ( self.__class__.__name__ in {"Transform", "Artifact", "Collection"} and isinstance(e, IntegrityError) and "hash" in error_msg and unique_constraint_error_in_error_message(error_msg) ): # we also need to include the key here because hash can be the same across keys query_fields = {"hash": self.hash, "key": self.key} if self.__class__.__name__ == "Artifact": # in case of artifact, also storage is needed query_fields["storage"] = self.storage # the get here is Django's get and not aware of the trash or other branches # but generally we bypass branch_id in queries for hash also in LaminDB's get() pre_existing_record = self.__class__.get(**query_fields) from_trash = ( "from trash" if pre_existing_record.branch_id == -1 else "" ) pre_existing_record.branch_id = 1 # move to default branch logger.warning( f"returning {self.__class__.__name__.lower()} {from_trash} with same hash & key: {pre_existing_record}" ) init_self_from_db(self, pre_existing_record) elif ( isinstance(e, IntegrityError) # for Storage, even if uid was in the error message, we can retrieve based on # the root because it's going to be the same root and any(field in error_msg for field in UNIQUE_FIELD_NAMES) and ( "_type_name_at_" not in error_msg ) # constraints for unique type names in Record, ULabel, etc. and ( "UNIQUE constraint failed" in error_msg or "duplicate key value violates unique constraint" in error_msg ) and hasattr(self, "branch_id") ): unique_fields = parse_violated_field_from_error_message(error_msg) # here we query against the all branches with .objects pre_existing_record = self.__class__.objects.get( **{field: getattr(self, field) for field in unique_fields} ) # if the existing record is in the default branch, we just return it if pre_existing_record.branch_id == 1: logger.warning( f"returning {self.__class__.__name__} record with same {unique_fields}: '{ {field: getattr(self, field) for field in unique_fields} }'" ) # if the existing record is in a different branch we update its fields else: # modifies the fields of the existing record with new values of self field_names = [i.name for i in self.__class__._meta.fields] update_attributes( pre_existing_record, {f: getattr(self, f) for f in field_names}, ) pre_existing_record.save() init_self_from_db(self, pre_existing_record) elif ( isinstance(e, ProgrammingError) and "new row violates row-level security policy" in error_msg and ( (is_locked := getattr(self, "is_locked", False)) or hasattr(self, "space") ) ): if is_locked: no_write_msg = "It is not allowed to modify or create locked ('is_locked=True') records." else: no_write_msg = ( f"You're not allowed to write to the space '{self.space.name}'.\n" "Please contact administrators of the space if you need write access." 
) raise NoWriteAccess(no_write_msg) from None elif ( isinstance(e, ProgrammingError) and "permission denied for table" in error_msg and (isettings := setup_settings.instance)._db_permissions == "public" ): slug = isettings.slug raise NoWriteAccess( f"You are trying to write to '{slug}' with public (read-only) permissions.\n" "Please contact administrators to make you a collaborator if you need write access.\n" f"If you are already a collaborator, please do 'lamin connect {slug}' in console, " "restart the python session and try again." ) from None else: raise # call the below in case a user makes more updates to the record track_current_name_value(self) # perform transfer of many-to-many fields # only supported for Artifact and Collection records if db is not None and db != "default" and using_key is None: if self.__class__.__name__ == "Collection": if len(artifacts) > 0: logger.info("transfer artifacts") for artifact in artifacts: artifact.save() self.artifacts.add(*artifacts) if hasattr(self, "labels") and transfer_config == "annotations": from copy import copy # here we go back to original record on the source database self_on_db = copy(self) self_on_db._state.db = db self_on_db.pk = pk_on_db # manually set the primary key self.features._add_from(self_on_db, transfer_logs=transfer_logs) self.labels.add_from(self_on_db, transfer_logs=transfer_logs) for k, v in transfer_logs.items(): if k != "run" and len(v) > 0: logger.important(f"{k}: {', '.join(v)}") if self.__class__.__name__ in { "Artifact", "Transform", "Run", "ULabel", "Feature", "Schema", "Collection", "Reference", } and not ( self.__class__.__name__ == "Artifact" and self.kind == "__lamindb_run__" ): import lamindb as ln if ln.context.project is not None: self.projects.add(ln.context.project) return self @class_and_instance_method def describe( cls_or_self, return_str: bool = False, include: None | Literal["comments"] = None, ) -> None | str: """Describe record including relations. Args: return_str: Return a string instead of printing. include: Include additional content. Use ``"comments"`` to display readme and comment blocks. 
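        Example (a minimal sketch; `artifact` stands for any saved record,
        e.g. an `ln.Artifact`)::

            artifact.describe()                         # print a rich description
            text = artifact.describe(return_str=True)   # capture it as a string
            artifact.describe(include="comments")       # also show readme & comment blocks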
""" from ._describe import describe_postgres_sqlite if isinstance(cls_or_self, type): return type(cls_or_self).describe(cls_or_self, return_str=return_str) # type: ignore else: return describe_postgres_sqlite( cls_or_self, return_str=return_str, include=include ) def __repr__( self: SQLRecord, include_foreign_keys: bool = True, exclude_field_names: list[str] | None = None, ) -> str: if exclude_field_names is None: exclude_field_names = ["id", "updated_at", "source_code"] field_names = [ field.name for field in self._meta.fields if ( not isinstance(field, ForeignKey) and field.name not in exclude_field_names ) ] if include_foreign_keys: field_names += [ f"{field.name}_id" for field in self._meta.fields if isinstance(field, ForeignKey) ] # TODO: harmonize with L426 in query_set.py if "created_at" in field_names: field_names.remove("created_at") field_names.append("created_at") if "is_locked" in field_names: field_names.remove("is_locked") field_names.append("is_locked") if "created_on" in field_names: field_names.remove("created_on") field_names.append("created_on") if "version_tag" in field_names: field_names.remove("version_tag") field_names.append("version_tag") if "is_latest" in field_names: field_names.remove("is_latest") field_names.append("is_latest") if field_names[0] != "uid" and "uid" in field_names: field_names.remove("uid") field_names.insert(0, "uid") fields_str = {} for k in field_names: if k == "n" and getattr(self, k) < 0: # only needed for Schema continue if ( not k.startswith("_") or (k == "_dtype_str" and self.__class__.__name__ == "Feature") ) and hasattr(self, k): value = getattr(self, k) # Force strip the time component of the version if k == "version" and value: fields_str[k] = f"'{str(value).split()[0]}'" else: fields_str[k] = format_field_value(value) fields_joined_str = ", ".join( [f"{k}={fields_str[k]}" for k in fields_str if fields_str[k] is not None] ) return f"{self.__class__.__name__}({fields_joined_str})" def __str__(self) -> str: return self.__repr__() def delete(self, permanent: bool | None = None): """Delete. Args: permanent: For consistency, `False` raises an error, as soft delete is impossible. Returns: When `permanent=True`, returns Django's delete return value: a tuple of (deleted_count, {registry_name: count}). Otherwise returns None. """ if permanent is False: raise ValueError( f"Soft delete is not possible for {self.__class__.__name__}, " "use 'permanent=True' or 'permanent=None' for permanent deletion." ) return delete_record(self, is_soft=False) class Space(BaseSQLRecord): """Spaces with managed access for specific users or teams. If not setting a space, a :class:`~lamindb.models.SQLRecord` object is accessible to all collaborators of the LaminDB instance because its :attr:`~lamindb.models.SQLRecord.space` field defaults to the built-in `all` space. You can create a restricted space through LaminHub either on the instance settings page or the *Spaces* tab of your account page. Examples: After creating a restricted space through LaminHub, create an artifact in the space:: space = ln.Space.get(name="Our space") # get a space ln.Artifact("./test.txt", key="test.txt", space=space).save() # save artifact in space You can also move an existing object into a space:: space = ln.Space.get(name="Our space") # select a space record = ln.Record.get(name="existing label") record.space = space record.save() # saved in space "Our space" For more examples and background, see :doc:`docs:permissions`, in particular, section :ref:`docs:use-a-restricted-space`. 
Notes: All data in this registry is synchronized from LaminHub so that spaces can be shared and reused across multiple LaminDB instances. """ class Meta: app_label = "lamindb" constraints = [ models.UniqueConstraint(Lower("name"), name="unique_space_name_lower") ] id: int = models.SmallAutoField(primary_key=True) """Internal id, valid only in one DB instance.""" name: str = models.CharField(max_length=100, db_index=True) """Name of space.""" uid: str = CharField( editable=False, unique=True, max_length=12, default=base62_12, db_index=True, ) """Universal id.""" description: str | None = TextField(null=True) """Description of space.""" created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = ForeignKey( "User", CASCADE, default=None, related_name="+", null=True ) """Creator of space.""" ablocks: RelatedManager[SpaceBlock] """Attached blocks ← :attr:`~lamindb.SpaceBlock.space`.""" @overload def __init__( self, name: str, description: str | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if not args and "uid" not in kwargs: warn = False msg = "" isettings = setup_settings.instance if (dialect := isettings.dialect) != "postgresql": warn = True msg = f"on {dialect} databases" elif not isettings.is_on_hub: warn = True msg = "on local instances" if warn: logger.warning( f"creating spaces manually {msg} is possible for demo purposes, " "but does *not* affect access permissions" ) super().__init__(*args, **kwargs) class Branch(BaseSQLRecord): """Branches for change management with archive and trash states. .. dropdown:: The 3 built-in branches: `main`, `trash` & `archive` The `main` branch acts as the default branch. The `trash` branch acts like a trash bin on a file system. If you delete a `SQLRecord` object via `.delete()`, it gets moved onto the `trash` branch and scheduled for deletion. The `archive` branch hides objects from queries and searches without scheduling them for deletion. To move an object into the archive, run: `obj.branch_id = 0; obj.save()`. Args: name: A unique name. When lower-cased, is constrained to be unique across all branches. description: A description. Examples: To create a contribution branch and switch to it, run:: lamin switch -c my_branch To merge a contribution branch into `main`, run:: lamin switch main # switch to the main branch lamin merge my_branch # merge contribution branch into main To see the current branch along with other information, run:: lamin info To annotate the current branch with a `README.md`, run:: lamin annotate branch --readme README.md To comment on the current branch, run:: lamin annotate branch --comment "I think we should revisit this, tomorrow, WDYT?" To describe the current branch (optionally include comments), run:: lamin describe branch --include comments To trace on which branch a `SQLRecord` object was created, run:: sqlrecord.created_on.describe() To open a Change Request for a branch, run: .. tab-set:: .. tab-item:: CLI .. code-block:: bash lamin update branch --status draft # for current branch lamin update branch --name my_branch --status review # for any branch .. tab-item:: Python .. code-block:: python branch = ln.Branch.get(name="my_branch") branch.status = "draft" branch.save() branch.status = "review" branch.save() Just like Pull Requests on GitHub, branches are never deleted so that the provenance of a change stays traceable. ..
dropdown:: Managing `is_latest` during branching `is_latest` is branch-aware during development and reconciled on merge. - Creating a new version on a contribution branch keeps the previous version on `main` as `is_latest=True`. - After `lamin merge`, only one object per version family remains with `is_latest=True` in the target branch. - If both source and target branches have `is_latest=True`, the merged branch keeps the newest object by `created_at`. Example flow:: # before merge # main: v1.is_latest=True # contribution branch: v2(revises=v1).is_latest=True lamin switch main lamin merge my_branch # after merge on main: v2.is_latest=True, v1.is_latest=False .. dropdown:: Logical vs. physical branching LaminDB uses **logical branching** via `SQLRecord`'s `.branch` field, treating `branch` like any other field during queries & tracing, and keeping infrastructure simple and platform-agnostic. However, it doesn't allow isolating SQL `UPDATE` statements on a branch (only their corresponding `DbWrite` events). Here are some notable alternatives: - Some Postgres platforms like Supabase or Neon, by contrast, provide physical branching through cloning entire databases. This allows for isolated SQL `UPDATE` statements but creates separate, disconnected environments and much overhead. - Project Nessie is a versioned catalog for data lakes that tracks file states. LaminDB is analogous to Nessie in that it also treats branching on the metadata catalog level (considering LaminDB's SQL database as the metadata catalog). - Dolt is a specialized database engine that provides storage-level branching. It allows branch isolation and merging at the engine level. While powerful, it requires using the Dolt database itself. Why logical branching? Data science and ML workflows are primarily append-only. Because a "change" usually results in a new version of an artifact, transform, or collection or new runs or other new objects rather than an in-place modification, the row-level `branch` field provides isolation for 99% of use cases. This avoids the technical complexity of row duplication, preserves database integrity, and allows the `is_latest` logic to reconcile versions globally upon merge. """ class Meta: app_label = "lamindb" constraints = [ models.UniqueConstraint(Lower("name"), name="unique_branch_name_lower") ] # below isn't fully implemented but a roadmap # - 3: template (hidden in queries & searches) # - 2: locked (same as default, but locked for edits except for space admins) # - 1: default (visible in queries & searches) # - 0: archive (hidden, meant to be kept, locked for edits for everyone) # - -1: trash (hidden, scheduled for deletion) # An integer higher than >3 codes a branch that can be used for collaborators to create drafts # that can be merged onto the main branch in an experience akin to a Pull Request. The mapping # onto a semantic branch name is handled through LaminHub. id: int = models.AutoField(primary_key=True) """An integer id that's synchronized for a family of coupled database instances. Among all LaminDB instances, this id is arbitrary and non-unique. """ name: str = models.CharField(max_length=100, db_index=True) """Name of branch.""" uid: str = CharField( editable=False, unique=True, max_length=12, default=base62_12, db_index=True, ) """Universal id. This id is useful if one wants to apply the same patch to many database instances. 
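    For example, the same branch can be looked up in every instance via its uid (a sketch; the uid value is made up)::

        branch = ln.Branch.get(uid="aBcDeFgH1234")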
""" space: Space = ForeignKey(Space, PROTECT, default=1, db_default=1, related_name="+") """The space associated with the branch.""" description: str | None = TextField(null=True) """Description of branch.""" created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = ForeignKey( "User", PROTECT, default=current_user_id, related_name="+" ) """Creator of branch.""" _status_code: int = models.SmallIntegerField(default=0, db_default=0, db_index=True) """Status code. -2: closed; -1: merged; 0: standalone; 1: draft; 2: review.""" _aux: dict[str, Any] | None = JSONField(default=None, db_default=None, null=True) """Auxiliary field for dictionary-like metadata.""" ablocks: RelatedManager[BranchBlock] """Attached blocks ← :attr:`~lamindb.BranchBlock.branch`.""" users: RelatedManager[User] = models.ManyToManyField( "User", through="BranchUser", related_name="branches", ) """Users linked to this branch (e.g. reviewers) ← :attr:`~lamindb.User.branches`.""" ulabels: RelatedManager[ULabel] = models.ManyToManyField( "ULabel", through="BranchULabel", related_name="branches", ) """ULabels annotating this branch ← :attr:`~lamindb.BranchULabel.ulabel`.""" projects: RelatedManager[Project] = models.ManyToManyField( "Project", through="BranchProject", related_name="branches", ) """Projects annotating this branch ← :attr:`~lamindb.BranchProject.project`.""" @property def status(self) -> BranchStatus: """Branch status. Get and set the status of the branch. ============= ===== ================================================== status code description ============= ===== ================================================== `closed` -2 Change Request was closed without merging. `merged` -1 The branch was merged into another branch. `standalone` 0 A standalone branch without Change Request. `draft` 1 Change Request exists but is not ready for review. `review` 2 Change Request is ready for review. ============= ===== ================================================== The database stores the branch status as an integer code in field `_status_code`. Example: See the status of a branch:: branch.status #> 'standalone' Open a Change Request in draft state:: branch.status = "draft" branch.save() Request review for the Change Request:: branch.status = "review" branch.save() Query by status:: ln.Branch.filter(status="merged").to_dataframe() """ return BRANCH_CODE_TO_STATUS.get(self._status_code, "standalone") @status.setter def status(self, value: BranchStatus) -> None: if value not in BRANCH_STATUS_TO_CODE: raise ValueError( "Invalid branch status. Expected one of: " "'standalone', 'draft', 'review', 'merged', 'closed'." ) self._status_code = BRANCH_STATUS_TO_CODE[value] @overload def __init__( self, name: str, description: str | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): super().__init__(*args, **kwargs) class BranchUser(BaseSQLRecord, IsLink): class Meta: app_label = "lamindb" unique_together = ("branch", "user", "role") id: int = models.BigAutoField(primary_key=True) branch: Branch = ForeignKey(Branch, CASCADE, related_name="links_user") user: User = ForeignKey("User", PROTECT, related_name="links_branch") role: str = CharField(max_length=32, db_index=True) @doc_args(RECORD_REGISTRY_EXAMPLE) class SQLRecord(BaseSQLRecord, metaclass=Registry): """An object that maps to a row in a SQL table in the database. 
For the inherited `SQLRecord` class method definitions, see :class:`~lamindb.models.BaseSQLRecord`. Every `SQLRecord` is a data model that comes with a registry in form of a SQL table in your database. Sub-classing `SQLRecord` creates a new registry while instantiating a `SQLRecord` creates a new object. {} `SQLRecord`'s metaclass is :class:`~lamindb.models.Registry`. `SQLRecord` inherits from Django's `Model` class. Why does LaminDB call it `SQLRecord` and not `Model`? The term `SQLRecord` can't lead to confusion with statistical, machine learning or biological models. """ # we need the db_default when not interacting via django directly on a required field branch: Branch = ForeignKey( Branch, PROTECT, default=1, db_default=1, related_name="+", ) """The current branch of the object - changes e.g. on merge events.""" created_on: Branch = ForeignKey( Branch, PROTECT, default=1, db_default=1, related_name="+", ) """The branch on which this object was created - never changes.""" space: Space = ForeignKey(Space, PROTECT, default=1, db_default=1, related_name="+") """The space.""" is_locked: bool = BooleanField(default=False, db_default=False) """Whether the object is locked for edits.""" _aux: dict[str, Any] | None = JSONField(default=None, db_default=None, null=True) """Auxiliary field for dictionary-like metadata.""" class Meta: abstract = True def restore(self) -> None: """Restore from trash onto the main branch. Does **not** restore descendant objects if the object is `HasType` with `is_type = True`. """ self.branch_id = 1 self.save() def delete(self, permanent: bool | None = None, **kwargs): """Delete object. If object is `HasType` with `is_type = True`, deletes all descendant objects, too. Args: permanent: Whether to permanently delete the object (skips trash). If `None`, performs soft delete if the object is not already in the trash. Returns: When `permanent=True`, returns Django's delete return value: a tuple of (deleted_count, {registry_name: count}). Otherwise returns None. Examples: For any `SQLRecord` object `sqlrecord`, call:: sqlrecord.delete() """ if self._state.adding: logger.warning("record is not yet saved, delete has no effect") return None name_with_module = self.__class__.__get_name_with_module__() if name_with_module == "Artifact": # this first check means an invalid delete fails fast rather than cascading through # database and storage permission errors isettings = setup_settings.instance if self.storage.instance_uid != isettings.uid and ( kwargs["storage"] or kwargs["storage"] is None ): from ..errors import IntegrityError from .storage import Storage raise IntegrityError( "Cannot simply delete artifacts outside of this instance's managed storage locations." "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`" f"\n(2) If you want to delete the artifact in storage, please connect to the writing lamindb instance (uid={self.storage.instance_uid})." 
f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).to_dataframe()}" ) # change branch_id to trash trash_branch_id = -1 if self.branch_id > trash_branch_id and permanent is not True: if isinstance(self, HasType) and self.is_type: for child in getattr( self, f"query_{self.__class__.__name__.lower()}s" )(): child.delete() delete_record(self, is_soft=True) logger.important(f"moved record to trash: {self}") return None # permanent delete if permanent is None: object_type_name = self.__class__.__name__ log_identifier = self.uid if hasattr(self, "uid") else self.pk response = input( f"{object_type_name} {log_identifier} is already in trash! Are you sure you want to delete it from your" " database? You can't undo this action. (y/n) " ) confirm_delete = response == "y" else: confirm_delete = permanent if confirm_delete: if name_with_module == "Run": from .run import _permanent_delete_runs _permanent_delete_runs(self) return None if name_with_module == "Transform": from .transform import _permanent_delete_transforms _permanent_delete_transforms(self) return None if name_with_module == "Artifact": from .artifact import delete_permanently delete_permanently( self, storage=kwargs["storage"], using_key=kwargs["using_key"] ) return None return super().delete() return None def _format_django_validation_error(record: SQLRecord, e: DjangoValidationError): """Pretty print Django validation errors.""" errors = {} if hasattr(e, "error_dict"): error_dict = e.error_dict else: error_dict = {"__all__": e.error_list} for field_name, error_list in error_dict.items(): for error in error_list: if hasattr(error, "message"): msg = error.message else: msg = str(error) if field_name == "__all__": errors[field_name] = f"{colors.yellow(msg)}" else: current_value = getattr(record, field_name, None) errors[field_name] = ( f"{field_name}: {colors.yellow(current_value)} is not valid\n → {msg}" ) if errors: message = "\n " for _, error in errors.items(): message += error + "\n " return message def _get_record_kwargs(record_class) -> list[tuple[str, str]]: """Gets the parameters of a SQLRecord from the overloaded signature. Example: >>> get_record_params(bt.Organism) >>> [('name', 'str'), ('taxon_id', 'str | None'), ('scientific_name', 'str | None')] """ source = inspect.getsource(record_class) # Find first overload that's not *db_args pattern = r"@overload\s+def __init__\s*\(([\s\S]*?)\):\s*\.{3}" overloads = re.finditer(pattern, source) for single_overload in overloads: params_block = single_overload.group(1) # This is an additional safety measure if the overloaded signature that we're # looking for is not at the top but a "db_args" constructor if "*db_args" in params_block: continue params = [] for line in params_block.split("\n"): line = line.strip() if not line or "self" in line: continue # Extract name and type annotation # The regex pattern finds parameter definitions like: # Simple: name: str # With default: age: int = 0 # With complex types: items: List[str] = [] param_pattern = ( r"(\w+)" # Parameter name r"\s*:\s*" # Colon with optional whitespace r"((?:[^=,]|" # Type hint: either non-equals/comma chars r"(?<=\[)[^[\]]*" # or contents within square brackets r"(?=\]))+)" # looking ahead for closing bracket r"(?:\s*=\s*" # Optional default value part r"([^,]+))?" 
# Default value: anything but comma ) match = re.match(param_pattern, line) if not match: continue name, type_str = match.group(1), match.group(2).strip() # Keep type as string instead of evaluating params.append((name, type_str)) return params return [] def get_name_field( registry: type[SQLRecord] | QuerySet | Manager, *, field: StrField | None = None, ) -> str: """Get the 1st char or text field from the registry.""" if isinstance(registry, (QuerySet, Manager)): registry = registry.model model_field_names = [i.name for i in registry._meta.fields] # set to default name field if field is None: if hasattr(registry, "_name_field"): field = registry._meta.get_field(registry._name_field) elif "name" in model_field_names: field = registry._meta.get_field("name") else: # first char or text field that doesn't contain "id" for i in registry._meta.fields: if "id" in i.name: continue if i.get_internal_type() in {"CharField", "TextField"}: field = i break # no default name field can be found if field is None: raise ValueError( f"Do not know which field to use as name file for registry {registry}, please pass field" ) else: field = field.name # type:ignore if not isinstance(field, str): try: field = field.field.name except AttributeError: raise TypeError( "please pass a SQLRecord string field, e.g., `CellType.name`!" ) from None return field def add_db_connection(db: str, using: str): db_config = dj_database_url.config( default=db, conn_max_age=600, conn_health_checks=True ) db_config["TIME_ZONE"] = "UTC" db_config["OPTIONS"] = {} db_config["AUTOCOMMIT"] = True connections.settings[using] = db_config REGISTRY_UNIQUE_FIELD = {"storage": "root", "ulabel": "name"} def update_fk_to_default_db( records: SQLRecord | list[SQLRecord] | QuerySet, fk: str, using_key: str | None, transfer_logs: dict, ): # here in case it is an iterable, we are checking only a single record # and set the same fks for all other records because we do this only # for certain fks where they have to the same for the whole bulk # see transfer_fk_to_default_db_bulk # todo: but this has to be changed i think, it is not safe as it is now - Sergei record = records[0] if isinstance(records, (list, QuerySet)) else records if getattr(record, f"{fk}_id", None) is not None: # set the space of the transferred record to the current space if fk == "space": # for space we set the record's space to the current space from lamindb import context # the default space has id=1 fk_record_default = Space.get(1) if context.space is None else context.space # process non-space fks else: fk_record = getattr(record, fk) field = REGISTRY_UNIQUE_FIELD.get(fk, "uid") fk_record_default = fk_record.__class__.filter( **{field: getattr(fk_record, field)} ).one_or_none() if fk_record_default is None: from copy import copy fk_record_default = copy(fk_record) transfer_to_default_db( fk_record_default, using_key, save=True, transfer_logs=transfer_logs ) # re-set the fks to the newly saved ones in the default db if isinstance(records, (list, QuerySet)): for r in records: setattr(r, f"{fk}", None) setattr(r, f"{fk}_id", fk_record_default.id) else: setattr(records, f"{fk}", None) setattr(records, f"{fk}_id", fk_record_default.id) FKBULK = [ "organism", "source", "report", # Run ] def transfer_fk_to_default_db_bulk( records: list | QuerySet, using_key: str | None, transfer_logs: dict ): for fk in FKBULK: update_fk_to_default_db(records, fk, using_key, transfer_logs=transfer_logs) def get_transfer_run(record) -> Run: from lamindb import settings from lamindb.core._context 
import context from lamindb.models import Run, Transform from lamindb.models.artifact import WARNING_RUN_TRANSFORM slug = record._state.db owner, name = get_owner_name_from_identifier(slug) cache_using_filepath = ( ln_setup.settings.cache_dir / f"instance--{owner}--{name}--uid.txt" ) if not cache_using_filepath.exists(): raise SystemExit("Need to call .connect() before") instance_uid = cache_using_filepath.read_text().split("\n")[0] # TODO: consider renaming to __lamindb_sync__ key = f"__lamindb_transfer__/{instance_uid}" uid = instance_uid + "0000" transform = Transform.filter(uid=uid).one_or_none() if transform is None: search_names = settings.creation.search_names settings.creation.search_names = False # TODO: consider renaming to "Sync from" transform = Transform( # type: ignore uid=uid, description=f"Transfer from `{slug}`", key=key, kind="function" ).save() settings.creation.search_names = search_names # use the global run context to get the initiated_by_run run id if context.run is not None: initiated_by_run = context.run else: if not settings.creation.artifact_silence_missing_run_warning: logger.warning(WARNING_RUN_TRANSFORM) initiated_by_run = None # it doesn't seem to make sense to create new runs for every transfer run = Run.filter(transform=transform, initiated_by_run=initiated_by_run).first() if run is None: run = Run(transform=transform, initiated_by_run=initiated_by_run).save() # type: ignore run.initiated_by_run = initiated_by_run # so that it's available in memory return run def transfer_to_default_db( record: SQLRecord, using_key: str | None, *, transfer_logs: dict, save: bool = False, transfer_fk: bool = True, ) -> SQLRecord | None: if record._state.db is None or record._state.db == "default": return None registry = record.__class__ logger.debug(f"transferring {registry.__name__} record {record.uid} to default db") record_on_default = registry.objects.filter(uid=record.uid).one_or_none() record_str = f"{record.__class__.__name__}(uid='{record.uid}')" if transfer_logs["run"] is None: transfer_logs["run"] = get_transfer_run(record) if record_on_default is not None: transfer_logs["mapped"].append(record_str) return record_on_default else: transfer_logs["transferred"].append(record_str) if hasattr(record, "created_by_id"): record.created_by = None record.created_by_id = ln_setup.settings.user.id # run & transform run = transfer_logs["run"] if hasattr(record, "run_id"): record.run = None record.run_id = run.id # deal with denormalized transform FK on artifact and collection if hasattr(record, "transform_id"): record.transform = None record.transform_id = run.transform_id # transfer other foreign key fields fk_fields = [ i.name for i in record._meta.fields if i.get_internal_type() == "ForeignKey" if i.name not in {"created_by", "run", "transform", "branch"} ] if not transfer_fk: # don't transfer fk fields that are already bulk transferred fk_fields = [fk for fk in fk_fields if fk not in FKBULK] for fk in fk_fields: update_fk_to_default_db(record, fk, using_key, transfer_logs=transfer_logs) record.id = None record._state.db = "default" if save: record.save() return None def track_current_name_value(record: SQLRecord): # below, we're using __dict__ to avoid triggering the refresh from the database # which can lead to a recursion if hasattr(record, "_name_field"): record._old_name = record.__dict__.get(record._name_field) def check_name_change(record: SQLRecord): """Warns if a record's name has changed.""" from lamindb.models import ( Artifact, Collection, Feature, Schema, 
Storage, Transform, ) if ( not record.pk or not hasattr(record, "_old_name") or not hasattr(record, "_name_field") ): return # key-like records are not checked here if isinstance(record, (Artifact, Collection, Transform)): return # renaming feature sets is not checked if isinstance(record, Schema): return old_name = record._old_name new_name = getattr(record, record._name_field) registry = record.__class__.__name__ if old_name != new_name: if hasattr(record, "artifacts") and not isinstance(record, Storage): linked_records = ( # find all artifacts that are linked to this label via a feature with dtype # matching on the name aka "[registry]" record.artifacts.through.filter( feature___dtype_str__contains=f"[{registry}]", **{f"{registry.lower()}_id": record.pk}, ) ) artifact_uids = list(set(linked_records.to_list("artifact__uid"))) n = len(artifact_uids) if n > 0: s = "s" if n > 1 else "" es = "es" if n == 1 else "" logger.error( f"by {colors.red('renaming label')} from '{old_name}' to '{new_name}' " f"{n} artifact{s} no longer match{es} the label name in storage: {artifact_uids}\n\n" f" → consider re-curating\n" ) elif isinstance(record, Feature): # only internal features of schemas with `itype=Feature` are prone to getting out of sync artifact_uids = Artifact.filter( schemas__features=record, schemas__itype="Feature" ).to_list("uid") n = len(artifact_uids) if n > 0: s = "s" if n > 1 else "" es = "es" if n == 1 else "" logger.warning( f"by {colors.red('renaming feature')} from '{old_name}' to '{new_name}' " f"{n} artifact{s} no longer match{es} the feature name in storage: {artifact_uids}\n" " → consider re-curating" ) def format_field_value(value: datetime | str | Any, none: str = "None") -> str: from datetime import datetime if isinstance(value, datetime): return value.strftime("%Y-%m-%d %H:%M:%S %Z") if isinstance(value, str): try: value = datetime.fromisoformat(value) value = value.strftime("%Y-%m-%d %H:%M:%S %Z") except ValueError: pass return f"'{value}'" if value is None: return none return str(value) class SQLRecordInfo: def __init__(self, registry: Registry): self.registry = registry def _get_type_for_field(self, field_name: str) -> str: field = self.registry._meta.get_field(field_name) related_model_name = ( field.related_model.__name__ if hasattr(field, "related_model") and field.related_model else None ) return related_model_name if related_model_name else field.get_internal_type() def _get_base_class_fields(self) -> list[str]: return [ field.name for base in self.registry.__bases__ if hasattr(base, "_meta") for field in base._meta.get_fields() ] def _reorder_fields_by_class(self, fields_to_order: list[Field]) -> list[Field]: """Reorders the fields so that base class fields come last.""" non_base_class_fields = [ field for field in fields_to_order if field.name not in self._get_base_class_fields() ] found_base_class_fields = [ field for field in fields_to_order if field.name in self._get_base_class_fields() ] return non_base_class_fields + found_base_class_fields def get_simple_fields(self, return_str: bool = False) -> Any: simple_fields = [ field for field in self.registry._meta.get_fields() if not ( isinstance(field, ManyToOneRel) or isinstance(field, ManyToManyRel) or isinstance(field, ManyToManyField) or isinstance(field, ForeignKey) or field.name.startswith("_") or field.name == "id" ) ] simple_fields = self._reorder_fields_by_class(simple_fields) if not return_str: return simple_fields else: repr_str = f" {colors.italic('Simple fields')}\n" if simple_fields: repr_str += 
"".join( [ f" .{field_name.name}: {self._get_type_for_field(field_name.name)}\n" for field_name in simple_fields ] ) return repr_str def get_relational_fields(self, return_str: bool = False): # we ignore ManyToOneRel because it leads to so much clutter in the API # also note that our general guideline is to have related_name="+" # for ForeignKey fields relational_fields = (ManyToOneRel, ManyToManyRel, ManyToManyField, ForeignKey) class_specific_relational_fields = [ field for field in self.registry._meta.fields + self.registry._meta.many_to_many if isinstance(field, relational_fields) and not field.name.startswith(("links_", "_")) ] non_class_specific_relational_fields = [ field for field in self.registry._meta.get_fields() if isinstance(field, relational_fields) and not field.name.startswith(("links_", "_")) ] non_class_specific_relational_fields = self._reorder_fields_by_class( non_class_specific_relational_fields ) # Ensure that class specific fields (e.g. Artifact) come before non-class specific fields (e.g. collection) filtered_non_class_specific = [ field for field in non_class_specific_relational_fields if field not in class_specific_relational_fields ] ordered_relational_fields = ( class_specific_relational_fields + filtered_non_class_specific ) # For Record class, move linked_in fields to the end if self.registry.__name__ == "Record": regular_fields = [ f for f in ordered_relational_fields if not f.name.startswith(("linked_", "values_")) ] linked_fields = [ f for f in ordered_relational_fields if f.name.startswith("linked_") ] values_fields = [ f for f in ordered_relational_fields if f.name.startswith("values_") ] ordered_relational_fields = regular_fields + linked_fields + values_fields core_module_fields = [] external_modules_fields = [] for field in ordered_relational_fields: field_name = repr(field).split(": ")[1][:-1] if field_name.count(".") == 1 and "lamindb" not in field_name: external_modules_fields.append(field) else: core_module_fields.append(field) def _get_related_field_type(field) -> str: model_name = field.related_model.__get_name_with_module__() # Extract the class name (after the last dot if there's a module prefix) class_name = model_name.split(".")[-1] # Skip replacement for compound names like ArtifactBlock, FeatureBlock, etc. 
if class_name.endswith("Block"): # Return just the class name for Block types field_type = class_name else: field_type = ( model_name.replace( "Artifact", "" ).replace( # some fields have an unnecessary 'Artifact' in their name "Collection", "" ) # some fields have an unnecessary 'Collection' in their name ) return ( self._get_type_for_field(field.name) if not field_type.strip() else field_type ) core_module_fields_formatted = [ f" .{field.name}: {_get_related_field_type(field)}\n" for field in core_module_fields ] external_modules_fields_formatted = [ f" .{field.name}: {_get_related_field_type(field)}\n" for field in external_modules_fields ] if not return_str: external_modules_fields_by_modules = defaultdict(list) for field_str, field in zip( external_modules_fields_formatted, external_modules_fields ): field_type = field_str.split(":")[1].split()[0] module_name = field_type.split(".")[0] external_modules_fields_by_modules[module_name].append(field) return core_module_fields, external_modules_fields_by_modules else: repr_str = "" # Non-external relational fields if core_module_fields: repr_str += f" {colors.italic('Relational fields')}\n" repr_str += "".join(core_module_fields_formatted) # External relational fields external_modules = set() for field in external_modules_fields_formatted: field_type = field.split(":")[1].split()[0] external_modules.add(field_type.split(".")[0]) if external_modules: # We want Bionty to show up before other modules external_modules = ( ["bionty"] + sorted(external_modules - {"bionty"}) # type: ignore if "bionty" in external_modules else sorted(external_modules) ) for ext_module in external_modules: ext_module_fields = [ field for field in external_modules_fields_formatted if ext_module in field ] if ext_module_fields: repr_str += ( f" {colors.italic(f'{ext_module.capitalize()} fields')}\n" ) repr_str += "".join(ext_module_fields) return repr_str class Migration(BaseSQLRecord): app = CharField(max_length=255) name = CharField(max_length=255) applied: datetime = DateTimeField() class Meta: db_table = "django_migrations" app_label = "lamindb" managed = False LinkORM = IsLink # backward compat Record = SQLRecord # backward compat BasicRecord = BaseSQLRecord # backward compat RecordInfo = SQLRecordInfo # backward compat ================================================ FILE: lamindb/models/storage.py ================================================ from __future__ import annotations from typing import ( TYPE_CHECKING, overload, ) from uuid import UUID from django.db import models from lamin_utils import logger from lamindb_setup import settings as setup_settings from lamindb_setup.core._hub_core import ( delete_storage_record, get_storage_records_for_instance, select_space, update_storage_with_space, ) from lamindb_setup.core._settings_storage import ( StorageSettings, get_storage_type, init_storage, ) from lamindb_setup.core.upath import check_storage_is_empty, create_path from lamindb.base.fields import ( CharField, TextField, ) from ..base.uids import base62_12 from .run import TracksRun, TracksUpdates from .sqlrecord import Space, SQLRecord if TYPE_CHECKING: from lamindb_setup.types import StorageType from upath import UPath from .artifact import Artifact class Storage(SQLRecord, TracksRun, TracksUpdates): """Storage locations of artifacts such as local directories or S3 buckets. A storage location is either a directory (local or a folder in the cloud) or an entire S3/GCP bucket. 
A storage location is written to by at most one LaminDB instance: the location’s *managing instance*. Some locations are not managed with LaminDB and, hence, do not have a managing instance. .. dropdown:: Writable vs. read-only storage locations The `instance_uid` field of `Storage` defines its *managing instance*. Only if a storage location's `instance_uid` matches your current instance's `uid` (`ln.settings.instance_uid`), you can write to it. All other storage locations are read-only in your current instance. Here is an example (`source `__). .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/eHDmIOAxLEoqZ2oK0000.png :width: 400px Some storage locations are not managed by any LaminDB instance, hence, their `instance_uid` is `None`. .. dropdown:: Managing access to storage locations across instances You can manage access through LaminHub's fine-grained access management or through AWS policies that you attach to your S3 bucket. To enable access management via LaminHub, head over to `https://lamin.ai/{account}/infrastructure`. By clicking the green button that says "Connect S3 bucket", your collaborators will access data based on their LaminHub permissions. :doc:`docs:permissions` has more details. .. image:: https://lamin-site-assets.s3.amazonaws.com/.lamindb/ze8hkgVxVptSSZEU0000.png :width: 800px By default, a storage location inherits the access permissions of its instance. If you want to further restrict access to a storage location, you can move it into a space:: space = ln.Space.get(name="my-space") storage_loc = ln.Storage.get(root="s3://my-storage-location") storage_loc.space = space storage_loc.save() If you don't want to store data in the cloud, you can use local storage locations: :doc:`faq/keep-artifacts-local`. Args: root: `str` The root path of the storage location, e.g., `"./mydir"`, `"s3://my-bucket"`, `"s3://my-bucket/myfolder"`, `"gs://my-bucket/myfolder"`, `"/nfs/shared/datasets/genomics"`, `"/weka/shared/models/"`, ... description: `str | None = None` An optional description. space: `Space | None = None` A space to restrict access permissions to the storage location. host: `str | None = None` For local storage locations, a globally unique identifier for the physical machine/server hosting the storage. This distinguishes storage locations that may have the same local path but exist on different servers, e.g. `"my-institute-cluster-1"`, `"my-server-abcd"`. See Also: :attr:`lamindb.core.Settings.storage` Current default storage location of your compute session for writing artifacts. :attr:`~lamindb.setup.core.StorageSettings` Storage settings. :doc:`faq/keep-artifacts-local` Avoid storing artifacts in the cloud, but keep them on local infrastructure. Examples: When you create a LaminDB instance, you configure its default storage location via `--storage`:: lamin init --storage ./mydatadir # or "s3://my-bucket/myfolder", "gs://my-bucket/myfolder", ... View the current default storage location for writing artifacts:: import lamindb as ln ln.settings.storage Create a new cloud storage location:: ln.Storage(root="s3://our-bucket/our-folder").save() Create a new local storage location:: ln.Storage(root="/dir/our-shared-dir", host="our-server-123").save() Globally switch to another storage location:: ln.settings.storage = "/dir/our-shared-dir" # or "s3://our-bucket/our-folder", "gs://our-bucket/our-folder", ... 
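        Check whether a registered location is writable from the current instance (a sketch; relies on the ``instance_uid`` mechanism described above)::

            loc = ln.Storage.get(root="s3://our-bucket/our-folder")
            is_writable = loc.instance_uid == ln.settings.instance_uid  # True only if the current instance manages the location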
Or if you're operating in `keep-artifacts-local` mode (:doc:`faq/keep-artifacts-local`):: ln.settings.local_storage = "/dir/our-other-shared-dir" View all storage locations used in your LaminDB instance:: ln.Storage.to_dataframe() Notes: .. dropdown:: What is the `.lamindb/` directory inside a storage location? It stores all artifacts that are ingested through `lamindb`, indexed by the artifact `uid`. This means you don't have to worry about renaming or moving files, as this all happens on the database level. Existing artifacts are typically stored in hierarchical structures with semantic folder names. Instead of copying such artifacts into `.lamindb/` upon calls of `Artifact("legacy_path").save()`, LaminDB registers them with the semantic `key` representing the relative path within the storage location. These artifacts are marked with `artifact._key_is_virtual = False` and treated correspondingly. There is only a single `.lamindb/` directory per storage location. .. dropdown:: What should I do if I want to bulk migrate all artifacts to another storage? Currently, you can only achieve this manually and you should be careful with it. 1. Copy or move artifacts into the desired new storage location 2. Adapt the corresponding record in the {class}`~lamindb.Storage` registry by setting the `root` field to the new location 3. If your LaminDB storage location is connected to the hub, you also need to update the storage record on the hub """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" _name_field: str = "root" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, max_length=12, default=base62_12, db_index=True ) """Universal id, valid across DB instances.""" root: str = CharField(db_index=True, unique=True) """Root path of storage (cloud or local path).""" description: str | None = TextField(null=True) """A description.""" type: StorageType = CharField(max_length=30, db_index=True) """Can be "local" vs. "s3" vs. "gs". Is auto-detected from the format of the `root` path.""" region: str | None = CharField(max_length=64, db_index=True, null=True) """Storage region for cloud storage locations. Host identifier for local storage locations.""" instance_uid: str | None = CharField(max_length=12, db_index=True, null=True) """The writing instance. Only the LaminDB instance with this `uid` can write to this storage location. This instance also governs the access permissions of the storage location unless the location is moved into a space. """ artifacts: Artifact """Artifacts contained in this storage location.""" @overload def __init__( self, root: str, *, description: str | None = None, space: Space | None = None, host: str | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args) self._old_space_id = self.space_id return None if args: assert len(args) == 1, ( # noqa: S101 "Storage can only be initialized with a single positional argument, the root path." 
) kwargs["root"] = args[0] if "host" in kwargs: if "type" in kwargs: assert kwargs["type"] == "local", ( # noqa: S101 "type needs to be 'local' if host is set" ) else: kwargs["type"] = "local" assert get_storage_type(kwargs["root"]) == "local", ( # noqa: S101 "root must be a local path if host is set" ) assert "region" not in kwargs, "region must not be set if host is set" # noqa: S101 kwargs["region"] = kwargs.pop("host") storage_record = Storage.filter( root=kwargs["root"], region=kwargs["region"] ).one_or_none() else: storage_record = Storage.filter(root=kwargs["root"]).one_or_none() space = kwargs.get("space", None) if storage_record is not None: from .sqlrecord import init_self_from_db init_self_from_db(self, storage_record) self._old_space_id = self.space_id return None skip_mark_storage_root = kwargs.pop("skip_mark_storage_root", False) skip_preparation = kwargs.pop("_skip_preparation", False) if skip_preparation: assert space is None, "`space` must not be set if _skip_preparation is True" # noqa: S101 super().__init__(*args, **kwargs) return None space_uuid = None if space is not None: hub_space_record = select_space(space.uid) if hub_space_record is None: raise ValueError( "Please first create a space on the hub: https://docs.lamin.ai/access" ) space_uuid = UUID(hub_space_record["id"]) # instance_id won't take effect if # - there is no write access # - the storage location is already managed by another instance ssettings, _ = init_storage( kwargs["root"], instance_id=setup_settings.instance._id, instance_slug=setup_settings.instance.slug, register_hub=setup_settings.instance.is_on_hub, region=kwargs.get("region", None), # host was renamed to region already space_uuid=space_uuid, skip_mark_storage_root=skip_mark_storage_root, ) # ssettings performed validation and normalization of the root path kwargs["root"] = ssettings.root_as_str # noqa: S101 if "instance_uid" in kwargs: assert kwargs["instance_uid"] == ssettings.instance_uid # noqa: S101 else: kwargs["instance_uid"] = ssettings.instance_uid if ssettings._uid is not None: # need private attribute here kwargs["uid"] = ssettings._uid if "type" not in kwargs: kwargs["type"] = ssettings.type else: assert kwargs["type"] == ssettings.type # noqa: S101 if "region" in kwargs: assert kwargs["region"] == ssettings.region # noqa: S101 else: kwargs["region"] = ssettings.region is_managed_by_current_instance = ( ssettings.instance_uid == setup_settings.instance.uid ) if ssettings.instance_uid is not None and not is_managed_by_current_instance: is_managed_by_instance = ( f", is managed by instance with uid {ssettings.instance_uid}" ) else: is_managed_by_instance = "" hub_message = "" if setup_settings.instance.is_on_hub and is_managed_by_current_instance: instance_owner = setup_settings.instance.owner ui_url = setup_settings.instance.ui_url hub_message = f", see: {ui_url}/{instance_owner}/infrastructure" managed_message = ( "created managed" if is_managed_by_current_instance else "referenced read-only" ) logger.important( f"{managed_message} storage location at {kwargs['root']}{is_managed_by_instance}{hub_message}" ) super().__init__(**kwargs) self._old_space_id = self.space_id @property def host(self) -> str | None: """Host identifier for local storage locations. Is `None` for locations with `type != "local"`. A globally unique user-defined host identifier (cluster, server, laptop, etc.). """ if self.type != "local": return None return self.region @property def path(self) -> UPath: """Path. 
Uses the `.root` field and converts it into a `Path` or `UPath`. """ access_token = self._access_token if hasattr(self, "_access_token") else None return create_path(self.root, access_token=access_token) def save(self, *args, **kwargs): """Save the storage record.""" if hasattr(self, "_old_space_id") and self._old_space_id != self.space_id: update_storage_with_space(storage_lnid=self.uid, space_lnid=self.space.uid) super().save(*args, **kwargs) return self def delete(self, permanent: bool | None = None) -> None: # type: ignore # type ignore is there because we don't use a trash here unlike everywhere else """Delete the storage location. This errors in case the storage location is not empty. Unlike other `SQLRecord`-based registries, this does *not* move the storage record into the trash. Args: permanent: `False` raises an error, as soft delete is impossible. """ from .. import settings if permanent is False: raise ValueError( "Soft delete is not possible for Storage, " "use 'permanent=True' or 'permanent=None' for permanent deletion." ) assert not self.artifacts.exists(), ( "Cannot delete storage with artifacts in current instance." ) # noqa: S101 # the simple case of a read-only storage location if self.instance_uid != setup_settings.instance.uid: super(SQLRecord, self).delete() return None # now the complicated case of a written/managed storage location check_storage_is_empty(self.path) assert settings.storage.root_as_str != self.root, ( # noqa: S101 "Cannot delete the current storage location, switch to another." ) if setup_settings.user.handle != "anonymous": # only attempt if authenticated storage_records = get_storage_records_for_instance( # only query those storage records on the hub that are managed by the current instance setup_settings.instance._id ) for storage_record in storage_records: if storage_record["lnid"] == self.uid: assert storage_record["is_default"] in {False, None}, ( # noqa: S101 "Cannot delete default storage of instance." 
) delete_storage_record(storage_record) ssettings = StorageSettings(self.root) if ssettings._mark_storage_root.exists(): ssettings._mark_storage_root.unlink( missing_ok=True # this is totally weird, but needed on Py3.11 ) super(SQLRecord, self).delete() ================================================ FILE: lamindb/models/transform.py ================================================ from __future__ import annotations import warnings from typing import TYPE_CHECKING, overload from django.db import models from django.db.models import CASCADE, PROTECT, Q from lamin_utils import logger from lamindb_setup.core.hashing import HASH_LENGTH, hash_file, hash_string from lamindb.base import deprecated from lamindb.base.fields import ( CharField, DateTimeField, ForeignKey, TextField, ) from lamindb.base.users import current_user_id from .._secret_redaction import redact_secrets_in_source_code from ..models._is_versioned import process_revises from ._is_versioned import IsVersioned, _adjust_is_latest_when_deleting_is_versioned from .run import Run, User from .sqlrecord import ( BaseSQLRecord, IsLink, SQLRecord, init_self_from_db, update_attributes, ) if TYPE_CHECKING: from datetime import datetime from pathlib import Path from lamindb.base.types import TransformKind from .artifact import Artifact from .block import TransformBlock from .project import Project, Reference from .query_manager import RelatedManager from .query_set import QuerySet from .record import Record from .ulabel import ULabel # does not inherit from TracksRun because the Transform # is needed to define a run class Transform(SQLRecord, IsVersioned): """Data transformations such as scripts, notebooks, functions, or pipelines. A `transform` can be a function, a script, a notebook, or a pipeline. If you execute a transform, you generate a run (:class:`~lamindb.Run`). A run has inputs and outputs. Pipelines are typically created with a workflow manager (Nextflow, Snakemake, Prefect, Flyte, Dagster, redun, Airflow, ...). Transforms are versioned so that a given transform version maps on a given source code version. .. dropdown:: Can I sync transforms to git? If you set the environment variable `LAMINDB_SYNC_GIT_REPO` or set `ln.settings.sync_git_repo`, a script-like transform is synced to its hashed state in a git repository upon calling `ln.track()`:: ln.settings.sync_git_repo = "https://github.com/laminlabs/lamindb" ln.track() If the hash isn't found in the git repository, an error is thrown. You can also create transforms that map pipelines via `Transform.from_git()`. The definition of transforms and runs is consistent with the OpenLineage specification where a `transform` would be called a "job" and a `run` a "run". Args: key: `str | None = None` A short name or path-like semantic key. kind: `TransformKind | None = "pipeline"` See :class:`~lamindb.base.types.TransformKind`. version: `str | None = None` A version string. description: `str | None = None` A description. reference: `str | None = None` A reference, e.g., a URL. reference_type: `str | None = None` A reference type, e.g., 'url'. source_code: `str | None = None` Source code of the transform. revises: `Transform | None = None` An old version of the transform. skip_hash_lookup: `bool = False` Skip the hash lookup so that a new transform is created even if a transform with the same hash already exists. See Also: :func:`~lamindb.track` Track a script or notebook run. :class:`~lamindb.Run` Executions of transforms. 
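    A new version within an existing transform family can be created by passing ``revises`` (a sketch; key and version strings are illustrative)::

        v1 = ln.Transform(key="my-pipeline", version="1.0.0", kind="pipeline").save()
        v2 = ln.Transform(key="my-pipeline", version="2.0.0", kind="pipeline", revises=v1).save()
        # v2 now is the latest version in the "my-pipeline" family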
Notes: - :doc:`docs:track` - :doc:`docs:redun` - :doc:`docs:nextflow` - :doc:`docs:snakemake` Examples: Create a transform by running `ln.track()` in a notebook or a script:: ln.track() Create a transform for a standalone function that acts as its own workflow:: @ln.flow() def my_workflow(): print("Hello, world!") Create a transform for a step in a workflow:: @ln.step() def my_step(): print("One step!") Create a transform for a pipeline:: transform = ln.Transform(key="Cell Ranger", version="7.2.0", kind="pipeline").save() Create a transform by saving a Python or shell script or a notebook via the CLI:: lamin save my_script.py lamin save my_script.sh lamin save my_notebook.ipynb """ class Meta(SQLRecord.Meta, IsVersioned.Meta): abstract = False app_label = "lamindb" unique_together = ("key", "hash") _len_stem_uid: int = 12 _len_full_uid: int = 16 _name_field: str = "key" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=_len_full_uid ) """Universal id.""" # the max length equals the max length of an S3 key & the artifact key key: str = CharField(db_index=True, max_length=1024) """A name or "/"-separated path-like string. All transforms with the same key are part of the same version family. """ # db_index on description because sometimes we query for equality in the case of artifacts description: str | None = TextField(null=True, db_index=True) """A description.""" kind: TransformKind = CharField( max_length=20, db_index=True, default="pipeline", ) """A string indicating the kind of transform (default `"pipeline"`). One of `"pipeline"`, `"notebook"`, `"script"`, or `"function"`. """ source_code: str | None = TextField(null=True) """Source code of the transform.""" hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True) """Hash of the source code.""" reference: str | None = CharField(max_length=255, db_index=True, null=True) """Reference for the transform, e.g., a URL.""" reference_type: str | None = CharField(max_length=25, db_index=True, null=True) """Reference type of the transform, e.g., 'url'.""" environment: Artifact | None = models.ForeignKey( "Artifact", CASCADE, null=True, related_name="_environment_of_transforms" ) """An environment for executing the transform.""" plan: Artifact | None = models.ForeignKey( "Artifact", CASCADE, null=True, related_name="_plan_for_transforms", default=None, ) """An optional plan for executing this transform.""" runs: RelatedManager[Run] """Runs of this transform ← :attr:`~lamindb.Run.transform`.""" ulabels: RelatedManager[ULabel] = models.ManyToManyField( "ULabel", through="TransformULabel", related_name="transforms" ) """ULabel annotations of this transform ← :attr:`~lamindb.ULabel.transforms`.""" linked_in_records: RelatedManager[Record] = models.ManyToManyField( "Record", through="RecordTransform", related_name="linked_transforms" ) """This transform is linked in these records as a value ← :attr:`~lamindb.Record.linked_transforms`.""" records: RelatedManager[Record] """Records that annotate this transform ← :attr:`~lamindb.Record.transforms`.""" predecessors: RelatedManager[Transform] = models.ManyToManyField( "self", through="TransformTransform", symmetrical=False, related_name="successors", ) """Preceding transforms ← :attr:`~lamindb.Transform.successors`.""" successors: RelatedManager[Transform] """Subsequent transforms ← :attr:`~lamindb.Transform.predecessors`. Allows defining succeeding transforms. 
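    For example, a predecessor/successor pair can be declared explicitly (a sketch; that ``.add()`` is the intended way to link them is an assumption)::

        preprocess = ln.Transform(key="preprocess", kind="pipeline").save()
        train = ln.Transform(key="train", kind="pipeline").save()
        train.predecessors.add(preprocess)  # 'train' now lists 'preprocess' among its predecessors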
Is *not* necessary for data lineage, which is tracked automatically whenever an artifact or collection serves as an input for a run. """ projects: RelatedManager[Project] """Linked projects ← :attr:`~lamindb.Project.transforms`.""" references: RelatedManager[Reference] """Linked references ← :attr:`~lamindb.Reference.transforms`.""" created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" updated_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of last update to record.""" created_by: User = ForeignKey( User, PROTECT, default=current_user_id, related_name="created_transforms" ) """Creator of record ← :attr:`~lamindb.User.created_transforms`.""" ablocks: RelatedManager[TransformBlock] """Attached blocks ← :attr:`~lamindb.TransformBlock.transform`.""" @overload def __init__( self, key: str | None = None, kind: TransformKind | None = None, version: str | None = None, description: str | None = None, reference: str | None = None, reference_type: str | None = None, source_code: str | None = None, revises: Transform | None = None, skip_hash_lookup: bool = False, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None if args: raise ValueError( "Please only use keyword arguments to construct a Transform" ) key: str | None = kwargs.pop("key", None) description: str | None = kwargs.pop("description", None) revises: Transform | None = kwargs.pop("revises", None) version_tag: str | None = kwargs.pop("version_tag", kwargs.pop("version", None)) kind: TransformKind | None = kwargs.pop("kind", None) type: TransformKind | None = kwargs.pop("type", None) if type is not None: warnings.warn( "`type` argument of transform was renamed to `kind` and will be removed in a future release.", DeprecationWarning, stacklevel=2, ) kind = kind if kind is not None else (type if type is not None else "pipeline") reference: str | None = kwargs.pop("reference", None) reference_type: str | None = kwargs.pop("reference_type", None) branch = kwargs.pop("branch", None) branch_id = kwargs.pop("branch_id", 1) space = kwargs.pop("space", None) space_id = kwargs.pop("space_id", 1) skip_hash_lookup: bool = kwargs.pop("skip_hash_lookup", False) using_key = kwargs.pop("using_key", None) # below is internal use that we'll hopefully be able to eliminate uid: str | None = kwargs.pop("uid") if "uid" in kwargs else None source_code: str | None = ( kwargs.pop("source_code") if "source_code" in kwargs else None ) if not len(kwargs) == 0: raise ValueError( "Only key, description, version, kind, type, revises, reference, " f"reference_type can be passed, but you passed: {kwargs}" ) if revises is None: # need to check uid before checking key if uid is not None: revises = ( Transform.objects.using(using_key) .filter(uid__startswith=uid[:-4], is_latest=True) .order_by("-created_at") .first() ) elif key is not None: candidate_for_revises = ( Transform.objects.using(using_key) .filter(~Q(branch_id=-1), key=key, is_latest=True) .order_by("-created_at") .first() ) if candidate_for_revises is not None: revises = candidate_for_revises if candidate_for_revises.source_code is None: # no source code was yet saved, return the same transform logger.important( "no source code was yet saved, returning existing transform with same key" ) uid = revises.uid if revises is not None and uid is not 
None and uid == revises.uid: if revises.key != key: logger.warning("ignoring inconsistent key") init_self_from_db(self, revises) update_attributes(self, {"description": description}) return None if revises is not None and key is not None and revises.key != key: logger.important(f"renaming transform {revises.key} to {key}") new_uid, version_tag, key, description, revises = process_revises( revises, version_tag, key, description, Transform ) # this is only because the user-facing constructor allows passing a uid # most others don't if uid is None: has_consciously_provided_uid = False uid = new_uid else: has_consciously_provided_uid = True hash = None if source_code is not None and not skip_hash_lookup: hash = hash_string(source_code) transform_candidate = Transform.objects.filter( ~Q(branch_id=-1), hash=hash, is_latest=True, ).first() if transform_candidate is not None: init_self_from_db(self, transform_candidate) update_attributes(self, {"description": description}) if key is not None and transform_candidate.key != key: logger.warning( f"key {self.key} on existing transform differs from passed key {key}, keeping original key; update manually if needed or pass skip_hash_lookup if you want to duplicate the transform" ) return None super().__init__( # type: ignore uid=uid, description=description, key=key, kind=kind, version_tag=version_tag, reference=reference, reference_type=reference_type, source_code=source_code, hash=hash, _has_consciously_provided_uid=has_consciously_provided_uid, revises=revises, branch=branch, branch_id=branch_id, space=space, space_id=space_id, ) @classmethod def from_git( cls, url: str, path: str, key: str | None = None, version: str | None = None, entrypoint: str | None = None, branch: str | None = None, description: str | None = None, skip_hash_lookup: bool = False, ) -> Transform: """Create a transform from a path in a git repository. Args: url: URL of the git repository. path: Path to the file within the repository. key: Optional key for the transform. version: Optional version tag to checkout in the repository. entrypoint: One or several optional comma-separated entrypoints for the transform. branch: Optional branch to checkout. description: Optional description for the transform. skip_hash_lookup: Skip the hash lookup so that a new transform is created even if a transform with the same hash already exists. Examples: Create from a Nextflow repo and auto-infer the commit hash from its latest version:: transform = ln.Transform.from_git( url="https://github.com/openproblems-bio/task_batch_integration", path="main.nf" ).save() Create from a Nextflow repo and checkout a specific version:: transform = ln.Transform.from_git( url="https://github.com/openproblems-bio/task_batch_integration", path="main.nf", version="v2.0.0" ).save() assert transform.version_tag == "v2.0.0" Create a *sliding transform* from a Nextflow repo's `dev` branch. 
Unlike a regular transform, a sliding transform doesn't pin a specific source code state, but adapts to whatever the referenced state on the branch is:: transform = ln.Transform.from_git( url="https://github.com/openproblems-bio/task_batch_integration", path="main.nf", branch="dev", version="dev", ).save() Notes: A regular transform pins a specific source code state through its commit hash:: transform.source_code #> repo: https://github.com/openproblems-bio/task_batch_integration #> path: main.nf #> commit: 68eb2ecc52990617dbb6d1bb5c7158d9893796bb A sliding transform infers the source code state from a branch:: transform.source_code #> repo: https://github.com/openproblems-bio/task_batch_integration #> path: main.nf #> branch: dev If an entrypoint is provided, it is added to the source code below the path, e.g.:: transform.source_code #> repo: https://github.com/openproblems-bio/task_batch_integration #> path: main.nf #> entrypoint: myentrypoint #> commit: 68eb2ecc52990617dbb6d1bb5c7158d9893796bb Note that you can pass a comma-separated list of entrypoints to the `entrypoint` argument. """ from ..core._sync_git import get_and_validate_git_metadata url, commit_hash = get_and_validate_git_metadata(url, path, version, branch) if key is None: key = ( url.split("/")[-2] + "/" + url.split("/")[-1].replace(".git", "") + "/" + path ) logger.important(f"inferred key '{key}' from url & path") source_code = f"repo: {url}\npath: {path}" if entrypoint is not None: source_code += f"\nentrypoint: {entrypoint}" if branch is not None and version == branch: from urllib.parse import quote # sliding transform, no defined source code state source_code += f"\nbranch: {branch}" reference, reference_type = ( f"{url}/tree/{quote(branch, safe='')}/{path}", "url", ) else: # regular transform, defined source code state source_code += f"\ncommit: {commit_hash}" reference, reference_type = f"{url}/blob/{commit_hash}/{path}", "url" return Transform( key=key, kind="pipeline", version=version, description=description, reference=reference, reference_type=reference_type, source_code=source_code, skip_hash_lookup=skip_hash_lookup, ) @property def latest_run(self) -> Run: """The latest run of this transform.""" return self.runs.order_by("-started_at").first() @property @deprecated(new_name="kind") def type(self) -> TransformKind: return self.kind @type.setter def type(self, value: TransformKind): self.kind = value def view_lineage(self, with_successors: bool = False, distance: int = 5): """View lineage of transforms. Note that this only accounts for manually defined predecessors and successors. Auto-generate lineage through inputs and outputs of runs is not included. """ from .has_parents import view_parents return view_parents( record=self, field="key", with_children=with_successors, distance=distance, attr_name="predecessors", ) def _update_source_code_from_path(self, source_code_path: Path) -> None | str: _, transform_hash, _ = hash_file(source_code_path) # ignore hash_type for now source_code = source_code_path.read_text() source_code_to_store, redaction_count = redact_secrets_in_source_code( source_code ) if redaction_count > 0: logger.warning( f"redacted {redaction_count} secret-looking assignment(s) before persisting transform source code" ) if self.hash is not None: # check if the hash of the transform source code matches if transform_hash != self.hash: response = input( f"You are about to overwrite existing source code (hash '{self.hash}') for Transform('{self.uid}')." f" Proceed? 
(y/n) " ) if response == "y": self.source_code = source_code_to_store self.hash = transform_hash else: logger.warning("Please re-run `ln.track()` to make a new version") return "rerun-the-notebook" else: logger.debug("source code is already saved") else: self.source_code = source_code_to_store self.hash = transform_hash return None def _permanent_delete_transforms(transforms: Transform | QuerySet) -> None: """Execute bulk DELETE on transforms (runs, then transforms). Used by QuerySet and single-transform paths.""" from django.db.models import QuerySet as DjangoQuerySet from .project import TransformProject if isinstance(transforms, Transform): db = transforms._state.db or "default" qs = Transform.objects.using(db).filter(pk=transforms.pk) else: db = transforms.db or "default" qs = transforms objects = list(qs) if not objects: return _adjust_is_latest_when_deleting_is_versioned(objects) transform_ids = [o.pk for o in objects] TransformProject.objects.using(db).filter(transform_id__in=transform_ids).delete() Run.objects.using(db).filter(transform_id__in=transform_ids).delete(permanent=True) DjangoQuerySet.delete(qs) class TransformTransform(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) successor: Transform = ForeignKey( "Transform", CASCADE, related_name="links_predecessor" ) predecessor: Transform = ForeignKey( "Transform", CASCADE, related_name="links_successor" ) config: dict | None = models.JSONField(default=None, null=True) created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now() ) created_by: User = ForeignKey( "lamindb.User", PROTECT, default=current_user_id, related_name="+" ) class Meta: app_label = "lamindb" unique_together = ("successor", "predecessor") ================================================ FILE: lamindb/models/ulabel.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING, overload import pgtrigger from django.conf import settings as django_settings from django.db import models from django.db.models import CASCADE, PROTECT from lamindb.base.fields import ( CharField, DateTimeField, ForeignKey, TextField, ) from lamindb.errors import FieldValidationError from ..base.uids import base62_8 from .can_curate import CanCurate from .feature import Feature from .has_parents import HasParents, _query_relatives from .run import Run, TracksRun, TracksUpdates, User, current_user_id from .sqlrecord import BaseSQLRecord, HasType, IsLink, SQLRecord, _get_record_kwargs from .transform import Transform if TYPE_CHECKING: from datetime import datetime from .artifact import Artifact from .block import ULabelBlock from .collection import Collection from .project import Project from .query_manager import RelatedManager from .query_set import QuerySet from .record import Record from .sqlrecord import Branch class ULabel(SQLRecord, HasType, HasParents, CanCurate, TracksRun, TracksUpdates): """Universal labels. It behaves like `Record`, just without the ability to link features. Args: name: `str` A name. description: `str | None = None` A description. reference: `str | None = None` For instance, an external ID or a URL. reference_type: `str | None = None` For instance, `"url"`. See Also: :class:`~lamindb.Record` Like `ULabel`, but with the ability to link features. 
Examples: Create a label and annotate an :class:`~lamindb.Artifact`:: train_split = ln.ULabel(name="train").save() artifact.ulabels.add(train_split) Query artifacts by label:: ln.Artifact.filter(ulabels=train_split).to_dataframe() Organize ulabels in a type hierarchy, based on the `type` field:: split_type = ln.ULabel(name="Split", is_type=True).save() train_split = ln.ULabel(name="train", type=split_type).save() The `type` hierarchy gives rise to a tree. If you need to model a full DAG-like **ontology**, use the `parents`/`children` fields:: cell_type = ln.Record(name="CellType", is_type=True).save() t_cell = ln.Record(name="T Cell", type=cell_type).save() cd4_t_cell = ln.Record(name="CD4+ T Cell", type=cell_type).save() t_cell.children.add(cd4_t_cell) If you work with basic biological entities like cell lines, cell types, tissues, consider building on the public biological ontologies in :mod:`bionty`, which work in the same way. """ class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta): abstract = False app_label = "lamindb" if ( django_settings.DATABASES.get("default", {}).get("ENGINE") == "django.db.backends.postgresql" ): triggers = [ pgtrigger.Trigger( name="prevent_ulabel_type_cycle", operation=pgtrigger.Update | pgtrigger.Insert, when=pgtrigger.Before, condition=pgtrigger.Condition("NEW.type_id IS NOT NULL"), func=""" -- Check for direct self-reference IF NEW.type_id = NEW.id THEN RAISE EXCEPTION 'Cannot set type: ulabel cannot be its own type'; END IF; -- Check for cycles in the type chain IF EXISTS ( WITH RECURSIVE type_chain AS ( SELECT type_id, 1 as depth FROM lamindb_ulabel WHERE id = NEW.type_id UNION ALL SELECT r.type_id, tc.depth + 1 FROM lamindb_ulabel r INNER JOIN type_chain tc ON r.id = tc.type_id WHERE tc.depth < 100 ) SELECT 1 FROM type_chain WHERE type_id = NEW.id ) THEN RAISE EXCEPTION 'Cannot set type: would create a cycle'; END IF; RETURN NEW; """, ), ] # also see raw SQL constraints for `is_type` and `type` FK validity in migrations _name_field: str = "name" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField( editable=False, unique=True, db_index=True, max_length=8, default=base62_8 ) """A universal random id, valid across DB instances.""" name: str = CharField(max_length=150, db_index=True) """Name or title of ulabel.""" type: ULabel | None = ForeignKey("self", PROTECT, null=True, related_name="ulabels") """Type of ulabel, e.g., `"donor"`, `"split"`, etc. ← :attr:`~lamindb.ULabel.ulabels` Allows to group ulabels by type, e.g., all donors, all split ulabels, etc. """ ulabels: RelatedManager[ULabel] """ULabels of this type (can only be non-empty if `is_type` is `True`).""" description: str | None = TextField(null=True) """A description.""" reference: str | None = CharField(max_length=255, db_index=True, null=True) """A simple reference like URL or external ID.""" reference_type: str | None = CharField(max_length=25, db_index=True, null=True) """Type of simple reference.""" parents: RelatedManager[ULabel] = models.ManyToManyField( "self", symmetrical=False, related_name="children" ) """Parent entities of this ulabel ← :attr:`~lamindb.ULabel.children`. For advanced use cases, you can build an ontology under a given `type`. Say, if you modeled `CellType` as a `ULabel`, you would introduce a type `CellType` and model the hierarchy of cell types under it. """ children: RelatedManager[ULabel] """Child entities of this ulabel. Reverse accessor for parents.
""" transforms: RelatedManager[Transform] """The transforms annotated by this ulabel ← :attr:`~lamindb.Transform.ulabels`.""" runs: RelatedManager[Run] """The runs annotated by this ulabel ← :attr:`~lamindb.Run.ulabels`.""" artifacts: RelatedManager[Artifact] = models.ManyToManyField( "Artifact", through="ArtifactULabel", related_name="ulabels" ) """The artifacts annotated by this ulabel ← :attr:`~lamindb.Artifact.ulabels`.""" collections: RelatedManager[Collection] """The collections annotated by this ulabel ← :attr:`~lamindb.Collection.ulabels`.""" projects: RelatedManager[Project] """The projects annotating this ulabel ← :attr:`~lamindb.Project.ulabels`.""" branches: RelatedManager[Branch] """The branches annotated by this ulabel ← :attr:`~lamindb.Branch.ulabels`.""" linked_in_records: RelatedManager[Record] = models.ManyToManyField( "Record", through="RecordULabel", related_name="linked_ulabels", ) """Records linking this ulabel as a value ← :attr:`~lamindb.Record.linked_ulabels`.""" ablocks: RelatedManager[ULabelBlock] """Attached blocks ← :attr:`~lamindb.ULabelBlock.ulabel`.""" @overload def __init__( self, name: str, type: ULabel | None = None, is_type: bool = False, description: str | None = None, reference: str | None = None, reference_type: str | None = None, ): ... @overload def __init__( self, *db_args, ): ... def __init__( self, *args, **kwargs, ): if len(args) == len(self._meta.concrete_fields): super().__init__(*args, **kwargs) return None if len(args) > 0: raise ValueError("Only one non-keyword arg allowed") name: str = kwargs.pop("name", None) type: str | None = kwargs.pop("type", None) is_type: bool = kwargs.pop("is_type", False) description: str | None = kwargs.pop("description", None) reference: str | None = kwargs.pop("reference", None) reference_type: str | None = kwargs.pop("reference_type", None) branch = kwargs.pop("branch", None) branch_id = kwargs.pop("branch_id", 1) space = kwargs.pop("space", None) space_id = kwargs.pop("space_id", 1) _skip_validation = kwargs.pop("_skip_validation", False) _aux = kwargs.pop("_aux", None) if len(kwargs) > 0: valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(ULabel)]) raise FieldValidationError( f"Only {valid_keywords} are valid keyword arguments" ) super().__init__( name=name, type=type, is_type=is_type, description=description, reference=reference, reference_type=reference_type, branch=branch, branch_id=branch_id, space=space, space_id=space_id, _skip_validation=_skip_validation, _aux=_aux, ) def query_ulabels(self) -> QuerySet: """Query ulabels of sub types. While `.ulabels` retrieves the ulabels with the current type, this method also retrieves sub types and the ulabels with sub types of the current type. 
""" return _query_relatives([self], "ulabels") # type: ignore class ArtifactULabel(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey("Artifact", CASCADE, related_name="links_ulabel") ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_artifact") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, related_name="links_artifactulabel", default=None ) class Meta: # can have the same label linked to the same artifact if the feature is # different app_label = "lamindb" unique_together = ("artifact", "ulabel", "feature") class TransformULabel(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) transform: Transform = ForeignKey(Transform, CASCADE, related_name="links_ulabel") ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_transform") class Meta: app_label = "lamindb" unique_together = ("transform", "ulabel") class RunULabel(BaseSQLRecord, IsLink): id: int = models.BigAutoField(primary_key=True) run: Run = ForeignKey(Run, CASCADE, related_name="links_ulabel") ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_run") created_at: datetime = DateTimeField( editable=False, db_default=models.functions.Now(), db_index=True ) """Time of creation of record.""" created_by: User = ForeignKey( "lamindb.User", PROTECT, default=current_user_id, related_name="+" ) """Creator of record.""" class Meta: app_label = "lamindb" unique_together = ("run", "ulabel") class BranchULabel(BaseSQLRecord, IsLink): """Link model for branch–ulabel association.""" id: int = models.BigAutoField(primary_key=True) branch: Branch = ForeignKey("Branch", CASCADE, related_name="links_ulabel") ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_branch") class Meta: app_label = "lamindb" unique_together = ("branch", "ulabel") class CollectionULabel(BaseSQLRecord, IsLink, TracksRun): id: int = models.BigAutoField(primary_key=True) collection: Collection = ForeignKey( "Collection", CASCADE, related_name="links_ulabel" ) ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_collection") feature: Feature | None = ForeignKey( Feature, PROTECT, null=True, related_name="links_collectionulabel", default=None ) class Meta: app_label = "lamindb" unique_together = ("collection", "ulabel") ================================================ FILE: lamindb/py.typed ================================================ ================================================ FILE: lamindb/setup/__init__.py ================================================ import lamindb_setup as _lamindb_setup from lamindb_setup import * # noqa: F403 from lamindb_setup import ( connect, delete, init, settings, ) from . import core, errors, types from ._merge import merge # noqa: F401 from ._switch import switch # noqa: F401 del connect # we have this at the root level, hence, we don't want it here __doc__ = _lamindb_setup.__doc__.replace("lamindb_setup", "lamindb.setup") settings.__doc__ = settings.__doc__.replace("lamindb_setup", "lamindb.setup") ================================================ FILE: lamindb/setup/_merge.py ================================================ # Tested in lamin-cli (tests/core/test_create_switch_delete_list_settings.py::test_merge*). 
from __future__ import annotations from typing import TYPE_CHECKING import lamindb_setup as ln_setup from django.apps import apps from django.db import connection from django.db.utils import DatabaseError from lamin_utils import logger if TYPE_CHECKING: from lamindb.models import Branch def merge(branch: str | Branch) -> None: """Merge a branch into the current branch. All `SQLRecord` objects that have `branch_id` equal to the source branch's id are updated to the current branch's id. Find more info in the :class:`~lamindb.Branch` document. Args: branch: The source branch to merge from. Accepts a `name`, a `uid`, or the `Branch` object. Raises: DoesNotExist: If the branch does not exist. """ from lamindb import Branch, Q from lamindb.errors import ObjectDoesNotExist from ..models import SQLRecord from ..models._is_versioned import IsVersioned, reconcile_is_latest_within_branch from ..models.sqlrecord import BRANCH_SENSITIVE_BLOCK_MODEL_NAMES if isinstance(branch, Branch): source = branch if source._state.adding: raise ObjectDoesNotExist("Branch must be saved.") else: source = Branch.filter(Q(name=branch) | Q(uid=branch)).one_or_none() if source is None: raise ObjectDoesNotExist(f"Branch '{branch}' not found.") current = ln_setup.settings.branch if current.id == source.id: logger.important("already on branch, nothing to merge") return sqlrecord_models = [ m for m in apps.get_models() if issubclass(m, SQLRecord) and not m._meta.abstract ] attached_block_models = [ model for model_name in sorted(BRANCH_SENSITIVE_BLOCK_MODEL_NAMES) if (model := apps.get_model("lamindb", model_name)) is not None ] models = list(dict.fromkeys([*sqlrecord_models, *attached_block_models])) if not models: return vendor = connection.vendor quoted_tables = [connection.ops.quote_name(m._meta.db_table) for m in models] with connection.cursor() as cursor: if vendor == "postgresql": # Single round-trip: one multi-statement execute statements = [ f"UPDATE {tbl} SET branch_id = %s WHERE branch_id = %s" for tbl in quoted_tables ] sql = "BEGIN; " + "; ".join(statements) + "; COMMIT;" params = [current.id, source.id] * len(quoted_tables) try: cursor.execute(sql, params) except DatabaseError as e: logger.error(f"Merge failed: {e}") raise else: # SQLite: execute() runs only the first statement; run each UPDATE # in a loop (same connection, so still one transaction if we're inside # a transaction or use autocommit-off). from django.db import transaction with transaction.atomic(): for tbl in quoted_tables: # Django uses %s; SQLite backend converts to ? cursor.execute( f"UPDATE {tbl} SET branch_id = %s WHERE branch_id = %s", [current.id, source.id], ) versioned_models = [m for m in models if issubclass(m, IsVersioned)] for model in versioned_models: reconcile_is_latest_within_branch(model, branch_id=current.id) source._status_code = -1 # merged source.save(update_fields=["_status_code"]) logger.important(f"merged branch '{source.name}' into '{current.name}'") ================================================ FILE: lamindb/setup/_switch.py ================================================ from __future__ import annotations from typing import TYPE_CHECKING from lamin_utils import logger from lamindb_setup import settings if TYPE_CHECKING: from lamindb.models import Branch def switch(target: str | Branch, *, space: bool = False, create: bool = False): """Switch to a branch or space, create if not exists. Args: target: Branch target or space target to switch to. space: If True, switch space; otherwise switch branch. 
create: If True and switching branch, create the branch if it does not exist. """ if space: settings.space = target else: if create: from lamindb import Branch, Q from lamindb.errors import BranchAlreadyExists # Consistent with git switch -c: error if branch already exists. existing = Branch.filter(Q(name=target) | Q(uid=target)).one_or_none() if existing is not None: raise BranchAlreadyExists( f"Branch '{target}' already exists. Omit -c/--create to switch to it." ) Branch(name=target).save() logger.important(f"created branch: {target}") settings.branch = target logger.important(f"switched to {target}") ================================================ FILE: lamindb/setup/core/__init__.py ================================================ import lamindb_setup as _lamindb_setup from lamindb_setup.core import * # noqa: F403 __doc__ = _lamindb_setup.core.__doc__.replace("lamindb_setup", "lamindb.setup") ================================================ FILE: lamindb/setup/errors/__init__.py ================================================ import lamindb_setup as _lamindb_setup from lamindb_setup.errors import * # noqa: F403 __doc__ = _lamindb_setup.errors.__doc__.replace("lamindb_setup", "lamindb.setup") ================================================ FILE: lamindb/setup/types/__init__.py ================================================ import lamindb_setup as _lamindb_setup from lamindb_setup.types import * # noqa: F403 __doc__ = _lamindb_setup.types.__doc__.replace("lamindb_setup", "lamindb.setup") ================================================ FILE: lamindb_full.py ================================================ """Full/meta-package module for the `lamindb` distribution.""" from __future__ import annotations import re from pathlib import Path _INIT_FILE = Path(__file__).parent / "lamindb" / "__init__.py" _MATCH = re.search(r'__version__\s*=\s*"([^"]+)"', _INIT_FILE.read_text()) if _MATCH is None: raise RuntimeError(f"Could not parse __version__ from {_INIT_FILE}") __version__ = _MATCH.group(1) ================================================ FILE: noxfile.py ================================================ import os import shutil from pathlib import Path import nox from laminci import convert_executable_md_files, upload_docs_artifact from laminci.nox import ( build_docs, login_testuser1, login_testuser2, run, run_pre_commit, ) # we'd like to aggregate coverage information across sessions # and for this the code needs to be located in the same # directory in every github action runner # this also allows to break out an installation section nox.options.default_venv_backend = "none" IS_PR = os.getenv("GITHUB_EVENT_NAME") != "push" CI = os.environ.get("CI") # SpatialData.write() regression with ome-zarr>=0.14: # https://github.com/scverse/spatialdata/issues/1090 SPATIALDATA_OME_ZARR_CONSTRAINT = "ome-zarr<0.14.0" GROUPS = {} GROUPS["tutorial"] = [ "README.ipynb", "sync.ipynb", "arrays.ipynb", "registries.ipynb", ] GROUPS["guide"] = [ "track.ipynb", ] GROUPS["tiledbsoma"] = [ "curate.ipynb", ] GROUPS["biology"] = [ "manage-ontologies.ipynb", ] @nox.session def lint(session: nox.Session) -> None: run_pre_commit(session) @nox.session def install(session): base_deps = [ "./sub/lamin-cli", "./sub/lamindb-setup", "./sub/bionty", ] top_deps = [ ".[full,dev]", ] cmds = [ f"uv pip install {'--system' if CI else ''} --no-cache-dir {' '.join(base_deps)}", ] + [ f"uv pip install {'--system' if CI else ''} --no-cache-dir -e {dep}" for dep in top_deps ] [run(session, line) for line in cmds] @nox.session 
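# the CI install groups below largely mirror the parametrized `test` session further down;
# each group installs only the extras and pinned packages it needs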
@nox.parametrize( "group", [ "unit-core-sqlite", "unit-core-postgres", "unit-storage", "no-instance", "tutorial", "guide", "tiledbsoma", "biology", "faq", "storage", "curator", "integrations", "docs", "cli", "permissions", ], ) def install_ci(session, group): extras = "" if group in ["unit-core-sqlite", "unit-core-postgres"]: extras += "fcs" run(session, "uv pip install --system scanpy") run(session, "uv pip install --system mudata") # spatialdata dependency, specifying it here explicitly # otherwise there are problems with uv resolver run(session, "uv pip install --system xarray-dataclasses") run( session, f"uv pip install --system spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT}", ) elif group == "unit-storage": extras += "gcp" run(session, "uv pip install --system huggingface_hub") run(session, "uv pip install --system scanpy") run(session, "uv pip install --system polars") elif group == "tutorial": # anndata here to prevent installing older version on release run(session, "uv pip install --system huggingface_hub polars anndata==0.12.2") elif group == "guide": extras += "zarr_v2" run( session, f"uv pip install --system scanpy mudata spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT}", ) elif group == "tiledbsoma": extras += "zarr_v2" run( session, f"uv pip install --system scanpy mudata spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT} tiledbsoma", ) elif group == "biology": extras += "fcs" run(session, "uv pip install --system ipywidgets") elif group == "faq": extras += "zarr_v2" elif group == "storage": extras += "zarr_v2" run( session, "uv pip install --system --no-deps ./sub/pertdb", ) run(session, "uv pip install --system vitessce") elif group == "curator": run( session, "uv pip install --system --no-deps ./sub/pertdb", ) # spatialdata dependency, specifying it here explicitly # otherwise there are problems with uv resolver run(session, "uv pip install --system xarray-dataclasses") run( session, f"uv pip install --system spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT}", ) elif group == "integrations": run(session, "uv pip install --system lightning") elif group == "docs": extras += "zarr_v2" # spatialdata dependency, specifying it here explicitly # otherwise there are problems with uv resolver run(session, "uv pip install --system xarray-dataclasses") run( session, f"uv pip install --system mudata spatialdata {SPATIALDATA_OME_ZARR_CONSTRAINT} lightning", ) run( session, "uv pip install --system --no-deps ./sub/pertdb", ) elif group == "cli": pass elif group == "permissions": pass extras = "," + extras if extras != "" else extras run(session, f"uv pip install --system -e .[full,dev{extras}]") # on the release branch, do not use submodules but run with pypi install # only exception is the docs group which should always use the submodule # to push docs fixes fast # installing this after lamindb to be sure that these packages won't be reinstaled # during lamindb installation if IS_PR or group == "docs": run( session, "uv pip install --system ./sub/lamindb-setup ./sub/lamin-cli ./sub/bionty ./sub/pertdb", ) if group == "permissions": # have to install after lamindb installation # because lamindb downgrades django required by laminhub_rest cmds = "uv pip install --system ./laminhub/backend" cmds += "\nuv pip install --system ./laminhub/backend/utils" cmds += "\nuv pip install --system ./laminhub/backend/services/central" cmds += "\nuv pip install --system ./laminhub/backend/services/instancedb" cmds += "\nuv pip install --system ./laminhub/backend/services/aws" cmds += "\nuv pip install --system 
--no-deps ./laminhub/backend/services/instancedb/hubmodule" [run(session, line) for line in cmds.splitlines()] @nox.session def configure_coverage(session) -> None: """Write a coverage config file, adding extra patterns to omit.""" import tomlkit groups_str = session.posargs[0] # first positional argument print(groups_str) # for debugging # so that we don't change this away from string assert isinstance(groups_str, str) # noqa: S101 if "curator" not in groups_str and "tiledbsoma" not in groups_str: extra_omit_patterns = ["**/curators/*"] else: extra_omit_patterns = [] # Read patterns from pyproject.toml base_config_path = Path("pyproject.toml") with open(base_config_path) as f: config = tomlkit.load(f) # Update the omit patterns base_patterns = config["tool"]["coverage"]["run"]["omit"] all_patterns = base_patterns + extra_omit_patterns config["tool"]["coverage"]["run"]["omit"] = all_patterns # Write back to pyproject.toml with open(base_config_path, "w") as f: tomlkit.dump(config, f) print(base_config_path.read_text()) @nox.session def prepare(session): """Create executable files to run during a test session. Is not needed for unit tests! """ content = open("README.md").read() # cannot execute the flow after ln.track() was called content = content.replace(" create_fasta()", " pass") open("README_stripped.md", "w").write( "\n".join( line for line in content.split("\n") if not line.strip().startswith( ("accessor = artifact.open()", "ln.track(project=", "ln.Project(name=") ) ) ) os.system("jupytext README_stripped.md --to notebook --output ./docs/README.ipynb") convert_executable_md_files() os.system("cp ./tests/core/test_artifact_parquet.py ./docs/scripts/") os.system("cp ./lamindb/examples/schemas/define_valid_features.py ./docs/scripts/") os.system( "cp ./lamindb/examples/schemas/define_schema_anndata_ensembl_gene_ids_and_valid_features_in_obs.py ./docs/scripts/" ) os.system( "cp ./lamindb/examples/datasets/define_mini_immuno_features_labels.py ./docs/scripts/" ) os.system( "cp ./lamindb/examples/datasets/define_mini_immuno_schema_flexible.py ./docs/scripts/" ) os.system( "cp ./lamindb/examples/datasets/save_mini_immuno_datasets.py ./docs/scripts/" ) @nox.session @nox.parametrize( "group", [ "unit-core-sqlite", "unit-core-postgres", "unit-storage", "no-instance", "curator", "integrations", "tutorial", "guide", "tiledbsoma", "biology", "faq", "storage", "cli", "permissions", ], ) def test(session, group): # we likely don't need auth in many other groups, but have to carefully expand this if group not in {"curator", "no-instance"}: login_testuser2(session) login_testuser1(session) # this is mostly needed for the docs so that we don't render Django's entire public API run(session, "lamin settings set private-django-api true") coverage_args = "--cov=lamindb --cov-config=pyproject.toml --cov-append --cov-report=term-missing" duration_args = "--durations=10" env = os.environ.copy() if group == "unit-core-sqlite": env["LAMINDB_TEST_DB_VENDOR"] = "sqlite" run( session, f"pytest {coverage_args} ./tests/core {duration_args}", env=env, ) elif group == "unit-core-postgres": env["LAMINDB_TEST_DB_VENDOR"] = "postgresql" run( session, f"pytest {coverage_args} ./tests/core {duration_args}", env=env, ) elif group == "unit-storage": login_testuser2(session) # shouldn't be necessary but is for now run(session, f"pytest {coverage_args} ./tests/storage {duration_args}") elif group == "no-instance": run(session, "lamin disconnect") run(session, f"pytest {coverage_args} ./tests/no_instance {duration_args}") elif 
group == "tutorial": run(session, "lamin logout") run(session, "lamin init --storage ./test-readme --modules bionty") run( session, f"pytest -s {coverage_args} ./docs/test_notebooks.py::test_{group}" ) elif group == "guide": run( session, f"pytest -s {coverage_args} ./docs/test_notebooks.py::test_{group}", ) elif group == "tiledbsoma": run( session, ( f"pytest {coverage_args} tests/tiledbsoma " "./docs/test_notebooks.py::test_tiledbsoma " f"{duration_args}" ), ) elif group == "biology": run( session, f"pytest -s {coverage_args} ./docs/test_notebooks.py::test_{group}", ) elif group == "faq": run(session, f"pytest -s {coverage_args} ./docs/faq") elif group == "storage": run(session, f"pytest -s {coverage_args} ./docs/storage") elif group == "curator": run( session, f"pytest {coverage_args} tests/curators {duration_args}", ) elif group == "integrations": run(session, f"pytest -s {coverage_args} tests/integrations") elif group == "cli": run( session, f"pytest {coverage_args} ./sub/lamin-cli/tests/core {duration_args}", ) elif group == "permissions": run(session, f"pytest {coverage_args} ./tests/permissions") # move artifacts into right place if group in {"tutorial", "guide", "tiledbsoma", "biology"}: target_dir = Path(f"./docs/{group}") target_dir.mkdir(exist_ok=True) for filename in GROUPS[group]: shutil.copy(Path("docs") / filename, target_dir / filename) @nox.session def clidocs(session): def generate_cli_docs(): os.environ["NO_RICH"] = "1" from lamin_cli.__main__ import COMMAND_GROUPS, _generate_help page = "# CLI\n\n" helps = _generate_help() # First, add the main lamin command main_help = helps.get("main") if main_help: help_string = main_help["help"].replace("Usage: main", "Usage: lamin") help_docstring = main_help["docstring"] if help_docstring: page += f"{help_docstring}\n\n" # below is ugly # page += f"```text\n{help_string}\n```\n\n" # Create a mapping of command names to their full keys in helps command_to_key = {} for name in helps.keys(): names = name.split(" ") if len(names) == 2: # e.g., "lamin connect" command_name = names[1] command_to_key[command_name] = name # Group commands by their categories command_groups = COMMAND_GROUPS.get("lamin", []) processed_commands = set() for group in command_groups: group_name = group["name"] group_commands = group["commands"] page += f"## {group_name}\n\n" for command_name in group_commands: if command_name in command_to_key: full_key = command_to_key[command_name] help_dict = helps[full_key] processed_commands.add(command_name) help_string = help_dict["help"].replace("Usage: main", "lamin") help_docstring = help_dict["docstring"] pyr_alt_delimiter = "→ Python/R alternative:" if pyr_alt_delimiter in help_docstring: help_docstring, pyr_alt_string = help_docstring.split( pyr_alt_delimiter ) else: pyr_alt_string = "" page += f"### {command_name}\n\n" if help_docstring: page += f"{help_docstring}\n" command_block = f"```text\n{help_string}\n```" page += f"\n\nOptions:\n\n{command_block}\n\n" if pyr_alt_string: page += f"{pyr_alt_delimiter}{pyr_alt_string}\n\n" # Add any remaining commands that aren't in groups remaining_commands = [] for command_name, full_key in command_to_key.items(): if command_name not in processed_commands: remaining_commands.append((command_name, full_key)) if remaining_commands: page += "## Other\n\n" for command_name, full_key in remaining_commands: help_dict = helps[full_key] help_string = help_dict["help"].replace("Usage: main", "Usage: lamin") help_docstring = help_dict["docstring"] page += f"### lamin 
{command_name}\n\n" if help_docstring: page += f"{help_docstring}\n\n" page += f"```text\n{help_string}\n```\n\n" Path("./docs/cli.md").write_text(page) generate_cli_docs() @nox.session def docs(session): # move artifacts into right place run(session, "lamin settings set private-django-api true") for group in ["tutorial", "guide", "tiledbsoma", "biology", "faq", "storage"]: if Path(f"./docs-{group}").exists(): if Path(f"./docs/{group}").exists(): shutil.rmtree(f"./docs/{group}") Path(f"./docs-{group}").rename(f"./docs/{group}") # move back to root level if group in {"tutorial", "guide", "tiledbsoma", "biology"}: for path in Path(f"./docs/{group}").glob("*"): path.rename(f"./docs/{path.name}") run( session, "lamin init --storage ./docsbuild --modules bionty,pertdb", ) build_docs(session, strip_prefix=True, strict=False) upload_docs_artifact() ================================================ FILE: pyproject.full.toml ================================================ [build-system] requires = ["flit_core >=3.2,<4"] build-backend = "flit_core.buildapi" [project] name = "lamindb" requires-python = ">=3.10,<=3.14" authors = [{name = "Lamin Labs", email = "open-source@lamin.ai"}] readme = "README.md" dynamic = ["version", "description"] classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", ] dependencies = [ "lamindb-core[full]==2.4.2", ] [project.urls] Home = "https://github.com/laminlabs/lamindb" [project.optional-dependencies] gcp = [ "lamindb_setup[gcp]", ] zarr_v2 = [ "numcodecs<0.16.0", # 0.16.0 breaks zarr<3.0.* "zarr>=2.16.0,<3.0.0a0", # not yet compatible with 3.0.* ] fcs = [ "readfcs>=2.0.1", ] dev = [ # basic test "tomlkit", "line_profiler", "pre-commit", "nox", "laminci>=0.3", "pytest>=6.0", "coverage", "pytest-cov<7.0.0", # v7 drops support for subprocess measurement "mudata", # others "nbproject_test>=0.6.0", # biology "faker-biology", # bionty "pronto", ] [tool.flit.module] name = "lamindb_full" [tool.flit.sdist] exclude = [ "sub/" ] ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["flit_core >=3.2,<4"] build-backend = "flit_core.buildapi" [project] name = "lamindb-core" requires-python = ">=3.10,<=3.14" authors = [{name = "Lamin Labs", email = "open-source@lamin.ai"}] readme = "README.md" dynamic = ["version", "description"] classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", ] dependencies = [ "lamin_utils==0.16.4", # no dependencies "lamin_cli==1.16.0", # no dependencies "lamindb_setup[aws]==1.25a1", # dependencies like Django & fsspec "psycopg2-binary", ] [project.urls] Home = "https://github.com/laminlabs/lamindb" [project.optional-dependencies] # full: keep in sync with pyproject.full.toml dependencies (excluding lamindb-core). # If you change duplicated deps here, update pyproject.full.toml too, and vice versa. 
full = [ # LaminDB optional modules, included to avoid users forgetting about extras "bionty>=2.3.1,<3", # 30kB pure python, no dependencies "pertdb>=2.2.0,<3", # 30kB pure python, no dependencies # Jupyter -- small packages with few & small dependencies "jupytext", "nbconvert>=7.2.1", # bound to avoid lxml[html_clean] dependency "nbproject==0.11.1", # adds orjson # Data & validation dependencies (heavier) "pyarrow", "pandera>=0.24.0", "pandas>=2.0.0,<3.0.0", # for .infer_objects(copy=False) in lamin-utils; not yet compatible with Pandas 3.0.0 "anndata>=0.10.0,<=0.12.10", # backed sparse is incompatible with scipy 1.15.0 for anndata 1.11.1 # Runtime utilities "graphviz", "scipy<1.17.0", # 1.17.0 is incompatible with anndata<0.12.7 "pyyaml", "typing_extensions!=4.6.0", "python-dateutil", ] gcp = [ "lamindb_setup[gcp]", ] zarr_v2 = [ "numcodecs<0.16.0", # 0.16.0 breaks zarr<3.0.* "zarr>=2.16.0,<3.0.0a0", # not yet compatible with 3.0.* ] fcs = [ "readfcs>=2.0.1", ] dev = [ # basic test "tomlkit", "line_profiler", "pre-commit", "nox", "laminci>=0.3", "pytest>=6.0", "coverage", "pytest-cov<7.0.0", # v7 drops support for subprocess measurement "mudata", # others "nbproject_test>=0.6.0", # biology "faker-biology", # bionty "pronto", ] [tool.flit.module] name = "lamindb" [tool.ruff] src = ["src"] line-length = 88 lint.select = [ "F", # Errors detected by Pyflakes "E", # Error detected by Pycodestyle "W", # Warning detected by Pycodestyle "I", # isort "D", # pydocstyle "B", # flake8-bugbear "TID", # flake8-tidy-imports "C4", # flake8-comprehensions "BLE", # flake8-blind-except "UP", # pyupgrade "RUF100", # Report unused noqa directives "TCH", # Typing imports "NPY", # Numpy specific rules "PTH", # Use pathlib "S" # Security ] lint.ignore = [ # Do not catch blind exception: `Exception` "BLE001", # Errors from function calls in argument defaults. These are fine when the result is immutable. 
"B008", # line too long -> we accept long comment lines; black gets rid of long code lines "E501", # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient "E731", # allow I, O, l as variable names -> I is the identity matrix "E741", # Missing docstring in public module "D100", # undocumented-public-class "D101", # Missing docstring in public method "D102", # Missing docstring in public function "D103", # Missing docstring in public package "D104", # __magic__ methods are are often self-explanatory, allow missing docstrings "D105", # Missing docstring in public nested class "D106", # Missing docstring in __init__ "D107", "D405", "D214", "D416", ## Disable one in each pair of mutually incompatible rules # We don’t want a blank line before a class docstring "D203", # 1 blank line required after class docstring "D204", # first line should end with a period [Bug: doesn't work with single-line docstrings] # We want docstrings to start immediately after the opening triple quote "D213", # Section underline is over-indented ("{name}") "D215", # First line should be in imperative mood; try rephrasing "D401", # First word of the first line should be capitalized: {} -> {} "D403", # First word of the docstring should not be "This" "D404", # Section name should end with a newline ("{name}") "D406", # Missing dashed underline after section ("{name}") "D407", # Section underline should be in the line following the section's name ("{name}") "D408", # Section underline should match the length of its name ("{name}") "D409", # No blank lines allowed between a section header and its content ("{name}") "D412", # Missing blank line after last section ("{name}") "D413", # Missing argument description in the docstring "D417", # camcelcase imported as lowercase "N813", # module import not at top level of file "E402", # open()` should be replaced by `Path.open() "PTH123", # subprocess` call: check for execution of untrusted input - https://github.com/PyCQA/bandit/issues/333 "S603", # Starting a process with a partial executable path "S607", # Prefer absolute imports over relative imports from parent modules "TID252", # Asserts "S101", # Standard pseudo-random generators are not suitable for cryptographic purposes "S311", # Starting a process with a shell: seems safe, but may be changed in the future; consider rewriting without `shell` "S605", # Possible SQL injection vector through string-based query construction "S608", # All of the below TODO 3.10 refactor, temporarily disable "S602", "UP007", "UP038", "B905", "UP035", "RUF100", ] [tool.ruff.lint.pydocstyle] convention = "google" [tool.ruff.lint.per-file-ignores] "docs/*" = ["I", "S101"] "tests/**/*.py" = [ "D", # docstrings are allowed to look a bit off "S101", # asserts allowed in tests... "ARG", # Unused function args -> fixtures nevertheless are functionally relevant... "FBT", # Don't care about booleans as positional arguments in tests, e.g. via @pytest.mark.parametrize() "PLR2004", # Magic value used in comparison, ... 
"S311", # Standard pseudo-random generators are not suitable for cryptographic purposes ] "tests/**/*.ipynb" = ["S101"] "*/__init__.py" = ["F401"] "lamindb/core/types.py" = ["F401"] [tool.pytest.ini_options] testpaths = [ "tests", ] filterwarnings = [ "ignore::SyntaxWarning:pronto", "ignore:::pronto.ontology", "ignore::UserWarning:xarray_schema", "ignore::DeprecationWarning:botocore.*", "ignore::DeprecationWarning:xarray_schema", "ignore::DeprecationWarning:geopandas", "ignore::DeprecationWarning:tiledbsoma", "ignore::DeprecationWarning:pkg_resources", "ignore::FutureWarning:spatialdata", "ignore::FutureWarning:mudata", "ignore::UserWarning:anndata", "ignore:Jupyter is migrating its paths to use standard platformdirs:DeprecationWarning", "ignore:The 'train_dataloader' does not have many workers:UserWarning", ] markers = [ "pg_integration: tests that require an external PostgreSQL instance" ] [tool.coverage.report] exclude_lines = [ "if TYPE_CHECKING:", "@abstractmethod", "@abc.abstractmethod" ] [tool.coverage.run] omit = ["**/examples/datasets/*", "**/migrations/*", "**/curators/_legacy.py", "**/core/_compat.py", "**/core/types.py"] [tool.flit.sdist] exclude = [ "sub/" ] ================================================ FILE: scripts/migrate_test_instances.py ================================================ #!/usr/bin/env python3 """Migrate all LaminDB instances used in lamindb tests. For each instance: connect, run migrations, create storage snapshot. Run from repo root with: python scripts/migrate_test_instances.py """ import subprocess import sys INSTANCES = [ "laminlabs/lamin-site-assets", "laminlabs/lamin-dev", "laminlabs/lamindata", "laminlabs/cellxgene", "laminlabs/bionty-assets", "laminlabs/pertdata", ] def run(cmd: str) -> None: result = subprocess.run(cmd, shell=True) if result.returncode != 0: sys.exit(result.returncode) def main() -> None: for instance in INSTANCES: print(f"=== Migrating {instance} ===") run(f"lamin connect {instance}") run("lamin migrate deploy") run("lamin io snapshot") print() print("Done. 
All test instances migrated and snapshotted.") if __name__ == "__main__": main() ================================================ FILE: tests/core/_dataset_fixtures.py ================================================ from pathlib import Path from typing import Generator import lamindb as ln import numpy as np import pandas as pd import pytest from scipy.sparse import csr_matrix @pytest.fixture(scope="session") def get_small_adata(): # shouldn't need anndata installed to run tests import anndata as ad return ad.AnnData( X=np.array([[1, 2, 3], [4, 5, 6]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, ) @pytest.fixture(scope="session") def get_small_mdata(): # shouldn't need mudata installed to run tests import anndata as ad import mudata as md adata1 = ad.AnnData( X=np.array([[1, 2, 3], [4, 5, 6]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, ) adata2 = ad.AnnData( X=np.array([[7, 8], [9, 10]]), obs={"feat2": ["C", "D"]}, var=pd.DataFrame(index=["FOXP3", "CD8A"]), obsm={"X_umap": np.array([[5, 6], [7, 8]])}, ) return md.MuData({"rna": adata1, "protein": adata2}) @pytest.fixture(scope="session") def get_small_sdata(): # shouldn't need spatialdata installed to run tests import anndata as ad import spatialdata as sd adata = ad.AnnData( X=csr_matrix(np.array([[0.1, 0.2], [0.3, 0.4]])), obs=pd.DataFrame(index=["cell1", "cell2"]), var=pd.DataFrame(index=["gene1", "gene2"]), ) { "region1": np.array([[[0, 0], [0, 1], [1, 1], [1, 0]]]), "region2": np.array([[[2, 2], [2, 3], [3, 3], [3, 2]]]), } sdata_obj = sd.SpatialData( tables={"gene_expression": adata}, ) return sdata_obj @pytest.fixture(scope="session") def get_mini_csv() -> Generator[Path, None, None]: csv_path = ln.examples.datasets.file_mini_csv() yield csv_path Path("mini.csv").unlink(missing_ok=True) ================================================ FILE: tests/core/conftest.py ================================================ import os import shutil from pathlib import Path from subprocess import DEVNULL, run from time import perf_counter import anndata as ad import lamindb as ln import lamindb_setup as ln_setup import numpy as np import pandas as pd import pytest # for artifact fixtures import yaml # type: ignore from lamin_utils import logger from laminci.db import setup_local_test_postgres def pytest_sessionstart(): t_execute_start = perf_counter() ln_setup._TESTING = True os.environ["LAMIN_TESTING"] = "true" is_postgresql = os.getenv("LAMINDB_TEST_DB_VENDOR") == "postgresql" if is_postgresql: print("running tests on PostgreSQL") else: os.environ["LAMINDB_TEST_DB_VENDOR"] = "sqlite" print("running tests on SQLite") if is_postgresql is False: ln.setup.init( storage="./default_storage_unit_core", modules="bionty", name="lamindb-unit-tests-core", ) else: try: pgurl = setup_local_test_postgres() except RuntimeError: run("docker stop pgtest && docker rm pgtest", shell=True, stdout=DEVNULL) # noqa: S602 pgurl = setup_local_test_postgres() ln.setup.init( storage="./default_storage_unit_core", modules="bionty", name="lamindb-unit-tests-core", db=pgurl, ) ln.settings.creation.artifact_silence_missing_run_warning = True total_time_elapsed = perf_counter() - t_execute_start print(f"time to setup the instance: {total_time_elapsed:.1f}s") def pytest_sessionfinish(session: pytest.Session): logger.set_verbosity(1) shutil.rmtree("./default_storage_unit_core") ln.setup.delete("lamindb-unit-tests-core", 
force=True) del os.environ["LAMIN_TESTING"] if not os.getenv("LAMINDB_TEST_DB_VENDOR") == "sqlite": run("docker stop pgtest && docker rm pgtest", shell=True, stdout=DEVNULL) # noqa: S602 @pytest.fixture def ccaplog(caplog) -> pytest.LogCaptureFixture: """Add caplog handler to our custom logger at session start.""" from lamin_utils._logger import logger logger.addHandler(caplog.handler) yield caplog logger.removeHandler(caplog.handler) @pytest.fixture( scope="function", params=[ # tuple of is_in_registered_storage, path, suffix, hash of test_dir (True, "./default_storage_unit_core/", ".csv", "iGtHiFEBV3r1_TFovdQCgw"), (True, "./default_storage_unit_core/", "", "iGtHiFEBV3r1_TFovdQCgw"), (True, "./registered_storage/", ".csv", "iGtHiFEBV3r1_TFovdQCgw"), (True, "./registered_storage/", "", "iGtHiFEBV3r1_TFovdQCgw"), (False, "./nonregistered_storage/", ".csv", "iGtHiFEBV3r1_TFovdQCgw"), (False, "./nonregistered_storage/", "", "iGtHiFEBV3r1_TFovdQCgw"), ], ) def get_test_filepaths(request): # -> Tuple[bool, Path, Path, Path, str] is_in_registered_storage: bool = request.param[0] root_dir: Path = Path(request.param[1]) suffix: str = request.param[2] hash_test_dir: str = request.param[3] if is_in_registered_storage: # ensure that it's actually registered if ln.Storage.filter(root=root_dir.resolve().as_posix()).one_or_none() is None: ln.Storage(root=root_dir.resolve().as_posix(), type="local").save() else: assert ( ln.Storage.filter(root=root_dir.resolve().as_posix()).one_or_none() is None ) test_dirpath = root_dir / "my_dir/" test_dirpath.mkdir(parents=True, exist_ok=True) # create a first file test_filepath0 = test_dirpath / f"my_file{suffix}" test_filepath0.write_text("0") # create a second, duplicated file test_filepath1 = test_dirpath / f"my_file1{suffix}" test_filepath1.write_text("0") # create a non-duplicated file test_filepath2 = test_dirpath / f"my_file2{suffix}" test_filepath2.write_text("1") # return a boolean indicating whether test filepath is in default storage # and the test filepath yield ( is_in_registered_storage, root_dir, test_dirpath, test_filepath0, suffix, hash_test_dir, ) shutil.rmtree(test_dirpath) @pytest.fixture(scope="function") def registered_storage_file_and_folder(): root_dir = Path("./registered_storage_suffix_fixture") storage_root = root_dir.resolve().as_posix() if ln.Storage.filter(root=storage_root).one_or_none() is None: ln.Storage(root=storage_root, type="local").save() test_dirpath = root_dir / "suffix_fixture_dir" test_dirpath.mkdir(parents=True, exist_ok=True) test_filepath = test_dirpath / "suffix_fixture_file.csv" test_filepath.write_text("a,b\n1,2\n") folder_path = root_dir / "suffix_fixture_folder" folder_path.mkdir(parents=True, exist_ok=True) (folder_path / "nested.txt").write_text("content") yield test_filepath, folder_path shutil.rmtree(test_dirpath, ignore_errors=True) shutil.rmtree(folder_path, ignore_errors=True) @pytest.fixture(scope="session") def example_dataframe(): return pd.DataFrame({"feat1": [1, 2], "feat2": [3, 4]}) @pytest.fixture(scope="session") def adata_file(): adata = ad.AnnData( X=np.array([[1, 2, 3], [4, 5, 6]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, ) filepath = Path("adata_file.h5ad") adata.write(filepath) yield "adata_file.h5ad" filepath.unlink() @pytest.fixture(scope="session") def tsv_file(): filepath = Path("test.tsv") pd.DataFrame([1, 2]).to_csv(filepath, sep="\t") yield filepath filepath.unlink() @pytest.fixture(scope="session") def 
zip_file(): filepath = Path("test.zip") pd.DataFrame([1, 2]).to_csv(filepath, sep="\t") yield filepath filepath.unlink(missing_ok=True) @pytest.fixture(scope="session") def yaml_file(): filepath = Path("test.yaml") dct = {"a": 1, "b": 2} with open(filepath, "w") as f: yaml.dump(dct, f) yield filepath filepath.unlink() @pytest.fixture(scope="session") def fcs_file(): fcs_path = ln.examples.datasets.file_fcs_alpert19() yield fcs_path fcs_path.unlink() @pytest.fixture(scope="session") def mudata_file(get_small_mdata): filepath = Path("test.h5mu") get_small_mdata.write(filepath) yield filepath filepath.unlink() @pytest.fixture(scope="session") def spatialdata_file(get_small_sdata): filepath = Path("test.zarr") get_small_sdata.write(filepath) yield filepath shutil.rmtree(filepath) ================================================ FILE: tests/core/notebooks/basic-r-notebook.Rmd.cleaned.html ================================================
library(laminr)

db <- connect()
→ connected lamindb: laminlabs/lamindata
db$track("lOScuxDTDE0q0000")
→ loaded Transform('lOScuxDT'), started Run('GWpaTtUg') at 2024-12-01 17:49:18 UTC
db$finish()
MoreOUTPUT 
================================================ FILE: tests/core/notebooks/basic-r-notebook.Rmd.html ================================================ My exemplary R analysis

My exemplary R analysis

library(laminr)

db <- connect()
→ connected lamindb: laminlabs/lamindata
db$track("lOScuxDTDE0q0000")
→ loaded Transform('lOScuxDT'), started Run('GWpaTtUg') at 2024-12-01 17:49:18 UTC
db$finish()
MoreOUTPUT ! please hit SHORTCUT to save the notebook in your editor and re-run finish()
================================================ FILE: tests/core/notebooks/duplicate/with-title-initialized-consecutive-finish.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# My duplicated test notebook (consecutive) with `ln.finish()`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This has actually different content than the original one in the `notebooks/` folder." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import lamindb as ln\n", "\n", "ln.track()" ] } ], "metadata": { "kernelspec": { "display_name": "py310", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: tests/core/notebooks/load_schema.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "0", "metadata": {}, "outputs": [], "source": [ "import lamindb as ln" ] }, { "cell_type": "code", "execution_count": null, "id": "1", "metadata": {}, "outputs": [], "source": [ "# this is a test case because we had an issue with path resolution at some point: https://github.com/laminlabs/lamindb/pull/3211\n", "valid_features = ln.examples.schemas.valid_features()" ] }, { "cell_type": "code", "execution_count": null, "id": "2", "metadata": {}, "outputs": [], "source": [ "valid_features.delete(permanent=True)" ] } ], "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: tests/core/notebooks/no-title.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "0", "metadata": {}, "source": [ "A notebook without title." 
] }, { "cell_type": "code", "execution_count": null, "id": "1", "metadata": {}, "outputs": [], "source": [ "import lamindb as ln" ] }, { "cell_type": "code", "execution_count": null, "id": "2", "metadata": {}, "outputs": [], "source": [ "# pass stem uid\n", "ln.track(\"123456789ABC\")" ] }, { "cell_type": "code", "execution_count": null, "id": "3", "metadata": {}, "outputs": [], "source": [ "assert ln.context.transform.description is None\n", "assert ln.context.transform.key == \"no-title.ipynb\"" ] } ], "metadata": { "kernelspec": { "display_name": "py312", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" }, "nbproject": { "id": "Irn3xQyQ40GU", "pypackage": { "nbproject": "0.0.7+2.g8521e30" }, "time_init": "2022-06-08T14:42:31.551211+00:00", "version": "0" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: tests/core/notebooks/with-title-initialized-consecutive-finish-not-last-cell.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# My test notebook (consecutive) with `ln.finish()` not in last cell" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import lamindb as ln" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# do not pass uid purposefully\n", "ln.track()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"my consecutive cell\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ln.finish(ignore_non_consecutive=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"my consecutive cell\")" ] } ], "metadata": { "kernelspec": { "display_name": "py39", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: tests/core/notebooks/with-title-initialized-consecutive-finish.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# My test notebook (consecutive) with `ln.finish()`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import lamindb as ln\n", "import pytest" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with pytest.raises(ln.errors.InvalidArgument) as error:\n", " ln.track(\"ujPaFZ\")\n", "print(error.exconly())\n", "assert error.exconly().startswith(\n", " 'lamindb.errors.InvalidArgument: Please pass an auto-generated uid instead of \"ujPaFZ\". 
Resolve by running:'\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# with uid passed\n", "ln.track(\"ujPaFZatnMLG0000\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"my consecutive cell\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"my consecutive cell\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ln.finish()" ] } ], "metadata": { "kernelspec": { "display_name": "py312", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: tests/core/scripts/duplicate1/script-to-test-versioning.py ================================================ import lamindb as ln ln.context.version = "1" ln.track("Ro1gl7n8YrdH0001") ================================================ FILE: tests/core/scripts/duplicate2/script-to-test-versioning.py ================================================ import lamindb as ln ln.context.version = "2" ln.track("Ro1gl7n8YrdH0002") assert ln.context.transform.version_tag == "2" ================================================ FILE: tests/core/scripts/duplicate3/script-to-test-versioning.py ================================================ import lamindb as ln ln.context.version = "3" ln.track("Ro1gl7n8YrdH0002") ================================================ FILE: tests/core/scripts/duplicate4/script-to-test-versioning.py ================================================ import lamindb as ln ln.track() ================================================ FILE: tests/core/scripts/duplicate5/script-to-test-versioning.py ================================================ import lamindb as ln # different from the one in duplicate4 ln.track() ln.finish() ================================================ FILE: tests/core/scripts/script-to-test-filename-change.py ================================================ import lamindb as ln ln.track("Ro1gl7n8YrdH0001") ================================================ FILE: tests/core/scripts/script-to-test-versioning.py ================================================ import lamindb as ln ln.context.version = "1" ln.track("Ro1gl7n8YrdH0000") ================================================ FILE: tests/core/test_artifact_anndata_with_curation.py ================================================ import lamindb as ln def test_create_anndata_with_curation(): adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") feature1 = ln.Feature(name="sample_note", dtype=str).save() # ingest the first time artifact = ln.Artifact.from_anndata( adata, key="examples/mini_immuno1.h5ad", schema="ensembl_gene_ids_and_valid_features_in_obs", ).save() # capture the obs_schema because we'll overwrite it obs_schema = artifact.features.slots["obs"] # define another feature so that upon re-ingestion, we track more than before # (this also tests non-trivial idempotency) feature2 = ln.Feature(name="treatment_time_h", dtype=int).save() artifact = ln.Artifact.from_anndata( adata, key="examples/mini_immuno1.h5ad", schema="ensembl_gene_ids_and_valid_features_in_obs", ).save() schemas = artifact.features.slots 
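# cleanup: permanently delete the re-ingested artifact, the schemas inferred for its slots, the captured obs_schema, and both features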
artifact.delete(permanent=True) for schema in schemas.values(): schema.delete(permanent=True) obs_schema.delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) ================================================ FILE: tests/core/test_artifact_basics.py ================================================ """Artifact tests. Also see `test_artifact_folders.py` for tests of folder-like artifacts. """ # ruff: noqa: F811 import shutil import sys from pathlib import Path, PurePosixPath from types import ModuleType, SimpleNamespace from unittest.mock import patch import anndata as ad import h5py import lamindb as ln import lamindb_setup import mudata as md import pandas as pd import pytest import zarr from _dataset_fixtures import ( # noqa get_mini_csv, get_small_adata, get_small_mdata, get_small_sdata, ) from lamindb.core.loaders import load_fcs, load_to_memory, load_tsv from lamindb.core.storage.paths import ( AUTO_KEY_PREFIX, auto_storage_key_from_artifact_uid, check_path_is_child_of_root, delete_storage, ) from lamindb.errors import ( FieldValidationError, InvalidArgument, ) from lamindb.models.artifact import ( data_is_scversedatastructure, get_relative_path_to_directory, process_data, ) from lamindb_setup.core.upath import ( CloudPath, LocalPathClasses, UPath, extract_suffix_from_path, ) # how do we properly abstract out the default storage variable? # currently, we're only mocking it through `storage` as set in conftest.py ln.settings.verbosity = "success" @pytest.fixture def data(request): if request.param == "get_small_adata": return request.getfixturevalue("get_small_adata") else: return request.param # ------------------------------------------------------------------------------------- # Basic construction # ------------------------------------------------------------------------------------- def test_basic_validation(): # extra kwargs with pytest.raises(FieldValidationError): ln.Artifact("testpath.csv", description="test1b", extra_kwarg="extra") # > 1 args with pytest.raises(ValueError) as error: ln.Artifact("testpath.csv", "testpath.csv") assert error.exconly() == "ValueError: Only one non-keyword arg allowed: path" # AUTO_KEY_PREFIX in key with pytest.raises(ValueError) as error: ln.Artifact(".gitignore", key=".lamindb/test_df.parquet") assert ( error.exconly() == f"ValueError: Do not pass key that contains a managed storage path in `{AUTO_KEY_PREFIX}`" ) # path that contains AUTO_KEY_PREFIX with pytest.raises(ValueError) as error: ln.Artifact(".lamindb/test_df.parquet", description="Test") assert ( error.exconly() == f"ValueError: Do not pass path inside the `{AUTO_KEY_PREFIX}` directory." 
) @pytest.mark.parametrize("key_is_virtual", [True, False]) @pytest.mark.parametrize("key", [None, "my_new_dir/my_artifact.csv", "nosuffix"]) @pytest.mark.parametrize("description", [None, "my description"]) def test_create_from_path_file(get_test_filepaths, key_is_virtual, key, description): ln.settings.creation._artifact_use_virtual_keys = key_is_virtual is_in_registered_storage = get_test_filepaths[0] root_dir = get_test_filepaths[1] test_filepath = get_test_filepaths[3] suffix = get_test_filepaths[4] # path suffix if key is not None: key_suffix = extract_suffix_from_path( PurePosixPath(key), arg_name="key" ) # key suffix else: key_suffix = None # this tests if insufficient information is being provided if key is None and not is_in_registered_storage and description is None: # this can fail because ln.track() might set a global run context # in that case, the Artifact would have a run that's not None and the # error below wouldn't be thrown with pytest.raises(ValueError) as error: artifact = ln.Artifact(test_filepath, key=key, description=description) assert ( error.exconly() == "ValueError: Pass one of key, run or description as a parameter" ) return None elif key is not None and suffix != key_suffix: with pytest.raises(InvalidArgument) as error: artifact = ln.Artifact(test_filepath, key=key, description=description) assert error.exconly() == ( f"lamindb.errors.InvalidArgument: The passed path's suffix '{suffix}' must match the passed key's suffix '{key_suffix}'." ) return None elif key is not None and is_in_registered_storage: inferred_key = get_relative_path_to_directory( path=test_filepath, directory=root_dir ).as_posix() try: artifact = ln.Artifact(test_filepath, key=key, description=description) except InvalidArgument as error: assert str(error) == ( f"The path '{test_filepath}' is already in registered storage" f" '{root_dir.resolve().as_posix()}' with key '{inferred_key}'\nYou" f" passed conflicting key '{key}': please move the file before" " registering it." 
) return None else: artifact = ln.Artifact(test_filepath, key=key, description=description) assert artifact._state.adding # make sure that this is a new file in the db assert ( artifact.description is None if description is None else artifact.description == description ) assert artifact.suffix == suffix assert artifact.n_files is None artifact.save() assert artifact.path.exists() # check get by path assert ln.Artifact.get(path=artifact.path) == artifact if key is None: assert ( artifact.key == f"my_dir/my_file{suffix}" if is_in_registered_storage else artifact.key is None ) if is_in_registered_storage: assert artifact.storage.root == root_dir.resolve().as_posix() assert artifact.path == test_filepath.resolve() else: assert artifact.storage.root == lamindb_setup.settings.storage.root_as_str assert ( artifact.path == lamindb_setup.settings.storage.root / f".lamindb/{artifact.uid}{suffix}" ) else: assert artifact.key == key assert artifact._key_is_virtual == key_is_virtual if is_in_registered_storage: # this would only hit if the key matches the correct key assert artifact.storage.root == root_dir.resolve().as_posix() assert ( artifact.path == root_dir / f"{key}{suffix}" == test_filepath.resolve() ) else: # file is moved into default storage if key_is_virtual: assert ( artifact.path == lamindb_setup.settings.storage.root / f".lamindb/{artifact.uid}{suffix}" ) else: assert artifact.path == lamindb_setup.settings.storage.root / key # only delete from storage if a file copy took place delete_from_storage = str(test_filepath.resolve()) != str(artifact.path) artifact.delete(permanent=True, storage=delete_from_storage) ln.settings.creation._artifact_use_virtual_keys = True @pytest.mark.parametrize("key_is_virtual", [True, False]) @pytest.mark.parametrize("key", [None, "my_new_file.tsv"]) def test_create_from_path_file_with_explicit_key_is_virtual( tsv_file, key_is_virtual, key ): artifact = ln.Artifact( tsv_file, description="test explicit key is virtual", key=key, _key_is_virtual=key_is_virtual, ) assert artifact.key == key assert artifact._key_is_virtual == key_is_virtual artifact.save() assert artifact.path.exists() root = lamindb_setup.settings.storage.root if not key_is_virtual and key is not None: assert artifact.path == root / key else: assert artifact.path == root / f".lamindb/{artifact.uid}.tsv" artifact.delete(permanent=True, storage=True) def test_create_from_empty_files_skips_hash_lookup(tmp_path): path_1 = tmp_path / "empty-1.txt" path_2 = tmp_path / "empty-2.txt" path_1.write_text("") path_2.write_text("") artifact_1 = ln.Artifact(path_1, key=f"{tmp_path.name}/empty-1.txt").save() artifact_2 = ln.Artifact(path_2, key=f"{tmp_path.name}/empty-2.txt") assert artifact_2.uid != artifact_1.uid assert artifact_2.key == f"{tmp_path.name}/empty-2.txt" assert artifact_2.hash == artifact_1.hash artifact_2.save() assert artifact_2.id != artifact_1.id artifact_2.delete(permanent=True) artifact_1.delete(permanent=True) @pytest.mark.parametrize("key", [None, "my_new_folder"]) def test_create_from_path_folder(get_test_filepaths, key): # get variables from fixture is_in_registered_storage = get_test_filepaths[0] test_dirpath = get_test_filepaths[2] hash_test_dir = get_test_filepaths[5] if key is None and not is_in_registered_storage: with pytest.raises(ValueError) as error: ln.Artifact(test_dirpath, key=key) assert error.exconly().startswith( "ValueError: Pass one of key, run or description as a parameter" ) return None artifact1 = ln.Artifact(test_dirpath, key=key) if key is not None and 
is_in_registered_storage: assert artifact1._real_key is not None # should fail because we are passing a path in an existing storage with a virtual key with pytest.raises(ValueError) as error: ln.Artifact(test_dirpath, key=key, _key_is_virtual=False) assert error.exconly().startswith( "ValueError: Passing a path in an existing storage with a virtual key and _key_is_virtual=False is incompatible." ) else: assert artifact1._real_key is None # check that passing _key_is_virtual=True is incompatible with a path in an existing storage without a virtual key if key is None and is_in_registered_storage: with pytest.raises(ValueError) as error: ln.Artifact(test_dirpath, key=key, _key_is_virtual=True) assert error.exconly().startswith( "ValueError: Passing a path in an existing storage without a virtual key and _key_is_virtual=True is incompatible." ) assert artifact1.n_files == 3 assert artifact1.hash == hash_test_dir assert artifact1._state.adding assert artifact1.description is None assert artifact1.path.exists() artifact1.save() # run tests on re-creating the Artifact artifact2 = ln.Artifact(test_dirpath, key=key, description="something") assert not artifact2._state.adding assert artifact1.id == artifact2.id assert artifact1.uid == artifact2.uid assert artifact1.storage == artifact2.storage assert artifact2.path.exists() assert artifact2.description == "something" # now put another file in the test directory # create a first file test_filepath_added = test_dirpath / "my_file_added.txt" test_filepath_added.write_text("2") artifact3 = ln.Artifact(test_dirpath, key=key, revises=artifact1) assert artifact3.n_files == 4 assert artifact3.hash != hash_test_dir assert artifact3._state.adding assert artifact3.description is None assert artifact3.path.exists() artifact3.save() # the state of artifact1 is lost, because artifact3 is stored at the same path assert artifact3.overwrite_versions assert artifact1.overwrite_versions assert artifact3.path == artifact1.path test_filepath_added.unlink() # delete the artifact artifact2.delete(permanent=True, storage=False) artifact3.delete(permanent=True, storage=False) def test_create_from_path_overwrite_versions_false(get_test_filepaths): # get variables from fixture is_in_registered_storage = get_test_filepaths[0] test_dirpath = get_test_filepaths[2] hash_test_dir = get_test_filepaths[5] if is_in_registered_storage: return artifact1 = ln.Artifact( test_dirpath, key="my_folder", overwrite_versions=False ).save() assert artifact1.hash == hash_test_dir # skip artifact2 because we already test this above # create a first file test_filepath_added = test_dirpath / "my_file_added.txt" test_filepath_added.write_text("2") artifact3 = ln.Artifact(test_dirpath, key="my_folder", overwrite_versions=False) assert artifact3.hash != hash_test_dir artifact3.save() # the state of artifact1 is lost, because artifact3 is stored at the same path assert not artifact3.overwrite_versions assert not artifact1.overwrite_versions assert artifact3.path != artifact1.path test_filepath_added.unlink() artifact1.delete(permanent=True, storage=False) artifact3.delete(permanent=True, storage=False) def test_delete_permanently_from_trash_folder(tmp_path): folder_path = tmp_path / "folder-overwrite-versions" folder_path.mkdir() (folder_path / "v1.txt").write_text("v1") key = f"{tmp_path.name}/folder-overwrite-versions" artifact = ln.Artifact(folder_path, key=key).save() assert artifact.overwrite_versions # First soft-delete (move to trash), then delete permanently. 
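# branch_id == -1 marks the trash branch; the second delete() prompts for confirmation, hence the patched builtins.input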
artifact.delete() artifact.refresh_from_db() assert artifact.branch_id == -1 with patch("builtins.input", return_value="y"): artifact.delete() assert ln.Artifact.objects.filter(uid__startswith=artifact.stem_uid).count() == 0 def test_create_from_path_set_branch(): branch = ln.Branch(name="contrib1").save() artifact1 = ln.Artifact(".gitignore", key="test", branch=branch).save() # check hash lookup on different branch artifact2 = ln.Artifact(".gitignore", key="test1") assert artifact1 == artifact2 # cleanup artifact1.delete(permanent=True) branch.delete(permanent=True) @pytest.mark.parametrize("key", [None, "my_new_folder"]) def test_from_dir(get_test_filepaths, key): is_in_registered_storage = get_test_filepaths[0] test_dirpath = get_test_filepaths[2] # the directory contains 3 files, two of them are duplicated artifacts = ln.Artifact.from_dir(test_dirpath, key=key) for artifact in artifacts: if key is not None and is_in_registered_storage: assert artifact._real_key is not None else: assert artifact._real_key is None # we only return the duplicated ones hashes = [artifact.hash for artifact in artifacts if artifact.hash is not None] uids = [artifact.uid for artifact in artifacts] assert len(set(hashes)) == len(hashes) ln.UPath(test_dirpath).view_tree() # now save artifacts.save() # now run again, because now we'll have hash-based lookup! artifacts = ln.Artifact.from_dir(test_dirpath, key=key) assert len(artifacts) == 2 assert len(set(artifacts)) == len(hashes) queried_artifacts = ln.Artifact.filter(uid__in=uids) for artifact in queried_artifacts: artifact.delete(permanent=True, storage=False) def test_create_from_dataframe(example_dataframe: pd.DataFrame): df = example_dataframe artifact = ln.Artifact.from_dataframe(df, description="test1") assert artifact.description == "test1" assert artifact.key is None assert artifact.otype == "DataFrame" assert artifact.kind == "dataset" assert artifact.n_observations == 2 assert hasattr(artifact, "_local_filepath") artifact.key = "my-test-dataset" # try changing key with pytest.raises(ln.errors.InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: The suffix '' of the provided key is incorrect, it should be '.parquet'." 
) artifact.key = None # restore artifact.suffix = ".whatever" # changing suffix before first save is invalid with pytest.raises( ln.errors.InvalidArgument, match="Cannot update the suffix of an artifact before it is saved.", ): artifact.save() artifact.suffix = ".parquet" artifact.save() # check that the local filepath has been cleared assert not hasattr(artifact, "_local_filepath") del artifact # now get an artifact from the database artifact = ln.Artifact.get(description="test1") parquet_path = artifact.path assert parquet_path.exists() assert parquet_path.suffix == ".parquet" # test cancelling the move artifact.suffix = ".whatever" with patch("builtins.input", return_value="n"): assert artifact.save() is None assert parquet_path.exists() artifact = ln.Artifact.get(description="test1") assert artifact.suffix == ".parquet" artifact.suffix = ".whatever" with patch("builtins.input", return_value="y"): artifact.save() assert artifact.suffix == ".whatever" whatever_path = artifact.path assert whatever_path.exists() assert whatever_path.suffix == ".whatever" assert not parquet_path.exists() artifact.suffix = ".parquet" with patch("builtins.input", return_value="y"): artifact.save() assert artifact.suffix == ".parquet" parquet_path_restored = artifact.path assert parquet_path_restored.exists() assert parquet_path_restored.suffix == ".parquet" assert not whatever_path.exists() # coming from `key is None` that setting a key with different suffix is not allowed artifact.key = "my-test-dataset.suffix" with pytest.raises(ln.errors.InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: The suffix '.suffix' of the provided key is incorrect, it should be '.parquet'." ) # coming from `key is None` test with no suffix artifact.key = "my-test-dataset" with pytest.raises(ln.errors.InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: The suffix '' of the provided key is incorrect, it should be '.parquet'." ) # virtual key and suffix can now be updated together artifact.key = "my-test-dataset" artifact.suffix = "" with patch("builtins.input", return_value="y"): artifact.save() assert artifact.suffix == "" assert artifact.key == "my-test-dataset" # changing the suffix updates the key suffix as well artifact.suffix = ".parquet" with patch("builtins.input", return_value="y"): artifact.save() assert artifact.key == "my-test-dataset.parquet" # coming from a .parquet key, test changing the key to no suffix artifact.key = "my-test-dataset" with pytest.raises(ln.errors.InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: The suffix '' of the provided key is incorrect, it should be '.parquet'." 
) artifact.delete(permanent=True) # test from_dataframe with a path path = Path("test_df_from_path.parquet") try: example_dataframe.to_parquet(path) for path_input in [path, str(path)]: artifact = ln.Artifact.from_dataframe( path_input, description="test from path" ) assert artifact.description == "test from path" assert artifact.otype == "DataFrame" assert artifact.kind == "dataset" assert artifact.n_observations == 2 artifact.save() artifact.delete(permanent=True) finally: path.unlink(missing_ok=True) def test_dataframe_validate_suffix(example_dataframe: pd.DataFrame): df = example_dataframe artifact = ln.Artifact.from_dataframe(df, key="test_.parquet") assert artifact.suffix == ".parquet" with pytest.raises(ln.errors.InvalidArgument) as error: artifact = ln.Artifact.from_dataframe(df, key="test_.def") assert ( error.exconly().partition(",")[0] == "lamindb.errors.InvalidArgument: The passed key's suffix '.def' must match the passed path's suffix '.parquet'." ) def test_create_from_parquet_file_default_constructor( example_dataframe: pd.DataFrame, ccaplog: pytest.LogCaptureFixture ): path = "test_df.parquet" example_dataframe.to_parquet(path) ln.Artifact(path, key=path) assert "data is a DataFrame, please use .from_dataframe()" in ccaplog.text Path(path).unlink() def test_create_from_anndata(get_small_adata, adata_file, example_dataframe): with pytest.raises(ValueError) as error: ln.Artifact.from_anndata(example_dataframe, description="test1") assert ( "data has to be an AnnData object or a path to AnnData-like" in error.exconly() ) for i, _a in enumerate([get_small_adata, adata_file]): artifact = ln.Artifact.from_anndata(_a, description="test1") assert artifact.description == "test1" assert artifact.key is None assert artifact.otype == "AnnData" assert artifact.kind == "dataset" assert artifact.n_observations == 2 if i == 0: assert hasattr(artifact, "_local_filepath") artifact.save() # check that the local filepath has been cleared assert not hasattr(artifact, "_local_filepath") artifact.delete(permanent=True) def test_from_anndata_uses_h5ad_kwargs(get_small_adata): artifact = ln.Artifact.from_anndata( get_small_adata, key="test_kwargs.h5ad", h5ad_kwargs={"compression": "gzip"}, ) local_path = artifact._local_filepath with h5py.File(local_path, mode="r") as store: assert store["X"].compression == "gzip" local_path.unlink(missing_ok=True) def test_from_anndata_uses_zarr_kwargs(get_small_adata): chunks = (1, get_small_adata.n_vars) artifact = ln.Artifact.from_anndata( get_small_adata, key="test_kwargs.zarr", format="zarr", zarr_kwargs={"chunks": chunks}, ) local_path = artifact._local_filepath assert zarr.open(local_path, mode="r")["X"].chunks == chunks shutil.rmtree(local_path) def test_from_anndata_validate_suffix(get_small_adata): artifact = ln.Artifact.from_anndata(get_small_adata, key="test_.h5ad") assert artifact.suffix == ".h5ad" artifact = ln.Artifact.from_anndata( get_small_adata, format="h5ad", key="test_.h5ad" ) assert artifact.suffix == ".h5ad" artifact = ln.Artifact.from_anndata(get_small_adata, key="test_.zarr") assert artifact.suffix == ".zarr" with pytest.raises(ValueError) as error: artifact = ln.Artifact.from_anndata(get_small_adata, key="test_.def") assert ( error.exconly().partition(",")[0] == "ValueError: Error when specifying AnnData storage format" ) with pytest.raises(InvalidArgument) as error: artifact = ln.Artifact.from_anndata(get_small_adata, key="test_") assert ( error.exconly().partition(",")[0] == "lamindb.errors.InvalidArgument: The passed key's suffix '' 
must match the passed path's suffix '.h5ad'." ) def test_create_from_mudata(get_small_mdata, mudata_file, adata_file): with pytest.raises(ValueError) as error: ln.Artifact.from_mudata(adata_file, description="test1") assert "data has to be a MuData object or a path to MuData-like" in error.exconly() for m in [get_small_mdata, mudata_file]: af = ln.Artifact.from_mudata(m, description="test1") assert af.description == "test1" assert af.key is None assert af.otype == "MuData" assert af.kind == "dataset" if isinstance(m, md.MuData): assert af.n_observations == 2 def test_create_from_spatialdata( get_small_sdata, spatialdata_file, adata_file, ccaplog ): with pytest.raises(ValueError) as error: ln.Artifact.from_spatialdata(adata_file, description="test1") assert ( "data has to be a SpatialData object or a path to SpatialData-like" in error.exconly() ) for s in [get_small_sdata, spatialdata_file]: af = ln.Artifact(s, description="test1") assert af.description == "test1" assert af.key is None assert af.otype == "SpatialData" assert af.kind is None # n_observations not defined assert "data is a SpatialData, please use .from_spatialdata()" in ccaplog.text for s in [get_small_sdata, spatialdata_file]: af = ln.Artifact.from_spatialdata(s, description="test1") assert af.description == "test1" assert af.key is None assert af.otype == "SpatialData" assert af.kind == "dataset" # n_observations not defined @pytest.mark.parametrize( "data", ["get_small_adata"], indirect=True, ) def test_create_from_anndata_in_storage(data): artifact = ln.Artifact.from_anndata( data, description="test_create_from_anndata_memory" ) assert artifact.n_observations == data.n_obs assert artifact.otype == "AnnData" assert hasattr(artifact, "_local_filepath") artifact.save() # check that the local filepath has been cleared assert not hasattr(artifact, "_local_filepath") # ------------------------------------------------------------------------------------- # Life cycle management # ------------------------------------------------------------------------------------- def test_revise_recreate_artifact(example_dataframe: pd.DataFrame, ccaplog): df = example_dataframe # attempt to create a file with an invalid version with pytest.raises(ValueError) as error: artifact = ln.Artifact.from_dataframe(df, description="test", version=0) assert ( error.exconly() == "ValueError: `version` parameter must be `None` or `str`, e.g., '0.1', '1'," " '2', etc." 
) # create a file and tag it with a version key = "my-test-dataset.parquet" artifact = ln.Artifact.from_dataframe(df, key=key, description="test", version="1") assert artifact.version_tag == "1" assert artifact.version == "1" assert artifact.uid.endswith("0000") assert artifact.path.exists() # because of cache file already exists artifact.save() assert artifact.path.exists() assert artifact.suffix == ".parquet" with pytest.raises(ValueError) as error: artifact_v2 = ln.Artifact.from_dataframe(df, revises=artifact, version="1") assert ( error.exconly() == "ValueError: Please change the version tag or leave it `None`, '1' is already taken" ) # create new file from old file df.iloc[0, 0] = 99 # mutate dataframe so that hash lookup doesn't trigger artifact_v2 = ln.Artifact.from_dataframe(df, revises=artifact) assert artifact_v2.stem_uid == artifact.stem_uid assert artifact_v2.uid.endswith("0001") # call this again artifact_v2 = ln.Artifact.from_dataframe(df, revises=artifact) assert artifact_v2.uid.endswith("0001") assert artifact_v2.stem_uid == artifact.stem_uid assert artifact_v2.version_tag is None assert ( artifact_v2.version == artifact_v2.uid[-4:] ) # version falls back to uid suffix assert artifact_v2.key == key assert artifact.suffix == ".parquet" assert artifact_v2.description == "test" assert artifact_v2._revises is not None artifact_v2.save() assert artifact_v2.path.exists() assert artifact_v2._revises is None # revise by providing `revises` argument (do not save) df.iloc[0, 0] = 0 # mutate dataframe so that hash lookup doesn't trigger artifact_v3 = ln.Artifact.from_dataframe( df, description="test1", revises=artifact_v2, version="2" ) assert artifact_v3.uid.endswith("0002") assert artifact_v3.stem_uid == artifact.stem_uid assert artifact_v3.version_tag == "2" assert artifact_v3.version == "2" assert artifact_v3.description == "test1" assert artifact_v3.key == key # revise by matching on `key` (do not save) artifact_v3 = ln.Artifact.from_dataframe( df, key=key, description="test1", version="2" ) assert artifact_v3.uid.endswith("0002") assert artifact_v3.stem_uid == artifact.stem_uid assert artifact_v3.key == key assert artifact_v3.version_tag == "2" assert artifact_v3.version == "2" assert artifact_v3.description == "test1" assert artifact_v3.is_latest assert artifact_v2.is_latest artifact_v3.save() # now r2 is no longer the latest version, but need to re-fresh from db artifact_v2.refresh_from_db() assert not artifact_v2.is_latest # re-create based on hash when artifact_v3 is in trash artifact_v3.delete() artifact_new = ln.Artifact.from_dataframe( df, key="my-test-dataset1.parquet", ) assert artifact_new != artifact_v3 assert artifact_new.hash == artifact_v3.hash assert artifact_new.key == "my-test-dataset1.parquet" artifact_v3.restore() # restore from trash # re-create based on hash while providing same key, previous version df.iloc[0, 0] = 99 # this is a previous version artifact_new = ln.Artifact.from_dataframe( df, key=key, ) assert artifact_new == artifact_v2 assert artifact_new.hash == artifact_v2.hash assert artifact_new.key == key assert artifact.is_latest is False # re-create based on hash while providing a different key df.iloc[0, 0] = 0 artifact_new = ln.Artifact.from_dataframe( df, key="my-test-dataset1.parquet", description="test1 updated", ) assert artifact_new == artifact_v3 assert artifact_new.hash == artifact_v3.hash assert artifact_new.key == key # old key assert artifact_new.description == "test1 updated" # re-create while skipping hash lookup with different key 
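# skip_hash_lookup=True bypasses the hash-based duplicate lookup, so a new artifact record is created despite the identical hash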
artifact_v4 = ln.Artifact.from_dataframe( df, key="my-test-dataset1.parquet", skip_hash_lookup=True, ) assert artifact_v4.uid != artifact_v3.uid assert artifact_v4.hash == artifact_v3.hash assert artifact_v4.key == "my-test-dataset1.parquet" artifact_v4.save() # this just saves a duplicated file # re-create while skipping hash lookup with same key artifact_new = ln.Artifact.from_dataframe( df, key="my-test-dataset1.parquet", skip_hash_lookup=True, ) assert artifact_new.uid != artifact_v4.uid assert artifact_new.stem_uid == artifact_v4.stem_uid assert artifact_new.hash == artifact_v4.hash artifact_new.save() # should now violate unique constraint, falls back artifact_v4 assert artifact_new.uid == artifact_v4.uid # re-create while skipping hash lookup artifact, move to trash before artifact_v4.delete() artifact_new = ln.Artifact.from_dataframe( df, key="my-test-dataset1.parquet", skip_hash_lookup=True, ) assert artifact_new.uid != artifact_v4.uid assert artifact_new.key == "my-test-dataset1.parquet" assert "returning artifact from trash" not in ccaplog.text artifact_new.save() # should now violate unique constraint, retrieve artifact_v4 from trash assert "returning artifact from trash" in ccaplog.text assert artifact_new.uid == artifact_v4.uid assert artifact_new.branch_id == 1 # restored to default branch with pytest.raises(TypeError) as error: ln.Artifact.from_dataframe( df, description="test1a", revises=ln.Record(name="test") ) assert error.exconly() == "TypeError: `revises` has to be of type `Artifact`" artifact_v3.delete(permanent=True) artifact_v2.delete(permanent=True) artifact.delete(permanent=True) # unversioned file artifact = ln.Artifact.from_dataframe(df, description="test2") assert artifact.version_tag is None assert artifact.version == artifact.uid[-4:] # version falls back to uid suffix # what happens if we don't save the old file? # add a test for it! 
artifact.save() # create new file from old file df.iloc[0, 0] = 101 # mutate dataframe so that hash lookup doesn't trigger new_artifact = ln.Artifact.from_dataframe(df, revises=artifact) assert artifact.version_tag is None assert artifact.version == artifact.uid[-4:] # version falls back to uid suffix assert new_artifact.stem_uid == artifact.stem_uid assert new_artifact.version_tag is None assert ( new_artifact.version == new_artifact.uid[-4:] ) # version falls back to uid suffix assert new_artifact.description == artifact.description new_artifact.save() assert new_artifact.is_latest assert "you are saving to a non-latest version of the artifact" not in ccaplog.text old_artifact = ln.Artifact.get(artifact.id) # to update is_latest from the db assert not old_artifact.is_latest old_artifact.description = "change old version description" old_artifact.save() assert "you are saving to a non-latest version of the artifact" in ccaplog.text old_artifact.delete() new_artifact.delete() artifact_from_trash = ln.Artifact.get(new_artifact.uid[:-4]) # query with stem uid assert artifact_from_trash.branch_id == -1 old_artifact.delete(permanent=True) new_artifact.delete(permanent=True) # check after cleanups assert ( ccaplog.text.count("you are saving to a non-latest version of the artifact") == 1 ) def test_delete_and_restore_artifact(example_dataframe: pd.DataFrame): df = example_dataframe artifact = ln.Artifact.from_dataframe( df, description="My test file to delete" ).save() assert artifact.branch_id == 1 assert artifact.key is None or artifact._key_is_virtual storage_path = artifact.path # trash behavior artifact.delete() assert storage_path.exists() assert artifact.branch_id == -1 assert ln.Artifact.filter(description="My test file to delete").first() is None assert ln.Artifact.filter( description="My test file to delete", branch__name="trash" ).first() # no implicit restore from trash, we're making a new artifact artifact_restored = ln.Artifact.from_dataframe( df, description="My test file to delete" ) assert artifact_restored.branch_id == 1 assert artifact_restored != artifact # permanent delete artifact.delete(permanent=True) assert ( ln.Artifact.filter(description="My test file to delete", branch_id=None).first() is None ) assert not storage_path.exists() # deletes from storage is key_is_virtual def test_delete_storage(): with pytest.raises(FileNotFoundError): delete_storage(ln.settings.storage.root / "test-delete-storage") def test_recreate_after_artifact_moved_in_storage(ccaplog): # this needs to be in a registered storage location Path("./default_storage_unit_core/test_file.txt").write_text("test content") artifact = ln.Artifact("./default_storage_unit_core/test_file.txt").save() # now rename the file within the storage location Path("./default_storage_unit_core/test_file.txt").rename( "./default_storage_unit_core/moved_file.txt" ) ln.Artifact("./default_storage_unit_core/moved_file.txt").save() assert "updating previous key" in ccaplog.text artifact.delete(permanent=True, storage=True) # ------------------------------------------------------------------------------------- # Storage # ------------------------------------------------------------------------------------- def test_move_artifact_exception_handling(): import lamindb.models.artifact as artifact_module class FakeFS: def __init__( self, copy_error: Exception | None = None, exists: bool = False, rm_error: Exception | None = None, ): self.copy_error = copy_error self._exists = exists self.rm_error = rm_error self.rm_calls = 0 def 
exists(self, path: str) -> bool: return self._exists def copy(self, source: str, target: str, recursive: bool = True): if self.copy_error is not None: raise self.copy_error def rm(self, path: str, recursive: bool = True): self.rm_calls += 1 if self.rm_error is not None: raise self.rm_error source_path = UPath("s3://lamindb-ci/source-artifact") storage = SimpleNamespace(path=UPath("s3://lamindb-ci"), id=42) # _rm_catch_error helper branches fs_missing = FakeFS(exists=False) assert ( artifact_module._rm_catch_error(fs_missing, "s3://lamindb-ci/missing") is None ) assert fs_missing.rm_calls == 0 fs_ok = FakeFS(exists=True) assert artifact_module._rm_catch_error(fs_ok, "s3://lamindb-ci/target") is None assert fs_ok.rm_calls == 1 rm_error = RuntimeError("rm failed") fs_fail = FakeFS(exists=True, rm_error=rm_error) returned_error = artifact_module._rm_catch_error(fs_fail, "s3://lamindb-ci/target") assert returned_error is rm_error assert fs_fail.rm_calls == 1 # copy branch: copy fails and cleanup helper is included in the message artifact_copy = SimpleNamespace(path=source_path, storage_id=None) with ( patch.object( artifact_module, "_s", return_value=SimpleNamespace( auto_storage_key_from_artifact=lambda _: "target-artifact" ), ), patch.object( artifact_module, "fs_for_moving", return_value=FakeFS(copy_error=ValueError("copy failed")), ), patch.object( artifact_module, "_rm_catch_error", return_value=RuntimeError("rm failed"), ) as rm_mock, ): with pytest.raises(RuntimeError, match="Failed to copy artifact"): artifact_module._move_artifact_to_storage(artifact_copy, storage) assert rm_mock.call_count == 1 # target exists branch: raises before attempting copy artifact_exists = SimpleNamespace(path=source_path, storage_id=None) with ( patch.object( artifact_module, "_s", return_value=SimpleNamespace( auto_storage_key_from_artifact=lambda _: "target-artifact" ), ), patch.object( artifact_module, "fs_for_moving", return_value=FakeFS(exists=True) ), ): with pytest.raises(FileExistsError, match="already exists"): artifact_module._move_artifact_to_storage(artifact_exists, storage) # same source and target path is rejected early artifact_same_path = SimpleNamespace(path=source_path, storage_id=None) with patch.object( artifact_module, "_s", return_value=SimpleNamespace( auto_storage_key_from_artifact=lambda _: "source-artifact" ), ): with pytest.raises(ValueError, match="Cannot move to the same path"): artifact_module._move_artifact_to_storage(artifact_same_path, storage) # verification branch: sorted sizes mismatch triggers cleanup helper artifact_mismatch = SimpleNamespace(path=source_path, storage_id=None) with ( patch.object( artifact_module, "_s", return_value=SimpleNamespace( auto_storage_key_from_artifact=lambda _: "target-artifact" ), ), patch.object(artifact_module, "fs_for_moving", return_value=FakeFS()), patch.object(artifact_module, "_sorted_sizes", side_effect=[[1], [2]]), patch.object( artifact_module, "_rm_catch_error", return_value=RuntimeError("rm failed"), ) as rm_mock, ): with pytest.raises(RuntimeError, match="Move verification failed"): artifact_module._move_artifact_to_storage(artifact_mismatch, storage) assert rm_mock.call_count == 1 # source-removal branch: move succeeds but rm(source) fails and is logged artifact_rm_fail = SimpleNamespace(path=source_path, storage_id=None) with ( patch.object( artifact_module, "_s", return_value=SimpleNamespace( auto_storage_key_from_artifact=lambda _: "target-artifact" ), ), patch.object( artifact_module, "fs_for_moving", 
return_value=FakeFS(rm_error=RuntimeError()), ), patch.object(artifact_module, "_sorted_sizes", side_effect=[[1], [1]]), patch.object(artifact_module.logger, "error") as logger_error_mock, ): artifact_module._move_artifact_to_storage(artifact_rm_fail, storage) assert artifact_rm_fail.storage_id == storage.id assert logger_error_mock.call_count == 1 @pytest.mark.parametrize("suffix", [".txt", "", None]) def test_auto_storage_key_from_artifact_uid(suffix): test_id = "abo389f" if suffix is None: with pytest.raises(AssertionError): auto_storage_key_from_artifact_uid(test_id, suffix, False) else: assert AUTO_KEY_PREFIX == ".lamindb/" storage_key = auto_storage_key_from_artifact_uid(test_id, suffix, False) assert storage_key == f"{AUTO_KEY_PREFIX}{test_id}{suffix}" def test_storage_root_upath_equivalence(): storage_root = UPath("s3://lamindb-ci") filepath = UPath("s3://lamindb-ci/test-data/Species.csv") assert filepath.parents[-1] == storage_root def test_get_relative_path_to_directory(): # upath on S3 upath_root = UPath("s3://lamindb-ci") upath_directory1 = UPath("s3://lamindb-ci/test-data") # no trailing slash upath_directory2 = UPath("s3://lamindb-ci/test-data/") # trailing slash upath_file = UPath("s3://lamindb-ci/test-data/test.csv") assert ( "test-data/test.csv" == get_relative_path_to_directory(upath_file, upath_root).as_posix() ) assert ( "test.csv" == get_relative_path_to_directory(upath_file, upath_directory1).as_posix() ) assert ( "test.csv" == get_relative_path_to_directory(upath_file, upath_directory2).as_posix() ) # local path root = Path("/lamindb-ci") upath = Path("/lamindb-ci/test-data/test.csv") assert ( "test-data/test.csv" == get_relative_path_to_directory(upath, directory=root).as_posix() ) local_upath_root = UPath(root.as_posix()) local_upath_file = UPath(upath.as_posix()) assert ( "test-data/test.csv" == get_relative_path_to_directory( local_upath_file, directory=local_upath_root ).as_posix() ) with pytest.raises(TypeError) as error: get_relative_path_to_directory(upath, directory=".") assert error.exconly() == "TypeError: Directory not of type Path or UPath" def test_check_path_is_child_of_root(): # str root = "s3://lamindb-ci" upath = "s3://lamindb-ci/test-data/test.csv" assert check_path_is_child_of_root(upath, root=root) # str different protocols root = "prot1://lamindb-ci" upath = "prot2://lamindb-ci/test-data/test.csv" assert not check_path_is_child_of_root(upath, root=root) # UPath root = UPath("s3://lamindb-ci") upath = UPath("s3://lamindb-ci/test-data/test.csv") assert check_path_is_child_of_root(upath, root=root) upath2 = UPath("s3://lamindb-setup/test-data/test.csv") assert not check_path_is_child_of_root(upath2, root=root) # local path root = Path("/lamindb-ci") path = Path("/lamindb-ci/test-data/test.csv") assert check_path_is_child_of_root(path, root=root) path = Path("/lamindb-other/test-data/test.csv") assert not check_path_is_child_of_root(path, root=root) # Local & UPath root = UPath("s3://lamindb-ci") path = Path("/lamindb-ci/test-data/test.csv") assert not check_path_is_child_of_root(path, root=root) # different storage_options upath = UPath("s3://lamindb-ci/test-data/test.csv", cache_regions=True) assert upath.storage_options != root.storage_options assert check_path_is_child_of_root(upath, root=root) # the second level root = UPath("s3://lamindb-ci/test-data/") upath = UPath("s3://lamindb-ci/test-data/test/test.csv") assert check_path_is_child_of_root(upath, root=root) upath2 = UPath("s3://lamindb-ci/test-data-1/test/test.csv") assert not 
check_path_is_child_of_root(upath2, root=root) # http assert check_path_is_child_of_root( "https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/README.md", root="https://raw.githubusercontent.com", ) # s3 with endpoint assert not check_path_is_child_of_root( "s3://bucket/key?endpoint_url=http://localhost:8000", root="s3://bucket/", ) assert not check_path_is_child_of_root( "s3://bucket/key/", root="s3://bucket/?endpoint_url=http://localhost:8000", ) assert check_path_is_child_of_root( "s3://bucket/key?endpoint_url=http://localhost:8000", root="s3://bucket?endpoint_url=http://localhost:8000", ) assert check_path_is_child_of_root( UPath("s3://bucket/key", endpoint_url="http://localhost:8000"), root="s3://bucket?endpoint_url=http://localhost:8000", ) def test_serialize_paths(): fp_str = ln.examples.datasets.anndata_file_pbmc68k_test().as_posix() fp_path = Path(fp_str) up_str = "s3://lamindb-ci/test-unknown-storage-in-core-tests/test.csv" up_upath = UPath(up_str) storage = ln.settings.storage.record using_key = None _, filepath, _, _, _ = process_data( "id", fp_str, None, None, storage, using_key, skip_existence_check=True ) assert isinstance(filepath, LocalPathClasses) _, filepath, _, _, _ = process_data( "id", fp_path, None, None, storage, using_key, skip_existence_check=True ) assert isinstance(filepath, LocalPathClasses) with pytest.raises(ln.errors.UnknownStorageLocation) as err: _, filepath, _, _, _ = process_data( "id", up_str, None, None, storage, using_key, skip_existence_check=True, ) assert f"Path {up_str} is not contained in any known storage" in err.exconly() storage = ln.Storage( root="s3://lamindb-ci/test-unknown-storage-in-core-tests" ).save() _, filepath, _, _, _ = process_data( "id", up_str, None, None, storage, using_key, skip_existence_check=True ) assert isinstance(filepath, CloudPath) _, filepath, _, _, _ = process_data( "id", up_upath, None, None, storage, using_key, skip_existence_check=True, ) assert isinstance(filepath, CloudPath) storage.delete() Path("pbmc68k_test.h5ad").unlink(missing_ok=True) # ------------------------------------------------------------------------------------- # Data structures in storage # ------------------------------------------------------------------------------------- def test_data_is_anndata_paths(): assert data_is_scversedatastructure("something.h5ad", "AnnData") assert data_is_scversedatastructure("something.anndata.zarr", "AnnData") assert data_is_scversedatastructure( "s3://somewhere/something.anndata.zarr", "AnnData" ) assert not data_is_scversedatastructure("s3://somewhere/something.zarr", "AnnData") def test_data_is_anndata_anndatacessor(get_small_adata): artifact = ln.Artifact(get_small_adata, key="test_adata.h5ad").save() with artifact.open(mode="r") as access: assert data_is_scversedatastructure(access, "AnnData") artifact.delete(permanent=True) def test_data_is_mudata_paths(): assert data_is_scversedatastructure("something.h5mu", "MuData") assert data_is_scversedatastructure("something.mudata.zarr", "MuData") def test_data_is_spatialdata_paths(): assert data_is_scversedatastructure("something.spatialdata.zarr", "SpatialData") @pytest.mark.parametrize( "data,data_type,expected", [ ("get_small_adata", "AnnData", True), ("get_small_mdata", "MuData", True), ("get_small_sdata", "SpatialData", True), ("get_small_adata", "MuData", False), ("get_small_mdata", "AnnData", False), ("get_small_sdata", "AnnData", False), ("get_small_adata", None, True), (pd.DataFrame(), "AnnData", False), (None, "AnnData", False), (None, 
None, False), ], ) def test_data_is_scversedatastructure(request, data, data_type, expected): if isinstance(data, str) and data.startswith("get_small_"): data = request.getfixturevalue(data) assert data_is_scversedatastructure(data, data_type) == expected # ------------------------------------------------------------------------------------- # Miscellaneous # ------------------------------------------------------------------------------------- def test_load_to_memory(tsv_file, zip_file, fcs_file, yaml_file): # tsv df = load_tsv(tsv_file) assert isinstance(df, pd.DataFrame) # fcs adata = load_fcs(str(fcs_file)) assert isinstance(adata, ad.AnnData) # error with pytest.raises(NotImplementedError): load_to_memory(zip_file) # check that it is a path assert isinstance(load_to_memory("./somefile.rds"), UPath) # yaml dct = load_to_memory(yaml_file) assert dct["a"] == 1 assert dct["b"] == 2 with pytest.raises(TypeError) as error: ln.Artifact(True) assert error.exconly() == "TypeError: data has to be a string, Path, UPath" def test_bulk_delete(): report_path = Path("report.html") report_path.write_text("a") environment_path = Path("environment.txt") environment_path.write_text("c") report = ln.Artifact(report_path, description="Report").save() report_path.unlink() report_path = report.path environment = ln.Artifact(environment_path, description="requirement.txt").save() environment_path.unlink() environment_path = environment.path ln.Artifact.filter(id__in=[environment.id, report.id]).delete() assert len(ln.Artifact.filter(id__in=[environment.id, report.id], branch_id=1)) == 0 # the 2 artifacts are in trash now assert ( len( ln.Artifact.filter( id__in=[environment.id, report.id], branch_id=-1, ) ) == 2 ) ln.Artifact.filter(id__in=[environment.id, report.id], branch_id=-1).delete( permanent=True ) # now they're gone assert ( len( ln.Artifact.filter( id__in=[environment.id, report.id], branch_id=None, ) ) == 0 ) assert not report_path.exists() assert not environment_path.exists() @pytest.mark.parametrize("module_name", ["mudata", "spatialdata"]) def test_no_unnecessary_imports( example_dataframe: pd.DataFrame, module_name: str ) -> None: if module_name in sys.modules: del sys.modules[module_name] af = ln.Artifact.from_dataframe(example_dataframe, description="to delete").save() loaded_packages = [] for name, module in sys.modules.items(): if isinstance(module, ModuleType) and not name.startswith("_"): if "." 
not in name: loaded_packages.append(name) assert module_name not in sorted(loaded_packages) # Cleanup and restore imports to ensure that other tests still run smoothly af.delete(permanent=True) import mudata # noqa import spatialdata # noqa def test_artifact_get_tracking(example_dataframe: pd.DataFrame): artifact = ln.Artifact.from_dataframe(example_dataframe, key="df.parquet").save() transform = ln.Transform(key="test track artifact via get").save() run = ln.Run(transform).save() assert ( ln.Artifact.get(key="df.parquet", is_run_input=run) in run.input_artifacts.all() ) artifact.delete(permanent=True) transform.delete(permanent=True) def test_get_by_path(example_dataframe: pd.DataFrame): artifact = ln.Artifact.from_dataframe(example_dataframe, key="df.parquet").save() artifact_path = artifact.path assert ln.Artifact.get(path=artifact_path) == artifact assert ln.Artifact.filter().get(path=artifact_path.as_posix()) == artifact with pytest.raises(ln.errors.ObjectDoesNotExist): ln.Artifact.get(path="s3://bucket/folder/file.parquet") with pytest.raises(ValueError): ln.User.get(path="some/path") artifact.delete(permanent=True) path_str = "s3://lamindb-ci/test-data/test.csv" storage = ln.Storage(ln.UPath(path_str).parent).save() artifact = ln.Artifact(path_str, description="test get by path").save() assert not artifact._key_is_virtual assert artifact._real_key is None assert ln.Artifact.get(path=path_str) == artifact artifact.delete(permanent=True, storage=False) artifact = ln.Artifact(path_str, key="some_file.csv").save() assert artifact._key_is_virtual assert artifact._real_key.endswith("test.csv") assert ln.Artifact.get(path=path_str) == artifact artifact.delete(permanent=True, storage=False) storage.delete() def test_update_suffix_for_registered_storage_with_real_key( registered_storage_file_and_folder, ): test_filepath, folder_path = registered_storage_file_and_folder assert folder_path.exists() and folder_path.is_dir() artifact = ln.Artifact(test_filepath, key="my_file.csv").save() assert artifact._real_key is not None assert artifact.path.suffix == ".csv" source_path = artifact.path artifact.suffix = ".tsv" with patch("builtins.input", return_value="y"): artifact.save() target_path = artifact.path assert artifact.suffix == ".tsv" assert artifact.key is not None assert artifact.key.endswith(".tsv") assert artifact._real_key is not None assert artifact._real_key.endswith(".tsv") assert target_path.suffix == ".tsv" assert target_path.exists() assert not source_path.exists() artifact.delete(permanent=True, storage=False) def test_update_suffix_for_registered_storage_folder_artifact( registered_storage_file_and_folder, ): _, folder_path = registered_storage_file_and_folder artifact = ln.Artifact(folder_path, key="dataset").save() assert artifact._real_key is not None assert artifact.suffix == "" assert artifact.path.exists() assert artifact.path.is_dir() source_path = artifact.path artifact.suffix = ".zarr" with patch("builtins.input", return_value="y"): artifact.save() target_path = artifact.path assert artifact.suffix == ".zarr" assert artifact.key is not None assert artifact.key.endswith(".zarr") assert artifact._real_key is not None assert artifact._real_key.endswith(".zarr") assert target_path.exists() assert target_path.is_dir() assert target_path.suffix == ".zarr" assert not source_path.exists() artifact.delete(permanent=True, storage=False) def test_update_non_virtual_key_for_registered_storage_file( registered_storage_file_and_folder, ): test_filepath, _ = 
registered_storage_file_and_folder artifact = ln.Artifact(test_filepath).save() assert not artifact._key_is_virtual assert artifact._real_key is None assert artifact.key is not None source_path = artifact.path source_key = artifact.key target_key = ( PurePosixPath(source_key) .with_name("suffix_fixture_file_renamed.csv") .as_posix() ) artifact.key = target_key with patch("builtins.input", return_value="n"): assert artifact.save() is None assert source_path.exists() artifact = ln.Artifact.get(uid=artifact.uid) assert artifact.key == source_key artifact.key = target_key with patch("builtins.input", return_value="y"): artifact.save() target_path = artifact.path assert artifact.key == target_key assert target_path.exists() assert not source_path.exists() artifact.delete(permanent=True, storage=False) def test_update_non_virtual_key_for_registered_storage_file_invalid_suffix( registered_storage_file_and_folder, ): test_filepath, _ = registered_storage_file_and_folder artifact = ln.Artifact(test_filepath).save() assert artifact.key is not None artifact.key = PurePosixPath(artifact.key).with_suffix(".tsv").as_posix() with pytest.raises(InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: The suffix '.tsv' of the provided key is incorrect, it should be '.csv'." ) artifact.delete(permanent=True, storage=False) def test_update_key_to_none_raises_invalid_argument( registered_storage_file_and_folder, ): test_filepath, _ = registered_storage_file_and_folder artifact = ln.Artifact(test_filepath).save() artifact.key = None with pytest.raises(InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: Cannot update an artifact key to None." ) artifact.delete(permanent=True, storage=False) def test_update_non_virtual_key_before_save_raises_invalid_argument(tsv_file): artifact = ln.Artifact(tsv_file, key="before-save.tsv", _key_is_virtual=False) artifact.key = "after-edit.tsv" with pytest.raises(InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: Cannot update the key of an artifact before it is saved." ) def test_update_non_virtual_key_in_unmanaged_storage_raises_invalid_argument(): url = ( "https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/README.md" ) artifact = ln.Artifact(url, description="test unmanaged key update").save() assert not artifact._key_is_virtual artifact.key = "laminlabs/lamindb/refs/heads/main/README-renamed.md" with pytest.raises(InvalidArgument) as error: artifact.save() assert ( error.exconly() == "lamindb.errors.InvalidArgument: Cannot update a non-virtual key of an artifact in a storage location that is not managed by the current instance." ) artifact.delete(permanent=True, storage=False) def test_create_artifact_in_foreign_managed_storage_raises_value_error(tsv_file): storage = ln.settings.storage.record with ( patch.object(storage, "instance_uid", "_not_exists_"), pytest.raises( ValueError, match=( "Cannot create an artifact in a storage location that is not managed by the current instance." 
), ), ): ln.Artifact(tsv_file, storage=storage) def test_save_url_with_virtual_key_and_unmanaged_suffix_update_error(): url = ( "https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/README.md" ) key = "folder/file.md" artifact = ln.Artifact(url, key=key).save() assert artifact._real_key == "laminlabs/lamindb/refs/heads/main/README.md" assert artifact.storage.instance_uid is None cache_path_str = artifact._cache_path.as_posix() assert not cache_path_str.startswith("http") assert cache_path_str.endswith(key) artifact.suffix = ".txt" with pytest.raises( InvalidArgument, match=( "Cannot update the suffix of an artifact in a storage location " "that is not managed by the current instance." ), ): artifact.save() artifact.delete(permanent=True, storage=False) def test_change_space_for_artifact_in_foreign_managed_storage_raises_value_error( tsv_file, ): artifact = ln.Artifact(tsv_file, key="space-change-foreign-storage.tsv").save() space = ln.Space( name="test space change in foreign storage", uid="foreignspace" ).save() artifact.space = space with ( patch.object(artifact.storage, "instance_uid", "_not_exists_"), pytest.raises( ValueError, match=( "Cannot change the space of an artifact in a storage location that is not managed by the current instance." ), ), ): artifact.save() artifact.delete(permanent=True) space.delete(permanent=True) def test_save_artifact_to_foreign_managed_storage_raises_value_error(tsv_file): artifact = ln.Artifact(tsv_file, key="save-foreign-storage.tsv") with ( patch.object(artifact.storage, "instance_uid", "_not_exists_"), pytest.raises( ValueError, match=( "Cannot save an artifact to a storage location that is not managed by the current instance." ), ), ): artifact.save() def test_artifact_space_change(tsv_file): artifact = ln.Artifact(tsv_file, key="test_space_change.tsv").save() space = ln.Space(name="test space change", uid="00000234").save() # test after saving artifact.space = space with pytest.raises(ValueError) as err: artifact.save() assert ( "No local storage locations managed by the current instance found for the space" in err.exconly() ) # test after getting from the db artifact = ln.Artifact.get(key="test_space_change.tsv") artifact.space = space with pytest.raises(ValueError) as err: artifact.save() assert ( "No local storage locations managed by the current instance found for the space" in err.exconly() ) artifact.delete(permanent=True) space.delete(permanent=True) def test_passing_foreign_keys_ids(tsv_file): transform = ln.Transform(key="test passings foreign keys ids").save() first_run = ln.Run(transform).save() second_run = ln.Run(transform).save() # check that passing a wrong type errors with pytest.raises(AssertionError): ln.Artifact(tsv_file, space=transform) with pytest.raises(ValueError) as err: ln.Artifact(tsv_file, run=first_run, run_id=first_run.id) assert "Do not pass both Run and its id at the same time." 


def test_passing_foreign_keys_ids(tsv_file):
    transform = ln.Transform(key="test passings foreign keys ids").save()
    first_run = ln.Run(transform).save()
    second_run = ln.Run(transform).save()
    # check that passing a wrong type errors
    with pytest.raises(AssertionError):
        ln.Artifact(tsv_file, space=transform)
    with pytest.raises(ValueError) as err:
        ln.Artifact(tsv_file, run=first_run, run_id=first_run.id)
    assert "Do not pass both Run and its id at the same time." in err.exconly()
    artifact = ln.Artifact(tsv_file, run=first_run, key="test_fk.tsv").save()
    artifact_id = artifact.id
    assert artifact.run == first_run
    artifact = ln.Artifact(tsv_file, run_id=second_run.id)  # same hash
    assert artifact.id == artifact_id
    assert artifact._subsequent_run_id == second_run.id
    assert second_run in artifact.recreating_runs.all()
    # Run-side: output_artifacts vs recreated_artifacts
    assert list(first_run.output_artifacts.all()) == [artifact]
    assert list(first_run.recreated_artifacts.all()) == []
    assert list(second_run.output_artifacts.all()) == []
    assert list(second_run.recreated_artifacts.all()) == [artifact]
    # query_output_artifacts
    assert list(first_run.query_output_artifacts(include_recreated=False)) == [artifact]
    assert list(first_run.query_output_artifacts(include_recreated=True)) == [artifact]
    assert list(second_run.query_output_artifacts(include_recreated=False)) == []
    assert list(second_run.query_output_artifacts(include_recreated=True)) == [artifact]
    artifact.delete(permanent=True)
    second_run.delete(permanent=True)
    first_run.delete(permanent=True)
    transform.delete(permanent=True)
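

# test_passing_foreign_keys_ids above fixes the provenance semantics when the same
# content is saved again under a different run: `artifact.run` keeps pointing at the
# original creating run, the later run is linked via `artifact.recreating_runs` /
# `run.recreated_artifacts`, and `run.query_output_artifacts(include_recreated=True)`
# folds both together. A minimal sketch (not a test; the transform key and the `path`
# argument are hypothetical):
def _sketch_recreating_run_provenance(path):
    transform = ln.Transform(key="my-pipeline").save()
    run1 = ln.Run(transform).save()
    run2 = ln.Run(transform).save()
    artifact = ln.Artifact(path, run=run1, key="output.tsv").save()
    ln.Artifact(path, run=run2, key="output.tsv").save()  # same content, later run
    assert artifact.run == run1
    assert run2 in artifact.recreating_runs.all()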


================================================
FILE: tests/core/test_artifact_dataframe_with_curation.py
================================================
# Note: Almost all logic for schema-based validation is handled in the curators test suite
# This here only covers external feature annotation and validation
import lamindb as ln
import pandas as pd
import pytest


@pytest.fixture(scope="module")
def two_internal_features():
    feat1 = ln.Feature(name="feat1", dtype=int).save()
    feat2 = ln.Feature(name="feat2", dtype=int).save()
    yield feat1, feat2
    feat1.delete(permanent=True)
    feat2.delete(permanent=True)


@pytest.fixture(scope="module")
def two_external_features():
    feature_a = ln.Feature(name="feature_a", dtype=str).save()
    feature_b = ln.Feature(name="feature_b", dtype=str).save()
    yield feature_a, feature_b
    feature_a.delete(permanent=True)
    feature_b.delete(permanent=True)


@pytest.mark.parametrize("use_schema", [True, False])
def test_create_artifact_with_external_feature_annotations(
    use_schema: bool,
    two_external_features: tuple[ln.Feature, ln.Feature],
):
    feat1, feat2 = two_external_features
    if use_schema:
        schema = ln.Schema(features=[feat1, feat2]).save()
    else:
        schema = None
    artifact = ln.Artifact(
        ".gitignore",
        key="test_file",
        features={"feature_a": "x", "feature_b": "y"},
        schema=schema,
    ).save()
    assert artifact.features.get_values() == {"feature_a": "x", "feature_b": "y"}
    assert artifact.schema == schema
    # repeat to check idempotency (requires set_values() instead of add_values())
    artifact = ln.Artifact(
        ".gitignore",
        key="test_file",
        features={"feature_a": "x", "feature_b": "y"},
        schema=schema,
    ).save()
    assert artifact.features.get_values() == {"feature_a": "x", "feature_b": "y"}
    assert artifact.schema == schema
    if use_schema:
        with pytest.raises(ValueError) as error:
            artifact.features.remove_values("feature_a", value="x")
        assert (
            "Cannot remove values if artifact has external schema."
            in error.exconly()
        )
    else:
        artifact.features.remove_values("feature_a", value="x")
        assert artifact.features.get_values() == {"feature_b": "y"}
    artifact.delete(permanent=True)
    if use_schema:
        schema.delete(permanent=True)


def test_artifact_from_dataframe_with_schema(example_dataframe: pd.DataFrame):
    df = example_dataframe
    feat1 = ln.Feature(name="feat1", dtype=int).save()
    artifact = ln.Artifact.from_dataframe(
        df, key="test_df.parquet", schema="valid_features"
    ).save()
    # repeat to check idempotency
    artifact = ln.Artifact.from_dataframe(
        df, key="test_df.parquet", schema="valid_features"
    ).save()
    assert artifact.schema == ln.examples.schemas.valid_features()
    assert artifact.features.get_values() == {}
    assert (
        artifact.features.describe(return_str=True)
        == """\
Artifact: test_df.parquet (0000)
└── Dataset features
    └── columns (1)
            feat1    int"""
    )
    inferred_schema_link = artifact.schemas.through.get(artifact_id=artifact.id)
    assert inferred_schema_link.slot == "columns"
    assert inferred_schema_link.schema.members.count() == 1
    assert inferred_schema_link.schema.members.first() == feat1
    inferred_schema = inferred_schema_link.schema
    inferred_schema_link.delete()
    inferred_schema.delete(permanent=True)
    feat1.delete(permanent=True)
    artifact.delete(permanent=True)


def test_artifact_dataframe_with_features(example_dataframe: pd.DataFrame):
    """Test column names encoding when features with the same names are present."""
    artifact = ln.Artifact.from_dataframe(example_dataframe, key="df.parquet").save()
    id_feature = ln.Feature(name="id", dtype=int).save()
    uid_feature = ln.Feature(name="uid", dtype=str).save()
    artifact.features.add_values({"id": 1, "uid": "test-uid"})
    df = ln.Artifact.filter(key="df.parquet").to_dataframe(
        include=["description"], features=True
    )
    assert df.index.name == "__lamindb_artifact_id__"
    assert df.columns.tolist() == [
        "__lamindb_artifact_uid__",
        "key",
        "id",
        "uid",
        "description",
    ]
    assert df.iloc[0]["id"] == 1
    assert df.iloc[0]["uid"] == "test-uid"
    artifact.delete(permanent=True)
    id_feature.delete(permanent=True)
    uid_feature.delete(permanent=True)
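

# The next test exercises the `__external__` slot convention: a DataFrame schema can
# hold, next to its column features, a nested schema of "external" features that are
# not columns of the dataframe but are passed as artifact-level values via
# `features={...}`. A minimal construction sketch (not a test; the feature names are
# hypothetical):
def _sketch_schema_with_external_slot():
    column_a = ln.Feature(name="column_a", dtype=int).save()
    external_x = ln.Feature(name="external_x", dtype=str).save()
    external = ln.Schema(features=[external_x]).save()
    return ln.Schema(
        features=[column_a],
        slots={"__external__": external},
        otype="DataFrame",
    ).save()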


def test_from_dataframe_with_external_schema(
    example_dataframe: pd.DataFrame,
    two_external_features: tuple[ln.Feature, ln.Feature],
    two_internal_features: tuple[ln.Feature, ln.Feature],
):
    df = example_dataframe
    feat1, feat2 = two_internal_features
    featA, featB = two_external_features
    schema_external = ln.Schema(features=[featA, featB]).save()
    # Case 1: wrong internal features for this dataframe
    schema_with_mistake = ln.Schema(
        features=[featA, featB],
        slots={"__external__": schema_external},
        otype="DataFrame",
    ).save()
    with pytest.raises(ln.errors.ValidationError) as error:
        artifact = ln.Artifact.from_dataframe(
            df,
            key="test_df_with_external_features.parquet",
            features={"feature_a": "x", "feature_b": "y"},
            schema=schema_with_mistake,
        ).save()
    assert "COLUMN_NOT_IN_DATAFRAME" in error.exconly()
    # alternative via DataFrameCurator directly
    with pytest.raises(ln.errors.ValidationError) as error:
        ln.curators.DataFrameCurator(
            df,
            schema=schema_with_mistake,
        ).validate()
    assert "COLUMN_NOT_IN_DATAFRAME" in error.exconly()
    # Case 2: no schema for external features provided
    schema_no_external = ln.Schema(features=[feat1, feat2]).save()
    artifact = ln.Artifact.from_dataframe(
        df,
        key="test_df_with_external_features.parquet",
        features={"feature_a": "x", "feature_b": "y"},
        schema=schema_no_external,
    ).save()
    assert artifact.features.get_values() == {"feature_a": "x", "feature_b": "y"}
    artifact.delete(permanent=True)
    # alternative via DataFrameCurator directly
    curator = ln.curators.DataFrameCurator(
        df,
        schema=schema_no_external,
        features={"feature_a": "x", "feature_b": "y"},
    )
    artifact = curator.save_artifact(
        key="test_df_with_external_features.parquet",
    ).save()
    assert artifact.features.get_values() == {"feature_a": "x", "feature_b": "y"}
    artifact.delete(permanent=True)
    # Case 3: correct external schema
    schema_correct_external = ln.Schema(
        features=[feat1, feat2],
        slots={"__external__": schema_external},
        otype="DataFrame",
    ).save()
    # Case 3a: user passes no external features
    with pytest.raises(ln.errors.ValidationError) as error:
        artifact = ln.Artifact.from_dataframe(
            df,
            key="test_df_with_external_features.parquet",
            schema=schema_correct_external,
        ).save()
    assert (
        "External features slot is defined in schema but no external features were provided."
        in error.exconly()
    )
    # alternative via DataFrameCurator directly
    with pytest.raises(ln.errors.ValidationError) as error:
        curator = ln.curators.DataFrameCurator(
            df,
            schema=schema_correct_external,
        )
        artifact = curator.save_artifact(
            key="test_df_with_external_features.parquet",
        ).save()
    assert (
        "External features slot is defined in schema but no external features were provided."
        in error.exconly()
    )
    # Case 3b: user provides external features
    artifact = ln.Artifact.from_dataframe(
        df,
        key="test_df_with_external_features.parquet",
        features={"feature_a": "x", "feature_b": "y"},
        schema=schema_correct_external,
    ).save()
    assert artifact.features.get_values() == {"feature_a": "x", "feature_b": "y"}
    assert (
        artifact.features.describe(return_str=True)
        == """\
Artifact: test_df_with_external_features.parquet (0000)
├── Dataset features
│   └── columns (2)
│           feat1    int
│           feat2    int
└── External features
    └── feature_a    str    x
        feature_b    str    y"""
    )
    with pytest.raises(ValueError) as error:
        artifact.features.remove_values("feature_a", value="x")
    assert "Cannot remove values if artifact has external schema." in error.exconly()
    artifact.delete(permanent=True)
    # alternative via DataFrameCurator directly
    curator = ln.curators.DataFrameCurator(
        df,
        schema=schema_correct_external,
        features={"feature_a": "x", "feature_b": "y"},
    )
    artifact = curator.save_artifact(
        key="test_df_with_external_features.parquet",
    ).save()
    assert artifact.features.get_values() == {"feature_a": "x", "feature_b": "y"}
    # call this again to check calling with an existing artifact
    curator = ln.curators.DataFrameCurator(
        artifact,
        schema=schema_correct_external,
        features={"feature_a": "z", "feature_b": "y"},
    )
    artifact = curator.save_artifact(
        key="test_df_with_external_features.parquet",
    ).save()
    assert artifact.features.get_values() == {"feature_a": "z", "feature_b": "y"}
    # call this again without passing features explicitly (they're already part of the artifact)
    curator = ln.curators.DataFrameCurator(
        artifact,
        schema=schema_correct_external,
    )
    artifact = curator.save_artifact(
        key="test_df_with_external_features.parquet",
    ).save()
    assert artifact.features.get_values() == {"feature_a": "z", "feature_b": "y"}
    # clean up everything
    inferred_schema = artifact.schemas.all()[0]
    artifact.schemas.remove(inferred_schema.id)
    inferred_schema.delete(permanent=True)
    artifact.delete(permanent=True)
    schema_with_mistake.delete(permanent=True)
    schema_no_external.delete(permanent=True)
    schema_correct_external.delete(permanent=True)
    schema_external.delete(permanent=True)


================================================
FILE: tests/core/test_artifact_describe_to_dataframe.py
================================================
from datetime import date

import bionty as bt
import lamindb as ln
import numpy as np
import pandas as pd
import pytest
from lamindb.models._describe import describe_postgres, describe_sqlite


def _check_df_equality(actual_df: pd.DataFrame, expected_df: pd.DataFrame) -> bool:
    """Checks equality between two DataFrames.

    Special handling for columns containing sets and NaN values.
    """
    # do not test indices by default
    # pd.testing.assert_index_equal(actual_df.index, expected_df.index)
    expected_df.index = actual_df.index
    assert set(actual_df.columns) == set(expected_df.columns)
    for col in expected_df.columns:
        # Detect if column contains sets by checking first non-null value
        first_value = next((v for v in expected_df[col] if pd.notna(v)), None)
        is_set_column = isinstance(first_value, set)
        if is_set_column:
            # For set columns, compare sets with NaN handling
            for idx in expected_df.index:
                actual_val = actual_df.loc[idx, col]
                expected_val = expected_df.loc[idx, col]
                # If both are NaN, they're equal
                if pd.isna(actual_val) and pd.isna(expected_val):
                    continue
                # If one is NaN and the other isn't, they're not equal
                if pd.isna(actual_val) != pd.isna(expected_val):
                    raise AssertionError(f"NaN mismatch at index {idx} in column {col}")
                # If neither is NaN, compare the sets
                assert actual_val == expected_val, (
                    f"Set mismatch at index {idx} in column {col}"
                )
        else:
            pd.testing.assert_series_equal(
                actual_df[col],
                expected_df[col],
                check_names=False,  # ignore series names
            )
    return True


# parallels the `registries` guide
# please also see the test_querset.py tests
def test_describe_to_dataframe_example_dataset():
    ln.examples.datasets.mini_immuno.save_mini_immuno_datasets()
    artifact = ln.Artifact.get(key="examples/dataset1.h5ad")
    artifact2 = ln.Artifact.get(key="examples/dataset2.h5ad")
    with pytest.raises(ValueError) as error:
        artifact.features.remove_values("cell_type_by_expert")
    assert "Cannot remove values for dataset features."
in error.exconly() # Test df(include=[...]) df = ( ln.Artifact.filter(key__startswith="examples/dataset", suffix=".h5ad") .order_by("-key") .to_dataframe(include=["schemas__hash", "schemas__name"]) .drop(["uid"], axis=1) ) expected_data = { "key": ["examples/dataset2.h5ad", "examples/dataset1.h5ad"], "schemas__hash": [ set(artifact2.schemas.all().values_list("hash", flat=True)), set(artifact.schemas.all().values_list("hash", flat=True)), ], "schemas__name": [{None}, {None}], } expected_df = pd.DataFrame(expected_data) _check_df_equality(df, expected_df) # Test df with features # test that the records filter DOES NOT affect joining the annotations # we want it to only affect the artifact query (even though here, it won't change the result as both artifacts have the IFNG label) df = ( ln.Artifact.filter( key__startswith="examples/dataset", suffix=".h5ad", records__name="IFNG", ) .order_by("-key") .to_dataframe( features=[ "cell_type_by_expert", "cell_type_by_model", "experiment", "perturbation", "temperature", "study_note", "date_of_study", ] ) .drop(["uid"], axis=1) ) expected_data = { "key": ["examples/dataset2.h5ad", "examples/dataset1.h5ad"], "cell_type_by_expert": [np.nan, {"CD8-positive, alpha-beta T cell", "B cell"}], "cell_type_by_model": [{"T cell", "B cell"}, {"T cell", "B cell"}], "experiment": pd.Categorical(["Experiment 2", "Experiment 1"]), "perturbation": [{"IFNG", "DMSO"}, {"IFNG", "DMSO"}], "temperature": [22.6, 21.6], "study_note": [ np.nan, "We had a great time performing this study and the results look compelling.", ], "date_of_study": [date(2025, 2, 13), date(2024, 12, 1)], "study_metadata": [ {"detail1": "456", "detail2": 2}, {"detail1": "123", "detail2": 1}, ], } expected_df = pd.DataFrame(expected_data) _check_df_equality(df, expected_df) # Test filtering artifacts by schemas__in (alternative approach) # Query artifacts that measure CD8A gene by filtering schemas first cd8a = bt.Gene.get(symbol="CD8A") schemas_with_cd8a = ln.Schema.filter(genes=cd8a) df = ln.Artifact.filter(schemas__in=schemas_with_cd8a).to_dataframe() assert set(df["key"]) == {"examples/dataset2.h5ad", "examples/dataset1.h5ad"} # check backward compat query with deprecation warning with pytest.warns( DeprecationWarning, match="Querying Artifact by `feature_sets` is deprecated" ): df = ln.Artifact.filter(feature_sets__in=schemas_with_cd8a).to_dataframe() assert set(df["key"]) == {"examples/dataset2.h5ad", "examples/dataset1.h5ad"} # expected output has italicized elements that can't be tested # hence testing is restricted to section content, not headings output = artifact.describe(return_str=True) assert "hash:" in output assert "size:" in output assert "schema:" in output assert "n_observations: 3" in output assert "storage/path:" in output assert "created_by:" in output assert "created_at:" in output # dataset section assert ( artifact.features.describe(return_str=True) == """Artifact: examples/dataset1.h5ad (0000) ├── Dataset features │ ├── obs (4) │ │ cell_type_by_expe… bionty.CellType B cell, CD8-positive, alph… │ │ cell_type_by_model bionty.CellType B cell, T cell │ │ perturbation Record DMSO, IFNG │ │ sample_note str │ └── var.T (3 bionty.G… │ CD14 num │ CD4 num │ CD8A num └── External features └── experiment Record Experiment 1 date_of_study date 2024-12-01 study_metadata dict {'detail1': '123', 'detail… study_note str We had a great time perfor… temperature float 21.6""" ) # labels section if ln.setup.settings.instance.dialect == "postgresql": description_tree = describe_postgres(artifact) 
else: description_tree = describe_sqlite(artifact) labels_node = description_tree.children[-1].label assert labels_node.label.plain == "Labels" assert len(labels_node.children[0].label.columns) == 3 assert len(labels_node.children[0].label.rows) == 2 assert labels_node.children[0].label.columns[0]._cells == [ ".records", ".cell_types", ] assert labels_node.children[0].label.columns[1]._cells[0].plain == "Record" assert labels_node.children[0].label.columns[1]._cells[1].plain == "bionty.CellType" assert { c.strip() for c in ",".join(labels_node.children[0].label.columns[2]._cells).split(",") } == { "DMSO", "IFNG", "Experiment 1", "B cell", "T cell", "CD8-positive", "alpha-beta T cell", } # set_values should only replace external features, not dataset-derived features values_before = artifact.features.get_values() adata = artifact.load() just_internal = { col: values_before[col] for col in adata.obs.columns if col in values_before } artifact.features.set_values({"temperature": 99.0}) values_after_set = artifact.features.get_values() assert {col: values_after_set[col] for col in just_internal} == just_internal assert values_after_set["temperature"] == 99.0 assert set(values_after_set.keys()) == set(just_internal) | {"temperature"} # test that only external feature are removed upon artifact.features.remove_values() alljson_values = artifact.features.get_values() artifact.features.remove_values() assert just_internal != alljson_values assert just_internal == artifact.features.get_values() artifact.delete(permanent=True) artifact2.delete(permanent=True) ln.Schema.get(name="anndata_ensembl_gene_ids_and_valid_features_in_obs").delete( permanent=True ) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) bt.Gene.filter().delete(permanent=True) ln.Record.filter().delete(permanent=True) bt.CellType.filter().delete(permanent=True) ================================================ FILE: tests/core/test_artifact_features_annotations.py ================================================ # ruff: noqa: F811 from datetime import date, datetime import bionty as bt import lamindb as ln import pytest from lamindb.examples.datasets import mini_immuno from lamindb.models.query_set import BasicQuerySet, SQLRecordList # see test_record_basics.py for similar test for records (populate and query by features) def test_artifact_features_add_remove_query(): record_type1 = ln.Record(name="RecordType1", is_type=True).save() record_entity1 = ln.Record(name="entity1", type=record_type1).save() record_entity2 = ln.Record(name="entity2", type=record_type1).save() ulabel = ln.ULabel(name="test-ulabel").save() artifact = ln.Artifact(".gitignore", key="test-artifact").save() transform = ln.Transform(key="test-transform").save() run = ln.Run(transform, name="test-run").save() feature_str = ln.Feature(name="feature_str", dtype=str).save() feature_list_str = ln.Feature(name="feature_list_str", dtype=list[str]).save() feature_int = ln.Feature(name="feature_int", dtype=int).save() feature_float = ln.Feature(name="feature_float", dtype=float).save() feature_num = ln.Feature(name="feature_num", dtype="num").save() feature_datetime = ln.Feature(name="feature_datetime", dtype=datetime).save() feature_date = ln.Feature( name="feature_date", dtype=datetime.date, coerce=True ).save() feature_dict = ln.Feature(name="feature_dict", dtype=dict).save() feature_type1 = ln.Feature(name="feature_type1", dtype=record_type1).save() feature_type1s = ln.Feature(name="feature_type1s", dtype=list[record_type1]).save() 
feature_ulabel = ln.Feature(name="feature_ulabel", dtype=ln.ULabel).save() feature_user = ln.Feature(name="feature_user", dtype=ln.User).save() feature_project = ln.Feature(name="feature_project", dtype=ln.Project).save() feature_artifact = ln.Feature(name="feature_artifact", dtype=ln.Artifact).save() feature_artifact_2 = ln.Feature(name="feature_artifact_2", dtype=ln.Artifact).save() feature_run = ln.Feature(name="feature_run", dtype=ln.Run.uid).save() feature_cell_line = ln.Feature(name="feature_cell_line", dtype=bt.CellLine).save() ln.Feature(name="feature_cell_line_pass_list", dtype=bt.CellLine).save() feature_cell_lines = ln.Feature( name="feature_cell_lines", dtype=list[bt.CellLine] ).save() feature_cl_ontology_id = ln.Feature( name="feature_cl_ontology_id", dtype=bt.CellLine.ontology_id ).save() feature_gene_ontology_id = ln.Feature( name="feature_gene_ontology_id", dtype=bt.Gene.ensembl_gene_id ).save() test_artifact = ln.Artifact(".gitignore", key="test_artifact").save() value_artifact = ln.Artifact("pyproject.toml", key="value_artifact.toml").save() test_project = ln.Project(name="test_project").save() hek293 = bt.CellLine.from_source(name="HEK293").save() a549 = bt.CellLine.from_source(name="A-549").save() gene1 = bt.Gene.from_source(ensembl_gene_id="ENSG00000139618").save() gene2 = bt.Gene.from_source(ensembl_gene_id="ENSG00000141510").save() # no schema validation test_values = { "feature_str": "a string value", "feature_list_str": ["value1", "value2", "value3"], "feature_int": 42, "feature_float": 3.14, "feature_num": 2.71, "feature_datetime": datetime(2024, 1, 1, 12, 0, 0), "feature_date": date(2024, 1, 1), "feature_dict": {"key": "value", "number": 123, "list": [1, 2, 3]}, "feature_type1": "entity1", "feature_type1s": ["entity1", "entity2"], "feature_ulabel": "test-ulabel", "feature_user": ln.setup.settings.user.handle, "feature_project": "test_project", "feature_cell_line": "HEK293", # allowed if observational unit not specified, comes from aggregation "feature_cell_line_pass_list": ["HEK293", "A-549"], "feature_cell_lines": ["HEK293", "A-549"], "feature_cl_ontology_id": "CVCL_0045", "feature_artifact": "test-artifact", "feature_artifact_2": "value_artifact.toml", "feature_run": run.uid, } test_artifact.features.add_values(test_values) # ManyToMany accessors assert set(test_artifact.artifacts.to_list()) == {test_artifact, value_artifact} assert set(value_artifact.linked_by_artifacts.to_list()) == {test_artifact} assert set(test_artifact.linked_by_artifacts.to_list()) == {test_artifact} assert value_artifact.artifacts.to_list() == [] # get_values accessor return_values = test_artifact.features.get_values() # special handling if passing a list of categories to a cat feature: it's interpreted as the result of an aggregation # hence upon retrieval it's a set of categories, not a list of categories values_pass_list = return_values.pop("feature_cell_line_pass_list") assert values_pass_list == set(test_values.pop("feature_cell_line_pass_list")) assert return_values == test_values # __get_item__ accessor assert test_artifact.features["feature_str"] == test_values["feature_str"] assert test_artifact.features["feature_list_str"] == test_values["feature_list_str"] assert test_artifact.features["feature_int"] == test_values["feature_int"] assert test_artifact.features["feature_float"] == test_values["feature_float"] assert test_artifact.features["feature_num"] == test_values["feature_num"] assert test_artifact.features["feature_datetime"] == test_values["feature_datetime"] assert 
test_artifact.features["feature_date"] == test_values["feature_date"] assert test_artifact.features["feature_dict"] == test_values["feature_dict"] assert test_artifact.features["feature_type1"] == record_entity1 assert set(test_artifact.features["feature_type1s"]) == { record_entity1, record_entity2, } assert test_artifact.features["feature_ulabel"] == ulabel assert ( test_artifact.features["feature_user"].handle == ln.setup.settings.user.handle ) assert test_artifact.features["feature_project"] == test_project assert test_artifact.features["feature_cell_line"] == hek293 assert test_artifact.features["feature_cl_ontology_id"] == hek293 value = test_artifact.features["feature_cell_line_pass_list"] assert set(value) == {hek293, a549} assert isinstance(value, BasicQuerySet) value = test_artifact.features["feature_cell_lines"] assert set(value) == {hek293, a549} assert isinstance(value, SQLRecordList) assert test_artifact.features["feature_artifact"] == test_artifact assert test_artifact.features["feature_artifact_2"] == value_artifact assert test_artifact.features["feature_run"] == run # --- Query by features (same data as above) --- # Equality assert ln.Artifact.filter(feature_str="a string value").one() == test_artifact assert ln.Artifact.filter(feature_int=42).one() == test_artifact assert ln.Artifact.filter(feature_type1="entity1").one() == test_artifact assert ln.Artifact.filter(feature_cell_line="HEK293").one() == test_artifact assert ( ln.Artifact.filter(feature_str="a string value", feature_int=42).one() == test_artifact ) # Datetime and date (filter uses ISO strings as stored in JSON) assert ( ln.Artifact.filter(feature_datetime="2024-01-01T12:00:00").one() == test_artifact ) assert ln.Artifact.filter(feature_date="2024-01-01").one() == test_artifact # __contains (categorical) assert ln.Artifact.filter(feature_cell_line__contains="HEK").one() == test_artifact assert ln.Artifact.filter(feature_type1__contains="entity").one() == test_artifact # Invalid field with pytest.raises(ln.errors.InvalidArgument) as error: ln.Artifact.filter(feature_str_typo="x", feature_int=42).one() assert error.exconly().startswith( "lamindb.errors.InvalidArgument: You can query either by available fields:" ) # ln.errors.ObjectDoesNotExist (no object named "nonexistent_entity" exists) with pytest.raises(ln.errors.ObjectDoesNotExist) as error: ln.Artifact.filter(feature_type1="nonexistent_entity").one() assert "Did not find" in error.exconly() # Combined filter (3 keys) assert ( ln.Artifact.filter( feature_str="a string value", feature_int=42, feature_type1="entity1", ).one() == test_artifact ) # Bionty: filter by record assert ln.Artifact.filter(feature_cell_line=hek293).one() == test_artifact # Bionty: filter by ontology_id string assert ln.Artifact.filter(feature_cl_ontology_id="CVCL_0045").one() == test_artifact # Bionty __contains (ontology_id) assert ( ln.Artifact.filter(feature_cl_ontology_id__contains="0045").one() == test_artifact ) # ln.errors.ObjectDoesNotExist (object not found: feature_project) with pytest.raises(ln.errors.ObjectDoesNotExist) as error: ln.Artifact.filter(feature_project="nonexistent_project").one() assert "Did not find" in error.exconly() # __contains returns multiple (add second artifact, assert, then remove) value_artifact.features.add_values({"feature_type1": "entity2"}) assert len(ln.Artifact.filter(feature_type1__contains="entity")) == 2 value_artifact.features.remove_values("feature_type1") # Numeric comparators __lt, __gt (int, float, num) assert 
ln.Artifact.filter(feature_int__lt=21).one_or_none() is None assert len(ln.Artifact.filter(feature_int__gt=21)) >= 1 # int __lt/__gt that would fail with string comparison (42 vs 5, 42 vs 100) assert ln.Artifact.filter(feature_int__lt=5).one_or_none() is None assert ln.Artifact.filter(feature_int__gt=100).one_or_none() is None # float/num __lt/__gt (numeric comparison on SQLite via json_extract + CAST) assert ln.Artifact.filter(feature_float__lt=5.0).one() == test_artifact assert ln.Artifact.filter(feature_float__gt=1.0).one() == test_artifact assert ln.Artifact.filter(feature_float__gt=10.0).one_or_none() is None assert ln.Artifact.filter(feature_num__lt=5.0).one() == test_artifact assert ln.Artifact.filter(feature_num__gt=1.0).one() == test_artifact assert ln.Artifact.filter(feature_num__gt=10.0).one_or_none() is None # Date and datetime comparators (ISO strings) assert ln.Artifact.filter(feature_date__lt="2024-01-02").one() == test_artifact assert ln.Artifact.filter(feature_date__gt="2023-12-31").one() == test_artifact assert ln.Artifact.filter(feature_date__gt="2024-01-02").one_or_none() is None assert ( ln.Artifact.filter(feature_datetime__lt="2024-01-01T13:00:00").one() == test_artifact ) assert ( ln.Artifact.filter(feature_datetime__gt="2024-01-01T11:00:00").one() == test_artifact ) assert ( ln.Artifact.filter(feature_datetime__lt="2024-01-01T11:00:00").one_or_none() is None ) # remove values # this was already popped from test_values above test_artifact.features.remove_values("feature_cell_line_pass_list") test_artifact.features.remove_values("feature_int") test_values.pop("feature_int") test_artifact.features.remove_values("feature_float") test_values.pop("feature_float") test_artifact.features.remove_values("feature_num") test_values.pop("feature_num") assert test_artifact.features.get_values() == test_values test_artifact.features.remove_values("feature_date") test_values.pop("feature_date") assert test_artifact.features.get_values() == test_values test_artifact.features.remove_values("feature_type1") test_values.pop("feature_type1") assert test_artifact.features.get_values() == test_values test_artifact.features.remove_values("feature_type1s") test_values.pop("feature_type1s") assert test_artifact.features.get_values() == test_values test_artifact.features.remove_values("feature_ulabel") test_values.pop("feature_ulabel") assert test_artifact.features.get_values() == test_values # test passing a list to remove_values test_artifact.features.remove_values(["feature_cell_line", "feature_user"]) test_values.pop("feature_cell_line") test_values.pop("feature_user") assert test_artifact.features.get_values() == test_values test_artifact.features.remove_values("feature_artifact") test_values.pop("feature_artifact") assert test_artifact.features.get_values() == test_values test_artifact.features.remove_values("feature_run") test_values.pop("feature_run") assert test_artifact.features.get_values() == test_values # test passing None has no effect, does not lead to annotation test_artifact.features.add_values( { "feature_int": None, "feature_float": None, "feature_num": None, "feature_type1": None, } ) assert test_artifact.features.get_values() == test_values # test bulk removal assert list(test_values.keys()) == [ "feature_str", "feature_list_str", "feature_datetime", "feature_dict", "feature_project", "feature_cell_lines", "feature_cl_ontology_id", "feature_artifact_2", ] test_artifact.features.remove_values() test_values = {} assert test_artifact.features.get_values() == test_values # 
test passing ISO-format date string for date test_artifact.features.add_values({"feature_date": "2024-01-01"}) test_values["feature_date"] = date(2024, 1, 1) assert test_artifact.features.get_values() == test_values # test passing bionty objects instead of strings (using gene1 and gene2 because organism-dependent ontologies) test_artifact.features.add_values({"feature_gene_ontology_id": [gene1, gene2]}) test_values["feature_gene_ontology_id"] = {"ENSG00000139618", "ENSG00000141510"} assert test_artifact.features.get_values() == test_values test_values.pop("feature_gene_ontology_id") test_artifact.features.remove_values("feature_gene_ontology_id") # test add_values() when there is already something there test_artifact.features.add_values({"feature_date": "2024-02-01"}) test_values["feature_date"] = {date(2024, 1, 1), date(2024, 2, 1)} test_artifact.features.add_values({"feature_str": "a string value"}) test_values["feature_str"] = "a string value" assert test_artifact.features.get_values() == test_values # test set_values() test_values = {} test_values["feature_date"] = date(2024, 3, 1) test_artifact.features.set_values({"feature_date": "2024-03-01"}) assert test_artifact.features.get_values() == test_values # schema validation feature_str = ln.Feature.get(name="feature_str") feature_int = ln.Feature.get(name="feature_int") schema = ln.Schema([feature_str, feature_int], name="test_schema").save() with pytest.raises(ln.errors.ValidationError) as error: test_artifact.features.add_values({"feature_type1": "entity1"}, schema=schema) assert "COLUMN_NOT_IN_DATAFRAME" in error.exconly() schema.delete(permanent=True) # test with list of strings schema = ln.Schema([feature_cell_lines], name="test_schema2").save() test_artifact.features.add_values( {"feature_cell_lines": ["HEK293", "A-549"]}, schema=schema ) schema.delete(permanent=True) # test with list of records (rather than passing strings) schema = ln.Schema([feature_cell_lines], name="test_schema2").save() test_artifact.features.add_values( {"feature_cell_lines": [a549, hek293]}, schema=schema ) schema.delete(permanent=True) # clean up rest test_artifact.delete(permanent=True) feature_str.delete(permanent=True) feature_list_str.delete(permanent=True) feature_int.delete(permanent=True) feature_float.delete(permanent=True) feature_num.delete(permanent=True) feature_datetime.delete(permanent=True) feature_date.delete(permanent=True) feature_type1.delete(permanent=True) feature_type1s.delete(permanent=True) feature_user.delete(permanent=True) feature_project.delete(permanent=True) feature_dict.delete(permanent=True) feature_artifact.delete(permanent=True) feature_artifact_2.delete(permanent=True) feature_run.delete(permanent=True) feature_ulabel.delete(permanent=True) feature_cell_lines.delete(permanent=True) record_entity1.delete(permanent=True) record_entity2.delete(permanent=True) record_type1.delete(permanent=True) test_project.delete(permanent=True) feature_cell_line.delete(permanent=True) feature_cl_ontology_id.delete(permanent=True) feature_gene_ontology_id.delete(permanent=True) hek293.delete(permanent=True) a549.delete(permanent=True) gene1.delete(permanent=True) gene2.delete(permanent=True) ulabel.delete(permanent=True) artifact.delete(permanent=True) run.delete(permanent=True) transform.delete(permanent=True) def test_features_name_duplicates_across_root_and_nested(): feature1 = ln.Feature(name="sample_name", dtype=ln.Record).save() lab_a_type = ln.Feature(name="LabA", is_type=True).save() feature2 = ln.Feature(name="sample_name", 
dtype=ln.Record, type=lab_a_type).save() record_sample = ln.Record(name="sample").save() test_artifact = ln.Artifact(".gitignore", key="test_artifact").save() test_artifact.features.add_values({"sample_name": "sample"}) assert test_artifact.features.get_values() == {"sample_name": "sample"} test_artifact.delete(permanent=True) record_sample.delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) lab_a_type.delete(permanent=True) # also see test_curator_schema_feature_mapping def test_features_name_duplicates_across_equal_levels(): lab_a_type = ln.Feature(name="LabA", is_type=True).save() feature1 = ln.Feature(name="sample_name", dtype=ln.Record, type=lab_a_type).save() lab_b_type = ln.Feature(name="LabB", is_type=True).save() feature2 = ln.Feature(name="sample_name", dtype=ln.Record, type=lab_b_type).save() schema1 = ln.Schema([feature1], name="Lab A schema").save() record_sample = ln.Record(name="sample").save() test_artifact = ln.Artifact(".gitignore", key="test_artifact").save() # cannot disambiguate without schema with pytest.raises(ln.errors.ValidationError) as error: test_artifact.features.add_values({"sample_name": "sample"}) assert ( "Ambiguous match for Feature 'sample_name': found 2 features at depth 1 (under types: ['LabA', 'LabB'])" in error.exconly() ) # with schema, first one test_artifact.features.add_values({"sample_name": "sample"}, schema=schema1) assert test_artifact.features.get_values() == {"sample_name": "sample"} assert test_artifact.links_record.get().feature.type == lab_a_type test_artifact.delete(permanent=True) test_artifact = ln.Artifact(".gitignore", key="test_artifact").save() # now the other schema schema2 = ln.Schema([feature2], name="Lab B schema").save() test_artifact.features.add_values({"sample_name": "sample"}, schema=schema2) assert test_artifact.features.get_values() == {"sample_name": "sample"} assert test_artifact.links_record.get().feature.type == lab_b_type test_artifact.delete(permanent=True) record_sample.delete(permanent=True) schema2.delete(permanent=True) schema1.delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) lab_a_type.delete(permanent=True) lab_b_type.delete(permanent=True) def test_feature_predicate_queries_safe_hybrid(): lab_a_type = ln.Feature(name="PredLabA", is_type=True).save() feature_a = ln.Feature(name="pred_name", dtype=str, type=lab_a_type).save() lab_b_type = ln.Feature(name="PredLabB", is_type=True).save() feature_b = ln.Feature(name="pred_name", dtype=str, type=lab_b_type).save() score_feature = ln.Feature(name="pred_score", dtype=int).save() cell_type_feature = ln.Feature(name="pred_cell_type", dtype=bt.CellLine).save() # safe hybrid behavior for model identity + hashability assert feature_a == feature_a assert feature_a != feature_b assert len({feature_a, feature_b}) == 2 schema_a = ln.Schema([feature_a], name="pred schema a").save() schema_b = ln.Schema([feature_b], name="pred schema b").save() artifact_a = ln.Artifact( ".gitignore", key="pred-artifact-a", skip_hash_lookup=True, ).save() artifact_b = ln.Artifact( ".gitignore", key="pred-artifact-b", skip_hash_lookup=True, ).save() artifact_a.features.add_values({"pred_name": "hello"}, schema=schema_a) artifact_b.features.add_values({"pred_name": "hello"}, schema=schema_b) artifact_a.features.add_values({"pred_score": 5}) artifact_b.features.add_values({"pred_score": 1}) hek293 = bt.CellLine.from_source(name="HEK293").save() artifact_a.features.add_values({"pred_cell_type": hek293}) # same feature name 
can be disambiguated by passing the Feature object assert ln.Artifact.filter(feature_a == "hello").one() == artifact_a assert ln.Artifact.filter(feature_b == "hello").one() == artifact_b # Feature compared to another model should still generate a predicate assert ln.Artifact.filter(cell_type_feature == hek293).one() == artifact_a # comparator operators on non-categorical feature values assert ln.Artifact.filter(score_feature > 2).one() == artifact_a assert ln.Artifact.filter(score_feature <= 1).one() == artifact_b neq_results = ln.Artifact.filter(score_feature != 5) assert artifact_b in neq_results assert artifact_a not in neq_results # mixed predicate and regular kwargs filters assert ( ln.Artifact.filter(feature_a == "hello", key="pred-artifact-a").one() == artifact_a ) artifact_a.delete(permanent=True) artifact_b.delete(permanent=True) schema_a.delete(permanent=True) schema_b.delete(permanent=True) feature_a.delete(permanent=True) feature_b.delete(permanent=True) score_feature.delete(permanent=True) cell_type_feature.delete(permanent=True) lab_a_type.delete(permanent=True) lab_b_type.delete(permanent=True) hek293.delete(permanent=True) def test_features_add_with_schema(): df = mini_immuno.get_dataset1(otype="DataFrame") artifact = ln.Artifact.from_dataframe(df, description="test dataset").save() species = ln.Feature(name="species", dtype="str").save() split = ln.Feature(name="split", dtype="str").save() schema = ln.Schema([species, split]).save() with pytest.raises(ln.errors.ValidationError) as e: artifact.features.add_values({"doesnot": "exist"}, schema=schema) assert "column 'split' not in dataframe" in str(e.value) artifact.features.add_values({"species": "bird", "split": "train"}, schema=schema) artifact.save() assert artifact.features.get_values() == {"species": "bird", "split": "train"} artifact.delete(permanent=True) schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_artifact_feature_cat_filters_schema_end_to_end(): schema_feature = ln.Feature(name="schema_filter_column_e2e", dtype=str).save() required_schema = ln.Schema( name="required_schema_for_artifact_filter", features=[schema_feature], ).save() artifact_feature = ln.Feature( name="input_artifact", dtype=ln.Artifact, cat_filters={"schema": required_schema}, ).save() container_artifact = ln.Artifact( ".gitignore", key="container_for_artifact_schema_filter", skip_hash_lookup=True, ).save() artifact_without_schema = ln.Artifact( ".gitignore", key="artifact_without_required_schema", skip_hash_lookup=True, ).save() artifact_with_schema = ln.Artifact( ".gitignore", key="artifact_with_required_schema", schema=required_schema, skip_hash_lookup=True, ).save() try: with pytest.raises(ln.errors.ValidationError) as error: container_artifact.features.add_values( {"input_artifact": artifact_without_schema.key} ) assert "1 term not validated in feature 'input_artifact'" in error.exconly() container_artifact.features.add_values( {"input_artifact": artifact_with_schema.key} ) assert container_artifact.features["input_artifact"] == artifact_with_schema finally: container_artifact.delete(permanent=True) artifact_without_schema.delete(permanent=True) artifact_with_schema.delete(permanent=True) artifact_feature.delete(permanent=True) required_schema.delete(permanent=True) schema_feature.delete(permanent=True) def test_features_add_remove_error_behavior(): """Add/remove/validation behavior.""" adata = ln.examples.datasets.anndata_with_obs() artifact = ln.Artifact.from_anndata(adata, description="test").save() with 
pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values({"experiment": "Experiment 1"}) assert ( error.exconly() == """lamindb.errors.ValidationError: These keys could not be validated: ['experiment'] Here is how to create a feature: ln.Feature(name='experiment', dtype='cat ? str').save()""" ) ln.Feature(name="experiment", dtype=ln.Record).save() with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values({"experiment": "Experiment 1"}) assert error.exconly().startswith( "lamindb.errors.ValidationError: 1 term not validated in feature 'experiment'" ) ln.Record(name="Experiment 1").save() # now add the label with the feature and make sure that it has the feature annotation artifact.features.add_values({"experiment": "Experiment 1"}) assert artifact.links_record.get().record.name == "Experiment 1" assert artifact.links_record.get().feature.name == "experiment" # repeat artifact.features.add_values({"experiment": "Experiment 1"}) assert artifact.links_record.get().record.name == "Experiment 1" # numerical feature temperature = ln.Feature(name="temperature", dtype=ln.Record).save() with pytest.raises(TypeError) as error: artifact.features.add_values({"temperature": 27.2}) assert error.exconly().startswith( "TypeError: Type mismatch: identifiers are 'numeric' but field_values are 'str/categorical'." ) temperature.delete(permanent=True) temperature = ln.Feature(name="temperature", dtype="num").save() artifact.features.add_values({"temperature": 27.2}) assert artifact.json_values.first().value == 27.2 # datetime feature with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values({"date_of_experiment": "2024-12-01"}) assert ( error.exconly() == """lamindb.errors.ValidationError: These keys could not be validated: ['date_of_experiment'] Here is how to create a feature: ln.Feature(name='date_of_experiment', dtype='date').save()""" ) ln.Feature(name="date_of_experiment", dtype=datetime.date, coerce=True).save() with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values({"date_of_experiment": "Typo2024-12-01"}) assert "WRONG_DATATYPE" in error.exconly() artifact.features.add_values({"date_of_experiment": "2024-12-01"}) ln.Feature(name="datetime_of_experiment", dtype=datetime, coerce=True).save() artifact.features.add_values({"datetime_of_experiment": "2024-12-01 00:00:00"}) # bionty feature mouse = bt.Organism.from_source(name="mouse") with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values({"organism": mouse}) assert ( error.exconly() == """lamindb.errors.ValidationError: These keys could not be validated: ['organism'] Here is how to create a feature: ln.Feature(name='organism', dtype='cat[bionty.Organism]').save()""" ) ln.Feature(name="organism", dtype=bt.Organism).save() with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values({"organism": mouse}) assert ( # ensure the label is saved error.exconly() == "lamindb.errors.ValidationError: Organism mouse is not saved." 
) mouse.save() artifact.features.add_values({"organism": mouse}) assert artifact.organisms.get().name == "mouse" # lists of records diseases = bt.Disease.from_values( ["MONDO:0004975", "MONDO:0004980"], field=bt.Disease.ontology_id ).save() ln.Feature(name="disease", dtype=bt.Disease.ontology_id).save() artifact.features.add_values({"disease": diseases}) assert len(artifact.diseases.filter()) == 2 # check get_values returns ontology_ids as specified in the feature dtype assert artifact.features.get_values()["disease"] == { "MONDO:0004975", "MONDO:0004980", } # big dictionary of everything features = { "experiment": [ # we're testing iterable annotation here "Experiment 2", "Experiment 1", ], "project": "project_1", "is_validated": True, "cell_type_by_expert": "T cell", "temperature": 100.0, "donor": "U0123", } with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values(features) assert ( error.exconly() == """\ lamindb.errors.ValidationError: These keys could not be validated: ['project', 'is_validated', 'cell_type_by_expert', 'donor'] Here is how to create a feature: ln.Feature(name='project', dtype='cat ? str').save() ln.Feature(name='is_validated', dtype='bool').save() ln.Feature(name='cell_type_by_expert', dtype='cat ? str').save() ln.Feature(name='donor', dtype='cat ? str').save()""" ) ln.Feature(name="project", dtype=ln.Record).save() ln.Feature(name="is_validated", dtype=bool).save() ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save() ln.Feature(name="donor", dtype=ln.Record).save() with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values(features) error_msg = error.exconly() assert ( "lamindb.errors.ValidationError: These values could not be validated:" in error_msg ) assert "Here is how to create records for them:" in error_msg expected_values = { "Record": ["project_1", "U0123", "Experiment 2"], "bionty.CellType": ["T cell"], } for key, values in expected_values.items(): assert f"'{key}':" in error_msg for value in values: assert value in error_msg assert f"{key.split('.')[-1]}.from_values(" in error_msg assert "create=True).save()" in error_msg ln.Record.from_values(["Experiment 2", "project_1", "U0123"], create=True).save() bt.CellType.from_source(name="T cell").save() artifact.features.add_values(features) assert set(artifact.json_values.all().values_list("value", flat=True)) == { 27.2, True, 100.0, "2024-12-01", "2024-12-01T00:00:00", } assert ln.Artifact.get(json_values__value=27.2) assert artifact.features.get_values() == { "disease": {"MONDO:0004975", "MONDO:0004980"}, "experiment": {"Experiment 1", "Experiment 2"}, "project": "project_1", "cell_type_by_expert": "T cell", "donor": "U0123", "organism": "mouse", "is_validated": True, "temperature": {27.2, 100.0}, "date_of_experiment": date(2024, 12, 1), "datetime_of_experiment": datetime(2024, 12, 1, 0, 0, 0), } # hard to test because of italic formatting assert ( artifact.features.describe(return_str=True) == """Artifact: (0000) | description: test └── Features └── cell_type_by_expe… bionty.CellType T cell disease bionty.Disease.ontolog… MONDO:0004975, MONDO:00049… donor Record U0123 experiment Record Experiment 1, Experiment 2 organism bionty.Organism mouse project Record project_1 date_of_experiment date 2024-12-01 datetime_of_exper… datetime 2024-12-01 00:00:00 is_validated bool True temperature num 27.2, 100.0""" ) # repeat artifact.features.add_values(features) assert set(artifact.json_values.all().values_list("value", flat=True)) == { 27.2, True, 100.0, 
"2024-12-01", "2024-12-01T00:00:00", } # test remove_values artifact.features.remove_values("date_of_experiment") alzheimer = bt.Disease.get(name="Alzheimer disease") artifact.features.remove_values("disease", value=alzheimer) values = artifact.features.get_values() assert "date_of_experiment" not in values assert "MONDO:0004975" not in values["disease"] # test annotate with dictionaries multiple times ln.Feature(name="study_metadata", dtype=dict).save() artifact.features.add_values({"study_metadata": {"detail1": "123", "detail2": 1}}) # delete everything we created artifact.delete(permanent=True) ln.Record.filter().delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) bt.Gene.filter().delete(permanent=True) bt.Organism.filter().delete(permanent=True) bt.Disease.filter().delete(permanent=True) def test_add_remove_list_features(ccaplog): feature = ln.Feature(name="list_of_str", dtype=list[str]).save() artifact = ln.Artifact(".gitignore", key=".gitignore").save() artifact.features.add_values({"list_of_str": ["1", "2", "3"]}) assert artifact.features.get_values() == {"list_of_str": ["1", "2", "3"]} # remove a non-linked value, this should do nothing but print a warning artifact.features.remove_values("list_of_str", value="4") assert "no feature 'list_of_str' with value '4' found" in ccaplog.text # list of categories feature cell_types_feature = ln.Feature( name="cell_types", dtype="list[cat[bionty.CellType]]" ).save() bt.CellType.from_values(["T cell", "B cell"]).save() artifact.features.add_values({"cell_types": ["T cell", "B cell"]}) assert set(artifact.features.get_values()["cell_types"]) == {"B cell", "T cell"} # passing value works here because we are linking each of the cell types in the list individually # in comparison to passing a list of numbers above t_cell = bt.CellType.get(name="T cell") artifact.features.remove_values("cell_types", value=t_cell) assert artifact.features.get_values()["cell_types"] == ["B cell"] # remove a non-linked value, this should print a warning but do nothing artifact.features.remove_values("cell_types", value=t_cell.parents.first()) assert "no feature 'cell_types' with value CellType(" in ccaplog.text # remove the entire linked feature artifact.features.remove_values("cell_types") assert "cell_types" not in artifact.features.get_values() # clean up artifact.delete(permanent=True) assert ln.models.JsonValue.filter(feature__name="list_of_str").count() == 1 feature.delete(permanent=True) assert ln.models.JsonValue.filter(feature__name="list_of_str").count() == 0 cell_types_feature.delete(permanent=True) bt.CellType.filter().delete(permanent=True) def test_add_list_of_cat_features(): type_1 = ln.Record(name="Type 1", is_type=True).save() for label in ["label 1", "label 2", "label 3"]: ln.Record(name=label, type=type_1).save() feat1 = ln.Feature( name="single_label_of_type1", dtype=type_1, nullable=False ).save() feat2 = ln.Feature( name="list_of_labels_of_type1", dtype=list[type_1], nullable=False ).save() schema = ln.Schema(name="Test schema", features=[feat1, feat2]).save() artifact = ln.Artifact( ".gitignore", key=".gitignore", ).save() # now just use add_values() with pytest.raises(ln.errors.ValidationError) as error: artifact.features.add_values( { "single_label_of_type1": "invalid", } ) assert error.exconly().startswith( "lamindb.errors.ValidationError: 1 term not validated in feature 'single_label_of_type1': 'invalid'" ) # now for list of labels with pytest.raises(ln.errors.ValidationError) as error: 
        artifact.features.add_values(
            {
                "list_of_labels_of_type1": ["invalid", "invalid2"],
            }
        )
    assert error.exconly().startswith(
        "lamindb.errors.ValidationError: 2 terms not validated in feature 'list_of_labels_of_type1':"
    )
    artifact.delete(permanent=True)

    # now with schema
    artifact = ln.Artifact(
        ".gitignore",
        key=".gitignore",
        schema=schema,
        features={
            "single_label_of_type1": "label 1",
            "list_of_labels_of_type1": ["label 1", "label 2"],
        },
    ).save()
    with pytest.raises(ValueError) as error:
        artifact.features.add_values(
            {
                "single_label_of_type1": "invalid",
            }
        )
    assert "Cannot add values if artifact has external schema." in error.exconly()
    artifact.delete(permanent=True)
    schema.delete(permanent=True)
    feat1.delete(permanent=True)
    feat2.delete(permanent=True)
    type_1.records.all().delete(permanent=True)
    type_1.delete(permanent=True)


def test_artifact_features_accept_feature_object_keys():
    feature_score = ln.Feature(name="artifact_feature_object_score", dtype=int).save()
    feature_tag = ln.Feature(name="artifact_feature_object_tag", dtype=str).save()
    artifact = ln.Artifact(".gitignore", key="artifact_feature_object_test").save()
    artifact.features.add_values({feature_score: 7, "artifact_feature_object_tag": "a"})
    assert artifact.features.get_values() == {
        "artifact_feature_object_score": 7,
        "artifact_feature_object_tag": "a",
    }
    # set_values should also accept Feature objects as dictionary keys.
    artifact.features.set_values({feature_score: 8})
    assert artifact.features.get_values() == {"artifact_feature_object_score": 8}
    artifact.features.add_values({feature_tag: "keep"})
    assert artifact.features.get_values() == {
        "artifact_feature_object_score": 8,
        "artifact_feature_object_tag": "keep",
    }
    # remove_values supports dictionary inputs with Feature keys.
    artifact.features.remove_values({feature_score: 8, feature_tag: None})
    assert artifact.features.get_values() == {}
    artifact.delete(permanent=True)
    feature_score.delete(permanent=True)
    feature_tag.delete(permanent=True)


================================================
FILE: tests/core/test_artifact_parquet.py
================================================
import lamindb as ln
import pandas as pd
import pyarrow.parquet as pq


def test_parquet_kwargs():
    df = pd.DataFrame(
        {
            "a": [3, 1, 4, 2],
            "b": ["c", "a", "d", "b"],
            "c": [3.3, 1.1, 4.4, 2.2],
        }
    )
    df_sorted = df.sort_values(by=["a", "b"])
    sorting_columns = [
        pq.SortingColumn(0, descending=False, nulls_first=False),
        pq.SortingColumn(1, descending=False, nulls_first=False),
    ]
    artifact = ln.Artifact.from_dataframe(
        df_sorted,
        key="df_sorted.parquet",
        parquet_kwargs={"sorting_columns": sorting_columns},
    ).save()
    pyarrow_dataset = artifact.open()
    fragment = next(pyarrow_dataset.get_fragments())
    assert list(fragment.metadata.row_group(0).sorting_columns) == sorting_columns


================================================
FILE: tests/core/test_blocks.py
================================================
import lamindb as ln
import pytest


def test_block_recovery_based_on_hash():
    block1 = ln.models.Block(key="__lamindb_block__", content="1", kind="readme").save()
    block2 = ln.models.Block(key="__lamindb_block__", content="1", kind="readme")
    assert block1 == block2
    block1.delete()
    block2 = ln.models.Block(key="__lamindb_block__", content="1", kind="readme")
    assert block1 != block2
    block1.delete(permanent=True)


def test_block_recovery_based_on_key():
    block1 = ln.models.Block(key="__lamindb_block__", kind="readme").save()
    block2 = ln.models.Block(key="__lamindb_block__", kind="readme")
    assert block1 == block2
    block1.delete()
    block2 = ln.models.Block(key="__lamindb_block__", kind="readme")
    assert block1 != block2
    block1.delete(permanent=True)


def test_readme_md_key_is_allowed_and_revises():
    block1 = ln.models.Block(
        key="README.md", content="# v1\n\nhello", kind="readme"
    ).save()
    block2 = ln.models.Block(key="README.md", content="# v2\n\nhello", kind="readme")
    assert block2.stem_uid == block1.stem_uid
    assert block2.uid != block1.uid
    block2.save()
    block1.refresh_from_db()
    assert not block1.is_latest
    block2.delete()
    block1.delete()


def test_revise_blocks():
    # attempt to create a block with an invalid version
    with pytest.raises(ValueError) as error:
        ln.models.Block(key="__lamindb_block__", version=0, kind="readme")
    assert "version" in error.exconly() or "version_tag" in error.exconly()

    # create a versioned block
    block = ln.models.Block(key="__lamindb_block__", version="1", kind="readme")
    assert block.version_tag == "1"
    assert block.version == "1"
    assert len(block.uid) == ln.models.Block._len_full_uid == 20
    assert len(block.stem_uid) == ln.models.Block._len_stem_uid == 16
    block.save()

    # try to reload the same block with the same uid
    block_reload = ln.models.Block(
        uid=block.uid, key="__lamindb_artifact__", kind="readme"
    )
    assert block_reload.id == block.id
    assert block_reload.key == "__lamindb_block__"  # unchanged, prints logging

    # create new block from old block
    block_r2 = ln.models.Block(content="v2", revises=block, kind="readme")
    assert block_r2.uid != block.uid
    assert block_r2.uid.endswith("0001")
    block_r2 = ln.models.Block(content="v2", revises=block, kind="readme")
    assert block_r2.uid != block.uid
    assert block_r2.uid.endswith("0001")
    assert block_r2.stem_uid == block.stem_uid
    assert block_r2.version_tag is None
    assert block_r2.version == block_r2.uid[-4:]
    assert block_r2.is_latest
    assert block.is_latest
    block_r2.save()
    assert not block.is_latest

    # create new block from newly versioned block
    block_r3 = ln.models.Block(
        content="v3", revises=block_r2, version="2", kind="readme"
    )
    assert block_r3.stem_uid == block.stem_uid
    assert block_r3.version_tag == "2"
    assert block_r3.version == "2"

    # revise by matching on key
    key = "__lamindb_artifact__"
    block_r2.key = key
    block_r2.save()
    assert block_r2.is_latest
    block_r3 = ln.models.Block(content="v3", key=key, version="2", kind="readme")
    assert block_r3.uid[:-4] == block_r2.uid[:-4]
    assert block_r3.uid != block_r2.uid  # new version after block_r2
    block_r2.content = "something else"
    block_r2.save()
    block_r3 = ln.models.Block(content="v3", key=key, version="2", kind="readme")
    assert block_r3.uid[:-4] == block_r2.uid[:-4]
    assert block_r3.uid != block_r2.uid  # yet another new version
    assert block_r3.stem_uid == block_r2.stem_uid
    assert block_r3.key == key
    assert block_r3.version_tag == "2"
    assert block_r3.version == "2"
    assert block_r3.is_latest
    assert block_r2.is_latest
    assert block_r3._revises is not None
    block_r3.save()
    block_r2 = ln.models.Block.get(block_r2.uid)
    assert not block_r2.is_latest

    # wrong block type
    with pytest.raises(TypeError) as error:
        ln.models.Block(
            key="__lamindb_block__", revises=ln.Record(name="x"), kind="readme"
        )
    assert error.exconly().startswith("TypeError: `revises` has to be of type `Block`")

    # wrong kwargs
    with pytest.raises(ValueError) as error:
        ln.models.Block(key="__lamindb_block__", x=1, kind="readme")
    assert "can be passed" in error.exconly() and "x" in error.exconly()

    # kind required (Block only supports kind="readme")
    with pytest.raises(ValueError) as error:
        ln.models.Block(key="__lamindb_block__", content="y")
    assert "kind" in error.exconly() and "readme" in error.exconly()

    # invalid kind (Block only supports readme)
    with pytest.raises(ValueError) as error:
        ln.models.Block(key="__lamindb_block__", content="y", kind="comment")
    assert "readme" in error.exconly() or "Only kind" in error.exconly()

    # cleanup
    block_r2.delete()
    block.delete()

    # unversioned block
    block = ln.models.Block(key="__lamindb_block__", kind="readme")
    assert block.version_tag is None
    assert block.version == block.uid[-4:]
    block.save()

    # create new block from old block
    new_block = ln.models.Block(content="new", revises=block, kind="readme")
    assert block.version_tag is None
    assert block.version == block.uid[-4:]
    assert new_block.stem_uid == block.stem_uid
    assert new_block.uid.endswith("0001")
    assert new_block.version_tag is None
    assert new_block.version == new_block.uid[-4:]
    block.delete(permanent=True)


def test_record_block_readme_always_new_version():
    """Readme always creates a new version (no content-hash dedup)."""
    record = ln.Record(name="test-record-blocks").save()
    block1 = ln.models.RecordBlock(record=record, content="1", kind="readme").save()
    block2 = ln.models.RecordBlock(record=record, content="1", kind="readme")
    assert block1.stem_uid == block2.stem_uid
    assert block1.uid != block2.uid  # new version each time
    block1.delete()  # BaseSQLRecord has no soft delete; this is permanent
    block2 = ln.models.RecordBlock(record=record, content="1", kind="readme")
    assert block1 != block2  # block2 is a new block (block1 was removed)
    record.delete(permanent=True)


def test_record_block_comment_always_new_block():
    """Comment always creates a new block (no versioning; revises not allowed)."""
    record = ln.Record(name="test-record-blocks-comment").save()
    # Add readme and comments to test full describe
    ln.models.RecordBlock(
        record=record, content="# Overview\n\nTest readme.", kind="readme"
    ).save()
    # Comments never version: each creation is a new comment (new uid).
    comment1 = ln.models.RecordBlock(
        record=record, content="same text", kind="comment"
    ).save()
    comment2 = ln.models.RecordBlock(record=record, content="same text", kind="comment")
    assert comment1.stem_uid != comment2.stem_uid  # always new comment, no dedup
    # revises is not allowed for kind='comment'
    with pytest.raises(ValueError) as error:
        ln.models.RecordBlock(
            record=record, content="a comment", kind="comment", revises=comment1
        )
    assert "revises is not allowed for kind='comment'" in error.exconly()
    # Test full describe call with include="comments"
    result = record.describe(return_str=True, include="comments")
    assert "README" in result
    assert "comment by" in result
    assert "same text" in result
    comment1.delete()
    record.delete(permanent=True)


def test_record_block_recovery_based_on_record_and_kind():
    record = ln.Record(name="test-record-blocks-key").save()
    block1 = ln.models.RecordBlock(record=record, kind="readme").save()
    block2 = ln.models.RecordBlock(record=record, kind="readme")
    assert block1 == block2
    block1.delete()  # BaseSQLRecord has no soft delete; this is permanent
    block2 = ln.models.RecordBlock(record=record, kind="readme")
    assert block1 != block2  # block2 is a new block (block1 was removed)
    record.delete(permanent=True)


def test_revise_record_blocks():
    record = ln.Record(name="test-record-revise").save()
    # create a versioned record block
    block = ln.models.RecordBlock(
        record=record, content="v1", kind="readme", version="1"
    )
    assert block.version_tag == "1"
    assert block.version == "1"
    assert len(block.uid) == ln.models.RecordBlock._len_full_uid == 20
    assert len(block.stem_uid) == ln.models.RecordBlock._len_stem_uid == 16
    block.save()
    # reload same block by uid
    block_reload = ln.models.RecordBlock(record=record, uid=block.uid, kind="readme")
    assert block_reload.id == block.id
    # create new block from old block
    block_r2 = ln.models.RecordBlock(
        record=record, content="v2", kind="readme", revises=block
    )
    assert block_r2.uid != block.uid
    assert block_r2.uid.endswith("0001")
    assert block_r2.stem_uid == block.stem_uid
    assert block_r2.is_latest
    assert block.is_latest
    block_r2.save()
    assert not block.is_latest
    # create new block from newly versioned block
    block_r3 = ln.models.RecordBlock(
        record=record, content="v3", kind="readme", revises=block_r2, version="2"
    )
    assert block_r3.stem_uid == block.stem_uid
    assert block_r3.version_tag == "2"
    assert block_r3.version == "2"
    # readme always creates a new version (no hash-based dedup)
    block_r3.save()
    # so next readme for this record gets revises=block_r3
    block_same = ln.models.RecordBlock(record=record, content="v3", kind="readme")
    assert block_same.stem_uid == block_r3.stem_uid
    assert block_same.uid != block_r3.uid  # new version (0003)
    # comment does not accept revises
    with pytest.raises(ValueError) as error:
        ln.models.RecordBlock(
            record=record, content="a comment", kind="comment", revises=block
        )
    assert "revises is not allowed for kind='comment'" in error.exconly()
    # wrong kwargs
    with pytest.raises(ValueError) as error:
        ln.models.RecordBlock(record=record, x=1)
    assert "can be passed" in error.exconly()
    # record required
    with pytest.raises(ValueError) as error:
        ln.models.RecordBlock(content="x", kind="readme")
    assert "record is required" in error.exconly()
    block_r2.delete()
    block.delete()
    record.delete(permanent=True)


def test_record_block_filter_respects_default_branch_scope():
    main_branch = ln.Branch.get(name="main")
    ln.setup.switch(main_branch.name)
    main_record = ln.Record(name="record-block-main").save()
    ln.models.RecordBlock(
        record=main_record,
        content="record-block-main-content",
        kind="readme",
        branch=main_branch,
        created_on=main_branch,
    ).save()
    contrib = ln.Branch(name="record_block_scope_branch").save()
    ln.setup.switch(contrib.name)
    contrib_record = ln.Record(name="record-block-contrib").save()
    contrib_block = ln.models.RecordBlock(
        record=contrib_record,
        content="record-block-contrib-content",
        kind="readme",
        branch=contrib,
        created_on=contrib,
    ).save()
    assert (
        ln.models.RecordBlock.filter(content="record-block-contrib-content").count()
        == 1
    )
    ln.setup.switch(main_branch.name)
    assert (
        ln.models.RecordBlock.filter(content="record-block-contrib-content").count()
        == 0
    )
    contrib_block.delete()
    contrib_record.delete(permanent=True)
    main_record.delete(permanent=True)
    contrib.delete(permanent=True)


================================================
FILE: tests/core/test_branches.py
================================================
import lamindb as ln


def testbranch_id():
    # create a file with the default branch_id
    with open("./testbranch_id.txt", "w") as f:
        f.write("branch_id")
    artifact = ln.Artifact("./testbranch_id.txt", description="testbranch_id").save()
    assert artifact.branch_id == 1
    # create a collection from the artifact
    collection = ln.Collection(artifact, key="testbranch_id").save()
    # deleting a collection puts the collection, but not the linked artifact, in trash
    collection.delete()
    assert collection.ordered_artifacts[0].branch_id == 1
    result = ln.Collection.filter(key="testbranch_id")
    assert len(result) == 0
    result = ln.Collection.filter(key="testbranch_id", branch_id=1)
    assert len(result) == 0
    result = ln.Collection.filter(key="testbranch_id", branch_id=None)
    assert len(result) == 1
    # restore
    collection.restore()
    assert collection.branch_id == 1
    assert collection.ordered_artifacts[0].branch_id == 1
    # permanent delete
    collection.delete(permanent=True)
    # check the linked artifact after permanently deleting the collection
    result = ln.Artifact.filter(description="testbranch_id", branch_id=None)
    assert len(result) == 1


================================================
FILE: tests/core/test_can_curate.py
================================================
import bionty as bt
import lamindb as ln
import pytest
from lamindb.errors import ValidationError

# some validate tests are in test_queryset


def test_inspect():
    ln.Schema.filter().delete(permanent=True)
    bt.Gene.filter().delete(permanent=True)
    result = bt.Gene.inspect("TCF7", "symbol", organism="human")
    assert result.validated == []
    bt.Gene.from_source(symbol="TCF7", organism="human").save()
    result = bt.Gene.inspect("TCF7", organism="human")
    assert bt.Gene.validate("TCF7", organism="human")
    result = bt.Gene.inspect(["TCF7", "ABC1"], "symbol", organism="human")
    assert result.validated == ["TCF7"]
    # clean up
    bt.Gene.filter().delete(permanent=True)


# if a record was added to the DB via a different source
# it will still be validated because it's in the DB
def test_inspect_source():
    source1 = bt.Source.get(entity="bionty.CellType", name="cl")
    source2 = bt.CellType.add_source(source="cl", version="2022-08-16")
    bt.CellType.from_source(name="T cell", source=source1).save()
    assert bt.CellType.inspect("T-cell", source=source2, mute=True).synonyms_mapper == {
        "T-cell": "T cell"
    }
    assert (
        bt.CellType.inspect(
            "T-cell", source=source2, mute=True, strict_source=True
        ).synonyms_mapper
        == {}
    )
    assert bt.CellType.validate("T cell", source=source2, mute=True).sum() == 1
    assert (
        bt.CellType.validate(
            "T cell", source=source2, mute=True, strict_source=True
        ).sum()
        == 0
    )
    assert bt.CellType.standardize("T-cell",
source=source2, mute=True) == "T cell" # here still standardized because of bionty assert ( bt.CellType.standardize("T-cell", source=source2, mute=True, strict_source=True) == "T cell" ) bt.CellType.filter().delete(permanent=True) def test_standardize(): # synonym not in the database result = bt.Gene.standardize(["ABC1", "PDCD1"], organism="human") assert result == ["HEATR6", "PDCD1"] result = bt.Gene.standardize( ["ABC1", "PDCD1"], field=bt.Gene.symbol, organism="human" ) assert result == ["HEATR6", "PDCD1"] mapper = bt.Gene.standardize( ["ABC1", "PDCD1"], return_mapper=True, organism="human" ) assert mapper == {"ABC1": "HEATR6"} # synonym already in the database bt.Gene.from_source(symbol="LMNA", organism="human").save() mapper = bt.Gene.standardize(["ABC1", "LMN1"], return_mapper=True, organism="human") assert mapper == {"LMN1": "LMNA", "ABC1": "HEATR6"} assert bt.Gene.standardize(["LMNA"], organism="human") == ["LMNA"] assert bt.Gene.standardize("LMNA", organism="human") == "LMNA" assert bt.Gene.standardize(["LMN1"], return_mapper=True, organism="human") == { "LMN1": "LMNA" } def test_standardize_from_source(): result = bt.Gene.standardize(["ABC1", "PDCD1"], from_source=False) assert result == ["ABC1", "PDCD1"] def test_add_remove_synonym(): bt.CellType.filter().delete(permanent=True) # a registry that doesn't have a synonyms column user = ln.User.get(handle=ln.setup.settings.user.handle) with pytest.raises(NotImplementedError): user.add_synonym("syn") cell_types = bt.CellType.from_values(["T cell", "B cell"], "name") ln.save(cell_types) tcell = bt.CellType.get(name="T cell") bcell = bt.CellType.get(name="B cell") tcell.add_synonym(["my cell type"]) tcell.add_synonym("") tcell.add_synonym([]) assert "my cell type" in tcell.synonyms with pytest.raises(ValidationError): bcell.add_synonym("my cell type") with pytest.raises(ValidationError): tcell.add_synonym("my|celltype") tcell.remove_synonym("my cell type") assert "my cell type" not in tcell.synonyms bcell.synonyms = None bcell.save() tcell.synonyms = None tcell.save() tcell.add_synonym("") tcell.add_synonym([""]) tcell.add_synonym([]) tcell.add_synonym(["my cell type"]) tcell.add_synonym("") tcell.add_synonym([""]) tcell.add_synonym([]) assert tcell.synonyms == "my cell type" tcell.remove_synonym("my cell type") # clean up bt.CellType.filter().delete(permanent=True) def test_set_abbr(): bt.CellType.filter().delete(permanent=True) bt.CellType(name="my cell type").save() record = bt.CellType.get(name="my cell type") # if abbr is name, do not add to synonyms record.set_abbr("my cell type") assert record.abbr == "my cell type" assert record.synonyms is None record.set_abbr("myct") assert record.abbr == "myct" assert "myct" in record.synonyms source = bt.Source.filter(organism="human").first() with pytest.raises(AttributeError) as error: source.set_abbr("abbr") assert ( error.exconly() == "AttributeError: 'Source' object has no attribute 'set_abbr'" ) record.delete() def test_validate_int(): result = ln.User.validate([1, 2, 3], field=ln.User.id) assert result.sum() == 1 def test_synonym_mapping(): # only name field can be standardized bt.Gene.from_source(symbol="TNFRSF4", organism="human").save() result = bt.Gene.inspect( ["CD134", "TNFRSF4"], field=bt.Gene.symbol, organism="human" ) assert result.synonyms_mapper == {"CD134": "TNFRSF4"} result = bt.Gene.inspect( ["CD134", "TNFRSF4"], field=bt.Gene.ensembl_gene_id, organism="human" ) assert result.synonyms_mapper == {} bt.Gene.filter().delete(permanent=True) def 
test_validate_called_on_object_raises_error(): """Calling validate() on an object must raise TypeError.""" label = ln.ULabel(name="test_label").save() with pytest.raises(TypeError) as error: label.validate(["test_value"]) assert ( "ULabel.validate() is a class method and must be called on the ULabel class, not on a ULabel object" in str(error.value) ) def test_standardize_source(): """When passing a specific source to standardize, any matched public records must come from the passed source.""" # 'HANCESTRO:0006' in Hancestro 3.0 but 'HANCESTRO:0848' in later versions assert ( bt.Ethnicity.standardize( ["South Asian"], field="name", return_field="ontology_id", source=bt.Source( entity="bionty.Ethnicity", version="3.0", name="hancestro", organism="human", ), )[0] == "HANCESTRO:0006" ) ================================================ FILE: tests/core/test_collection.py ================================================ import re import anndata as ad import lamindb as ln import numpy as np import pandas as pd import pytest from lamindb.errors import FieldValidationError from scipy.sparse import csc_matrix, csr_matrix @pytest.fixture(scope="module") def df(): return pd.DataFrame({"feat1": [1, 2], "feat2": [3, 4]}) @pytest.fixture(scope="module") def adata(): return ad.AnnData( X=np.array([[1, 2, 3], [4, 5, 6]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, raw={"X": np.array([[8, 9, 10, 11], [12, 13, 14, 15]])}, ) @pytest.fixture(scope="module") def adata2(): return ad.AnnData( X=np.array([[1, 2, 5], [4, 5, 8]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, ) def test_from_single_artifact(adata): features = ln.Feature.from_dataframe(adata.obs) validated = ln.Feature.validate( [feature.name for feature in features], field="name" ) ln.save([feature for (feature, valid) in zip(features, validated) if valid]) artifact = ln.Artifact.from_anndata(adata, description="My adata") if not artifact._state.adding: artifact.delete(permanent=True) # make sure we get a fresh one artifact = ln.Artifact.from_anndata(adata, description="My adata") with pytest.raises(ValueError) as error: ln.Collection(artifact, key="Test") assert str(error.exconly()).startswith( "ValueError: Not all artifacts are yet saved, please save them" ) artifact.save() with pytest.raises(ValueError) as error: ln.Collection(artifact, artifact) assert str(error.exconly()).startswith( "ValueError: Only one non-keyword arg allowed: artifacts" ) transform = ln.Transform(key="My test transform").save() run = ln.Run(transform).save() collection = ln.Collection(artifact, key="My new collection", run=run).save() assert collection.run.input_artifacts.get() == artifact collection.delete(permanent=True) artifact.delete(permanent=True) assert ln.Artifact.filter(id=artifact.id).one_or_none() is None def test_edge_cases(df, ccaplog): with pytest.raises( FieldValidationError, match=re.escape( "Only artifacts, key, description, meta, reference, reference_type, run, revises, skip_hash_lookup can be passed" ), ) as error: ln.Collection(df, invalid_param=1) with pytest.raises(ValueError) as error: ln.Collection(1, key="Invalid") assert str(error.exconly()).startswith( "ValueError: Artifact or list[Artifact] is allowed." 
) artifact = ln.Artifact.from_dataframe(df, description="Test artifact") assert artifact._state.adding with pytest.raises(ValueError) as error: ln.Collection([artifact]) assert str(error.exconly()).startswith( "ValueError: Not all artifacts are yet saved, please save them" ) artifact.save() ln.Collection([artifact, artifact], key="test-collection") assert "your collection contains artifacts with non-unique hashes:" in ccaplog.text artifact.delete(permanent=True) def test_from_inconsistent_artifacts(df, adata): artifact1 = ln.Artifact.from_dataframe(df, description="My test").save() artifact2 = ln.Artifact.from_anndata(adata, description="My test2").save() collection = ln.Collection([artifact1, artifact2], key="Inconsistent").save() # test idempotency of .save() collection.save() # create a run context ln.track(transform=ln.Transform(key="My test transform")) # can iterate over them collection.cache() assert set(ln.context.run.input_collections.all()) == {collection} # loading will throw an error here with pytest.raises(ValueError) as error: collection.load() assert str(error.exconly()).startswith( "ValueError: Can only load collections where all artifacts have the same suffix" ) # test through query set with pytest.raises(ValueError) as error: collection.artifacts.all().load() assert str(error.exconly()).startswith( "ValueError: Can only load collections where all artifacts have the same suffix" ) collection.describe() collection.delete(permanent=True) artifact1.delete(permanent=True) artifact2.delete(permanent=True) ln.context._run = None def test_from_consistent_artifacts(adata, adata2): artifact1 = ln.Artifact.from_anndata(adata, key="my_test.h5ad").save() artifact2 = ln.Artifact.from_anndata(adata2, key="my_test.h5ad").save() transform = ln.Transform(key="My test transform").save() run = ln.Run(transform).save() initial_key = "My test" collection = ln.Collection([artifact1, artifact2], key=initial_key, run=run) assert collection._state.adding collection.save() assert set(collection.run.input_artifacts.all()) == {artifact1, artifact2} adata_joined = collection.load() assert "artifact_uid" in adata_joined.obs.columns assert artifact1.uid in adata_joined.obs.artifact_uid.cat.categories # test from query set through collection adata_joined = collection.artifacts.order_by("-created_at").load() assert "artifact_uid" in adata_joined.obs.columns assert artifact1.uid in adata_joined.obs.artifact_uid.cat.categories # re-run with hash-based lookup collection2 = ln.Collection([artifact1, artifact2], key="My test 1", run=run) assert collection2 == collection assert collection2.key == "My test 1" # key is updated # skip hash lookup collection2 = ln.Collection( [artifact1, artifact2], key="My test 1", run=run, skip_hash_lookup=True ) assert collection2 != collection # let hash uniqueness constraint fail and database return the existing record collection2 = ln.Collection( [artifact1, artifact2], key=initial_key, run=run, skip_hash_lookup=True ).save() assert collection2 == collection # move to trash and then re-run collection.delete() collection2 = ln.Collection([artifact1, artifact2], key="My test 2", run=run) assert collection2 != collection assert collection2.key == "My test 2" collection.delete(permanent=True) artifact1.delete(permanent=True) artifact2.delete(permanent=True) def test_mapped(adata, adata2): # prepare test data adata.strings_to_categoricals() adata.obs["feat2"] = adata.obs["feat1"] adata.layers["layer1"] = adata.X.copy() adata.layers["layer1"][0, 0] = 0 artifact1 = 
ln.Artifact.from_anndata(adata, key="part_one.h5ad").save() adata2.X = csr_matrix(adata2.X) adata2.layers["layer1"] = adata2.X.copy() adata2.obs["feat2"] = adata2.obs["feat1"] artifact2 = ln.Artifact.from_anndata( adata2, key="part_two.zarr", format="zarr" ).save() adata3 = adata2.copy() adata3.var_names = ["A", "B", "C"] adata3.obs.loc["0", "feat1"] = np.nan artifact3 = ln.Artifact.from_anndata(adata3, key="other_vars.h5ad").save() adata4 = adata.copy() adata4.layers["layer1"] = csc_matrix(adata4.layers["layer1"]) artifact4 = ln.Artifact.from_anndata(adata4, description="csc layer").save() collection_outer = ln.Collection( [artifact1, artifact2, artifact3], key="gather_outer" ).save() collection_csc = ln.Collection([artifact4, artifact2], key="check_csc").save() collection = ln.Collection([artifact1, artifact2], key="gather") # test mapped without saving first with collection.mapped() as ls_ds: assert ls_ds.__class__.__name__ == "MappedCollection" collection.save() # test encoders with pytest.raises(ValueError): ls_ds = collection.mapped(encode_labels=["feat1"]) with pytest.raises(ValueError): ls_ds = collection.mapped(obs_keys="feat1", encode_labels=["feat3"]) with pytest.raises(ValueError): ls_ds = collection.mapped(obs_keys="feat1", unknown_label={"feat3": "Unknown"}) with collection.mapped(obs_keys=["feat1", "feat2"], unknown_label="A") as ls_ds: assert ls_ds.encoders["feat1"]["A"] == -1 assert ls_ds.encoders["feat1"]["B"] == 0 assert ls_ds.encoders["feat2"]["A"] == -1 assert ls_ds.encoders["feat2"]["B"] == 0 assert ls_ds[0]["feat1"] == -1 assert ls_ds[1]["feat1"] == 0 assert ls_ds[0]["feat2"] == -1 assert ls_ds[1]["feat2"] == 0 with collection.mapped( obs_keys=["feat1", "feat2"], unknown_label={"feat1": "A"} ) as ls_ds: assert ls_ds.encoders["feat1"]["A"] == -1 assert ls_ds.encoders["feat1"]["B"] == 0 # categories in the encoder are sorted A_enc = ls_ds.encoders["feat2"]["A"] assert A_enc == 0 B_enc = ls_ds.encoders["feat2"]["B"] assert B_enc == 1 assert ls_ds[0]["feat1"] == -1 assert ls_ds[1]["feat1"] == 0 assert ls_ds[0]["feat2"] == A_enc assert ls_ds[1]["feat2"] == B_enc with collection.mapped( obs_keys=["feat1", "feat2"], unknown_label="A", encode_labels=["feat1"] ) as ls_ds: assert ls_ds.encoders["feat1"]["A"] == -1 assert ls_ds.encoders["feat1"]["B"] == 0 assert "feat2" not in ls_ds.encoders assert ls_ds[0]["feat1"] == -1 assert ls_ds[1]["feat1"] == 0 assert ls_ds[0]["feat2"] == "A" assert ls_ds[1]["feat2"] == "B" ls_ds = collection.mapped(obs_keys="feat1") assert not ls_ds.closed assert len(ls_ds) == 4 assert len(ls_ds[0]) == 3 and len(ls_ds[2]) == 3 assert len(ls_ds[0]["X"]) == 3 assert np.array_equal(ls_ds[2]["X"], np.array([1, 2, 5])) weights = ls_ds.get_label_weights("feat1") assert len(weights) == 4 assert all(weights == 0.5) weights = ls_ds.get_label_weights(["feat1", "feat2"]) assert len(weights) == 4 assert all(weights == 0.5) weights = ls_ds.get_label_weights(["feat1", "feat2"], scaler=1.0) assert all(weights == 1.0 / 3.0) weights = ls_ds.get_label_weights( ["feat1", "feat2"], scaler=1.0, return_categories=True ) assert weights["A__A"] == 1.0 / 3.0 assert weights["B__B"] == 1.0 / 3.0 assert not ls_ds.check_vars_sorted(ascending=True) assert not ls_ds.check_vars_sorted(ascending=False) assert ls_ds.check_vars_non_aligned(["MYC", "TCF7", "GATA1"]) == [] ls_ds.var_list = None assert not ls_ds.check_vars_sorted() ls_ds.var_list = None assert ls_ds.check_vars_non_aligned(["MYC", "TCF7", "GATA1"]) == [] ls_ds.close() assert ls_ds.closed del ls_ds with 
collection.mapped(obs_keys="feat1", join="inner", dtype="float32") as ls_ds: assert not ls_ds.closed assert len(ls_ds) == 4 assert len(ls_ds[0]) == 3 and len(ls_ds[2]) == 3 assert str(ls_ds[0]["X"].dtype) == "float32" assert str(ls_ds[2]["X"].dtype) == "float32" assert ls_ds.closed ls_ds = collection.mapped(obs_keys="feat1", parallel=True) assert len(ls_ds[0]) == 3 and len(ls_ds[2]) == 3 assert ls_ds[0]["_store_idx"] == 0 assert ls_ds[2]["_store_idx"] == 1 ls_ds = collection.mapped( layers_keys=["layer1"], obsm_keys=["X_pca"], obs_keys="feat1" ) assert np.array_equal(ls_ds[0]["layer1"], np.array([0, 2, 3])) assert np.array_equal(ls_ds[2]["layer1"], np.array([1, 2, 5])) assert np.array_equal(ls_ds[2]["obsm_X_pca"], np.array([1, 2])) assert np.array_equal(ls_ds[3]["obsm_X_pca"], np.array([3, 4])) assert ls_ds.shape == (4, 3) assert ls_ds.original_shapes[0] == (2, 3) and ls_ds.original_shapes[1] == (2, 3) ls_ds.close() # keys not present in a store are ignored (omitted from output) with collection.mapped( obs_keys=["feat1", "feat_missing"], obsm_keys=["X_pca", "X_missing"], layers_keys=["X", "raw.X"], ) as ls_ds: assert len(ls_ds) == 4 ls_ds_idx = ls_ds[0] assert ls_ds_idx["X"].shape == (3,) assert ls_ds_idx["raw.X"].shape == (4,) assert "feat1" in ls_ds_idx assert "feat_missing" not in ls_ds_idx assert "obsm_X_pca" in ls_ds_idx assert "obsm_X_missing" not in ls_ds_idx assert "raw.X" not in ls_ds[2] # test with QuerySet query_set = ln.Artifact.filter(key__in=["part_one.h5ad", "part_two.zarr"]) with query_set.mapped() as ls_ds: assert ls_ds.shape == (4, 3) with query_set.order_by("created_at").mapped(stream=True) as ls_ds: assert ls_ds.shape == (4, 3) with collection.mapped(obs_keys="feat1", stream=True) as ls_ds: assert len(ls_ds[0]) == 3 and len(ls_ds[2]) == 3 with pytest.raises(ValueError): with collection_outer.mapped(obs_keys="feat1", join="inner"): pass with collection_outer.mapped( layers_keys="X", obsm_keys="X_pca", obs_keys="feat1", join="outer" ) as ls_ds: assert ls_ds.shape == (6, 6) assert ls_ds.join_vars == "outer" assert len(ls_ds.var_joint) == 6 assert len(ls_ds[0]) == 4 assert len(ls_ds[0]["X"]) == 6 assert np.array_equal(ls_ds[0]["X"], np.array([0, 0, 0, 3, 1, 2])) assert np.array_equal(ls_ds[1]["X"], np.array([0, 0, 0, 6, 4, 5])) assert np.array_equal(ls_ds[2]["X"], np.array([0, 0, 0, 5, 1, 2])) assert np.array_equal(ls_ds[3]["X"], np.array([0, 0, 0, 8, 4, 5])) ls_ds_idx = ls_ds[4] assert np.array_equal(ls_ds_idx["X"], np.array([1, 2, 5, 0, 0, 0])) assert ls_ds_idx["feat1"] is np.nan assert np.array_equal(ls_ds[5]["X"], np.array([4, 5, 8, 0, 0, 0])) assert np.issubdtype(ls_ds[2]["X"].dtype, np.integer) assert np.issubdtype(ls_ds[4]["X"].dtype, np.integer) assert np.array_equal(ls_ds[3]["obsm_X_pca"], np.array([3, 4])) assert ls_ds.check_vars_non_aligned(["MYC", "TCF7", "GATA1"]) == [2] assert not ls_ds.check_vars_sorted() assert len(ls_ds.get_label_weights("feat1")) == 6 with collection_outer.mapped(layers_keys="layer1", join="outer") as ls_ds: assert np.array_equal(ls_ds[0]["layer1"], np.array([0, 0, 0, 3, 0, 2])) assert np.array_equal(ls_ds[4]["layer1"], np.array([1, 2, 5, 0, 0, 0])) # csc matrix in layers with pytest.raises(ValueError): collection_csc.mapped(layers_keys="layer1") # test with obs_filter # tuple as obs_filter is deprecated, test anyways for now with collection.mapped(obs_filter=("feat1", ("A", "B"))) as ls_ds: assert ls_ds.shape == (4, 3) assert np.array_equal(ls_ds[1]["X"], np.array([4, 5, 6])) assert np.array_equal(ls_ds[3]["X"], np.array([4, 5, 8])) 
weights = ls_ds.get_label_weights("feat1") assert len(weights) == 4 assert all(weights == 0.5) # tuple as obs_filter is deprecated, test anyways for now with collection.mapped(obs_filter=("feat1", "B")) as ls_ds: assert ls_ds.shape == (2, 3) assert np.array_equal(ls_ds[0]["X"], np.array([4, 5, 6])) assert np.array_equal(ls_ds[1]["X"], np.array([4, 5, 8])) weights = ls_ds.get_label_weights("feat2") assert len(weights) == 2 assert all(weights == 0.5) with collection.mapped(obs_filter={"feat1": "B", "feat2": ("A", "B")}) as ls_ds: assert ls_ds.shape == (2, 3) assert ls_ds.original_shapes == [(1, 3), (1, 3)] assert np.array_equal(ls_ds[0]["X"], np.array([4, 5, 6])) assert np.array_equal(ls_ds[1]["X"], np.array([4, 5, 8])) weights = ls_ds.get_label_weights("feat2") assert len(weights) == 2 assert all(weights == 0.5) # nan in filtering values with collection_outer.mapped(obs_filter={"feat1": np.nan}, join="outer") as ls_ds: assert ls_ds.shape == (1, 6) assert np.array_equal(ls_ds[0]["X"], np.array([1, 2, 5, 0, 0, 0])) with collection_outer.mapped( obs_filter={"feat1": (np.nan,), "feat2": ["A", "B"]}, join="outer" ) as ls_ds: assert ls_ds.shape == (1, 6) with collection_outer.mapped( obs_filter={"feat1": (np.nan, "A", "B")}, join="outer" ) as ls_ds: assert ls_ds.shape == (6, 6) with collection_outer.mapped( obs_filter={"feat1": ["A", "B"]}, join="outer" ) as ls_ds: assert ls_ds.shape == (5, 6) with collection_outer.mapped( obs_filter={"feat1": ("A", np.nan)}, join="outer" ) as ls_ds: assert ls_ds.shape == (3, 6) collection.delete(permanent=True) collection_outer.delete(permanent=True) collection_csc.delete(permanent=True) artifact1.delete(permanent=True) artifact2.delete(permanent=True) artifact3.delete(permanent=True) artifact4.delete(permanent=True) def test_revise_collection(df, adata): # create a versioned collection artifact = ln.Artifact.from_dataframe(df, description="test").save() collection = ln.Collection(artifact, key="test-collection", version="1") assert collection.version_tag == "1" assert collection.version == "1" assert collection.uid.endswith("0000") collection.save() artifact = ln.Artifact.from_anndata(adata, description="test").save() with pytest.raises(ValueError) as error: collection_r2 = ln.Collection(artifact, revises=collection, version="1") assert ( error.exconly() == "ValueError: Please change the version tag or leave it `None`, '1' is already taken" ) with pytest.raises(TypeError): ln.Collection(adata, revises="wrong-type") # create new collection from old collection collection_r2 = ln.Collection(artifact, key="test-collection") assert collection_r2.stem_uid == collection.stem_uid assert collection_r2.uid.endswith("0001") # repeat collection_r2 = ln.Collection(artifact, key="test-collection") assert collection_r2.stem_uid == collection.stem_uid assert collection_r2.uid.endswith("0001") assert collection_r2.version_tag is None assert ( collection_r2.version == collection_r2.uid[-4:] ) # version falls back to uid suffix assert collection_r2.key == "test-collection" collection_r2.save() # create new collection from newly versioned collection df.iloc[0, 0] = 0 artifact = ln.Artifact.from_dataframe(df, description="test") artifact.save() collection_r3 = ln.Collection( artifact, key="test-collection", description="test description3", version="2", ) assert collection_r3.stem_uid == collection.stem_uid assert collection_r3.version_tag == "2" assert collection_r3.version == "2" assert collection_r3.uid.endswith("0002") assert collection_r3.key == "test-collection" assert 
collection_r3.description == "test description3" artifacts_r2 = collection_r2.artifacts.all() collection_r2.delete(permanent=True) artifacts_r2.delete(permanent=True) artifacts = collection.artifacts.all() collection.delete(permanent=True) artifacts.delete(permanent=True) def test_collection_append(df, adata): artifact = ln.Artifact.from_dataframe(df, description="test").save() artifact_1 = ln.Artifact.from_anndata(adata, description="test").save() collection = ln.Collection(artifact, key="Test", description="Test append").save() new_collection = collection.append(artifact_1).save() assert new_collection.key == collection.key assert new_collection.description == collection.description assert new_collection.uid.endswith("0001") artifacts = new_collection.artifacts.all() assert len(artifacts) == 2 new_collection.versions.delete(permanent=True) artifacts.delete(permanent=True) def test_with_metadata(df, adata): meta_artifact = ln.Artifact.from_dataframe(df, description="test") meta_artifact.save() data_artifact = ln.Artifact.from_anndata(adata, description="test adata") data_artifact.save() collection = ln.Collection( data_artifact, key="test collection", meta_artifact=meta_artifact ) collection.save() assert collection.meta_artifact == meta_artifact assert collection.data_artifact == data_artifact collection.delete(permanent=True) data_artifact.delete(permanent=True) meta_artifact.delete(permanent=True) def test_collection_get_tracking(df): artifact = ln.Artifact.from_dataframe(df, key="df.parquet").save() collection = ln.Collection(artifact, key="track-collection").save() transform = ln.Transform(key="test track collection via get").save() run = ln.Run(transform).save() assert ( ln.Collection.get(key="track-collection", is_run_input=run) in run.input_collections.all() ) collection.delete(permanent=True) artifact.delete(permanent=True) transform.delete(permanent=True) def test_describe_collection(adata, capsys): artifact = ln.Artifact(adata, description="test").save() collection = ln.Collection(artifact, key="test").save() collection.describe() captured = capsys.readouterr() assert len(captured.out) > 50 assert "collection" in captured.out.lower() # test describing from a remote postgres instance with less modules collection = ln.Collection.connect("laminlabs/lamin-dev").first() collection.describe() captured = capsys.readouterr() assert len(captured.out) > 50 assert "collection" in captured.out.lower() ================================================ FILE: tests/core/test_curator_basics.py ================================================ import re import textwrap import bionty as bt import lamindb as ln import pandas as pd import pytest from lamindb.core.exceptions import ValidationError def _strip_ansi(text: str) -> str: """Remove ANSI escape sequences from a string.""" ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") return ansi_escape.sub("", text) @pytest.fixture def df() -> pd.DataFrame: return pd.DataFrame( { "sample_id": ["sample1", "sample2"], "sample_name": ["Sample 1", "Sample 2"], "sample_type": ["Type A", "Type B"], } ) @pytest.fixture def df_missing_sample_type_column() -> pd.DataFrame: return pd.DataFrame( { "sample_id": ["sample1", "sample2"], "sample_name": ["Sample 1", "Sample 2"], } ) @pytest.fixture def df_missing_sample_name_column() -> pd.DataFrame: return pd.DataFrame( { "sample_id": ["sample1", "sample2"], "sample_type": ["Type A", "Type B"], } ) @pytest.fixture def df_changed_col_order() -> pd.DataFrame: return pd.DataFrame( { "sample_name": 
["Sample 1", "Sample 2"], "sample_type": ["Type A", "Type B"], "sample_id": ["sample1", "sample2"], } ) @pytest.fixture def df_extra_column() -> pd.DataFrame: return pd.DataFrame( { "sample_id": ["sample1", "sample2"], "sample_name": ["Sample 1", "Sample 2"], "sample_type": ["Type A", "Type B"], "extra_column": ["Extra 1", "Extra 2"], } ) @pytest.fixture def df_disease() -> pd.DataFrame: return pd.DataFrame( { "disease": pd.Categorical( [ # Only after 2025 mondo "HDAC4-related haploinsufficiency syndrome", "SAMD9L-related spectrum and myeloid neoplasm risk", # Already before 2025 mondo "essential hypertension", "essential hypertension", "asthma", ] ), } ) @pytest.fixture def disease_ontology_old() -> bt.Source: return bt.Disease.add_source( bt.Source.connect("laminlabs/bionty-assets") .get(entity="bionty.Disease", version="2024-08-06", organism="all") .save() ) @pytest.fixture(scope="module") def lists_df(): return pd.DataFrame( { "sample_id": [["sample1", "sample2"], ["sample2"], ["sample3"]], "dose": [[1.2, 2.3], [1.2], [2.3]], "cell_type": [["B cell", "T cell"], ["B cell"], ["T cell"]], "tissue": [["blood", "pulmo"], ["blood"], ["lung"]], } ) @pytest.fixture(scope="module") def cat_df(): return pd.DataFrame( { "sample_id": [["sample1", "sample2"], ["sample2"], ["sample3"]], "dose": [[1.2, 2.3], [1.2], [2.3]], "cell_type": [["B cell", "T cell"], ["B cell"], ["T cell"]], "tissue": ["blood", "blood", "lung"], } ) def test_curator_df_multivalue(lists_df, cat_df): feature1 = ln.Feature(name="sample_id", dtype=list[str]).save() feature2 = ln.Feature(name="dose", dtype=list[float]).save() feature3 = ln.Feature(name="cell_type", dtype=list[str]).save() feature4 = ln.Feature(name="tissue", dtype=list[bt.Tissue]).save() schema = ln.Schema( name="lists schema cat", features=[ feature1, feature2, feature3, feature4, ], ).save() curator = ln.curators.DataFrameCurator(lists_df, schema) with pytest.raises(ValidationError): curator.validate() assert curator.cat._cat_vectors.keys() == {"columns", "tissue"} assert curator.cat._cat_vectors["tissue"]._validated == ["blood", "lung"] assert curator.cat._cat_vectors["tissue"]._non_validated == ["pulmo"] assert curator.cat._cat_vectors["tissue"]._synonyms == {"pulmo": "lung"} curator.cat.standardize("tissue") assert curator.cat._cat_vectors["tissue"]._non_validated == [] assert lists_df["tissue"].tolist() == [["blood", "lung"], ["blood"], ["lung"]] assert curator.validate() is None # test with cat_df which has a non-list tissue curator = ln.curators.DataFrameCurator(cat_df, schema) with pytest.raises(ValidationError): curator.validate() schema.delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) feature3.delete(permanent=True) feature4.delete(permanent=True) def test_curators_list_feature_nullable_empty_list(): """Test that a list feature that is nullable can accept empty lists.""" feature_list = ln.Feature( name="list_tissue", dtype=list[bt.Tissue.ontology_id], nullable=True ).save() feature_int = ln.Feature(name="feature int", dtype=int, nullable=True).save() schema = ln.Schema( name="test_list_feature_schema", features=[feature_list, feature_int], coerce=True, ).save() df = pd.DataFrame({"list_tissue": [], "feature int": []}) ln.curators.DataFrameCurator(df, schema).validate() # clean up schema.delete(permanent=True) feature_list.delete(permanent=True) feature_int.delete(permanent=True) def test_curator__repr__(df): feature = ln.Feature(name="sample_id", dtype="str").save() schema = ln.Schema( name="sample schema", 
features=[feature], ).save() curator = ln.curators.DataFrameCurator(df, schema) expected_repr = textwrap.dedent("""\ DataFrameCurator(Schema: sample schema, unvalidated) """).strip() actual_repr = _strip_ansi(repr(curator)) print(actual_repr) assert actual_repr.strip() == expected_repr.strip() schema.delete(permanent=True) feature.delete(permanent=True) @pytest.mark.parametrize( "model_class", [ln.ULabel, ln.Record], ) def test_df_curator_typed_categorical(model_class): # root level sample_root_type = model_class(name="Sample", is_type=True).save() for name in ["s1", "s2"]: model_class(name=name, type=sample_root_type).save() # lab A level lab_a_type = model_class(name="LabA", is_type=True).save() sample_a_type = model_class(name="Sample", is_type=True, type=lab_a_type).save() for name in ["s3", "s4"]: model_class(name=name, type=sample_a_type).save() # lab B level lab_b_type = model_class(name="LabB", is_type=True).save() sample_b_type = model_class(name="Sample", is_type=True, type=lab_b_type).save() for name in ["s5", "s6"]: model_class(name=name, type=sample_b_type).save() df = pd.DataFrame( { "biosample_name": pd.Categorical(["s1", "s2", "s3", "s4", "s5", "s6"]), } ) feature = ln.Feature(name="biosample_name", dtype=sample_a_type).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert "4 terms not validated in feature 'biosample_name':" in error.exconly() assert set(curator.cat._cat_vectors["biosample_name"]._validated) == { "s3", "s4", } assert set(curator.cat._cat_vectors["biosample_name"]._non_validated) == { "s1", "s2", "s5", "s6", } # Move LabB under LabA lab_b_type.type = lab_a_type lab_b_type.save() feature.delete(permanent=True) # re-create the feature with the new dtype feature = ln.Feature(name="biosample_name", dtype=lab_a_type).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert set(curator.cat._cat_vectors["biosample_name"]._validated) == { "s3", "s4", "s5", "s6", } assert set(curator.cat._cat_vectors["biosample_name"]._non_validated) == { "s1", "s2", } # Lab at the root feature.delete(permanent=True) # re-create the feature with the new dtype feature = ln.Feature(name="biosample_name", dtype=sample_root_type).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert set(curator.cat._cat_vectors["biosample_name"]._validated) == { "s1", "s2", } assert set(curator.cat._cat_vectors["biosample_name"]._non_validated) == { "s3", "s4", "s5", "s6", } attribute = model_class.__name__.lower() + "s" getattr(sample_a_type, attribute).all().delete(permanent=True) getattr(sample_b_type, attribute).all().delete(permanent=True) getattr(lab_b_type, attribute).all().delete(permanent=True) getattr(lab_a_type, attribute).all().delete(permanent=True) lab_a_type.delete(permanent=True) lab_b_type.delete(permanent=True) getattr(sample_root_type, attribute).all().delete(permanent=True) sample_root_type.delete(permanent=True) feature.delete(permanent=True) def test_df_curator_same_name_at_different_levels_involving_root(): s1_root = ln.Record(name="s1").save() lab_a_type = ln.Record(name="LabA", is_type=True).save() s1_lab_a = ln.Record(name="s1", type=lab_a_type).save() df = pd.DataFrame({"biosample_name": pd.Categorical(["s1"])}) # feature constraining to 
lab_a_type feature = ln.Feature(name="biosample_name", dtype=lab_a_type).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["biosample_name"] assert cat_vector._validated == ["s1"] assert len(cat_vector.records) == 1 assert cat_vector.records[0] == s1_lab_a # feature constraining to root feature.delete(permanent=True) feature = ln.Feature(name="biosample_name", dtype=ln.Record).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["biosample_name"] assert cat_vector._validated == ["s1"] assert len(cat_vector.records) == 1 assert cat_vector.records[0] == s1_root feature.delete(permanent=True) s1_root.delete(permanent=True) s1_lab_a.delete(permanent=True) lab_a_type.delete(permanent=True) def test_df_curator_same_name_at_different_levels_below_root(): department_a_type = ln.Record(name="DepartmentA", is_type=True).save() s1_department_a = ln.Record(name="s1", type=department_a_type).save() lab_a_type = ln.Record(name="LabA", is_type=True, type=department_a_type).save() s1_lab_a = ln.Record(name="s1", type=lab_a_type).save() df = pd.DataFrame({"biosample_name": pd.Categorical(["s1"])}) # feature constraining to lab_a_type feature = ln.Feature(name="biosample_name", dtype=lab_a_type).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["biosample_name"] assert cat_vector._validated == ["s1"] assert len(cat_vector.records) == 1 assert cat_vector.records[0] == s1_lab_a # feature constraining to department_a_type feature.delete(permanent=True) feature = ln.Feature(name="biosample_name", dtype=department_a_type).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["biosample_name"] assert cat_vector._validated == ["s1"] assert len(cat_vector.records) == 1 assert cat_vector.records[0] == s1_department_a feature.delete(permanent=True) s1_department_a.delete(permanent=True) s1_lab_a.delete(permanent=True) lab_a_type.delete(permanent=True) department_a_type.delete(permanent=True) def test_df_curator_same_name_at_same_level(): # below root level lab_a_type = ln.Record(name="LabA", is_type=True).save() record_1 = ln.Record(name="s1", type=lab_a_type).save() lab_b_type = ln.Record(name="LabB", is_type=True).save() record_2 = ln.Record(name="s1", type=lab_b_type).save() df = pd.DataFrame({"biosample_name": pd.Categorical(["s1"])}) feature = ln.Feature(name="biosample_name", dtype=ln.Record).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert ( "Ambiguous match for Record 's1': found 2 records at depth 1 (under types: ['LabA', 'LabB'])" in error.exconly() ) # at root level record_1.type = None record_1.save() record_2.type = None record_2.save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert ( "Ambiguous match for Record 's1': found 2 root-level records" in error.exconly() ) feature.delete(permanent=True) record_1.delete(permanent=True) lab_a_type.delete(permanent=True) record_2.delete(permanent=True) lab_b_type.delete(permanent=True) # also see 
test_features_name_duplicates_across_equal_levels def test_curator_schema_feature_mapping(): lab_a_type = ln.Feature(name="LabA", is_type=True).save() feature1 = ln.Feature(name="sample_name", dtype="str", type=lab_a_type).save() lab_b_type = ln.Feature(name="LabB", is_type=True).save() feature2 = ln.Feature(name="sample_name", dtype="str", type=lab_b_type).save() schema = ln.Schema([feature1], name="Lab A schema").save() df = pd.DataFrame({"sample_name": ["s1", "s2"]}) curator = ln.curators.DataFrameCurator(df, schema) curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["columns"] assert len(cat_vector.records) == 1 assert len(cat_vector._validated) == 1 schema.delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) lab_a_type.delete(permanent=True) lab_b_type.delete(permanent=True) def test_dtypes_at_different_levels(ccaplog): sample_type_root = ln.Record(name="Sample", is_type=True).save() lab_a_type = ln.Record(name="LabA", is_type=True).save() sample_type_a = ln.Record(name="Sample", is_type=True, type=lab_a_type).save() s1_lab_a = ln.Record(name="s1", type=sample_type_a).save() df = pd.DataFrame({"biosample_name": pd.Categorical(["s1"])}) feature = ln.Feature(name="biosample_name", dtype=sample_type_root).save() schema = ln.Schema(features=[feature]).save() sample_type_root.delete() df = pd.DataFrame({"biosample_name": pd.Categorical(["s1"])}) # UID-based lookup can find records in trash, so curator creation should succeed # but a warning should be printed curator = ln.curators.DataFrameCurator(df, schema) assert "from trash" in ccaplog.text schema.delete(permanent=True) sample_type_root.restore() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert "1 term not validated in feature 'biosample_name': 's1'" in error.exconly() s1_root = ln.Record(name="s1", type=sample_type_root).save() curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["biosample_name"] assert cat_vector._validated == ["s1"] assert len(cat_vector.records) == 1 assert cat_vector.records[0] == s1_root # update feature dtype feature.delete(permanent=True) feature = ln.Feature(name="biosample_name", dtype=sample_type_a).save() curator = ln.curators.DataFrameCurator(df, ln.examples.schemas.valid_features()) curator.validate() cat_vector = curator._atomic_curator.cat._cat_vectors["biosample_name"] assert cat_vector._validated == ["s1"] assert len(cat_vector.records) == 1 assert cat_vector.records[0] == s1_lab_a feature.delete(permanent=True) s1_lab_a.delete(permanent=True) sample_type_a.delete(permanent=True) lab_a_type.delete(permanent=True) s1_root.delete(permanent=True) sample_type_root.delete(permanent=True) def test_nullable(): disease = ln.Feature(name="disease", dtype=ln.ULabel, nullable=False).save() schema = ln.Schema(features=[disease]).save() dataset = {"disease": pd.Categorical([pd.NA, "asthma"])} df = pd.DataFrame(dataset) curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises(ln.errors.ValidationError) as err: curator.validate() assert "non-nullable series 'disease' contains null values" in err.exconly() # make feature nullable # (needs to throw an error if already datasets were validated with it) disease.nullable = True disease.save() curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises( ValidationError, # match=re.escape("1 term is not validated: 'asthma'"), # TODO: need the message ): 
curator.validate() schema.delete(permanent=True) disease.delete(permanent=True) def test_pandera_dataframe_schema( df, df_missing_sample_type_column, df_changed_col_order, df_extra_column, df_missing_sample_name_column, ): # schemas schema_all_required = ln.Schema( name="my-schema all required", features=[ ln.Feature(name="sample_id", dtype=str).save(), ln.Feature(name="sample_name", dtype=str).save(), ln.Feature(name="sample_type", dtype=str).save(), ], ).save() schema_maximal_set = ln.Schema( name="my-schema maximal_set", features=[ ln.Feature(name="sample_id", dtype=str).save(), ln.Feature(name="sample_name", dtype=str).save(), ln.Feature(name="sample_type", dtype=str).save(), ], minimal_set=False, maximal_set=True, ).save() schema_ordered_set = ln.Schema( name="my-schema ordered_set", features=[ ln.Feature(name="sample_id", dtype=str).save(), ln.Feature(name="sample_name", dtype=str).save(), ln.Feature(name="sample_type", dtype=str).save(), ], ordered_set=True, ).save() # minimal_set=True, all three columns are required ln.curators.DataFrameCurator(df, schema=schema_all_required).validate() # can't miss a required column with pytest.raises(ValidationError): ln.curators.DataFrameCurator( df_missing_sample_type_column, schema=schema_all_required ).validate() # doesn't care about order ln.curators.DataFrameCurator( df_changed_col_order, schema=schema_all_required ).validate() # extra column is fine ln.curators.DataFrameCurator(df_extra_column, schema=schema_all_required).validate() # maximal_set=True, extra column is *not* allowed # check that __lamindb values are OK df["__lamindb_record_uid__"] = "some_value" ln.curators.DataFrameCurator(df, schema=schema_maximal_set).validate() del df["__lamindb_record_uid__"] with pytest.raises(ValidationError): ln.curators.DataFrameCurator( df_extra_column, schema=schema_maximal_set, # extra column is not allowed ).validate() # minimal_set=False, missing column is allowed ln.curators.DataFrameCurator( df_missing_sample_type_column, schema=schema_maximal_set ).validate() # ordered_set=True, order matters with pytest.raises(ValidationError): ln.curators.DataFrameCurator( df_changed_col_order, schema=schema_ordered_set ).validate() # a feature is optional schema_optional_sample_name = ln.Schema( name="my-schema optional sample_name", features=[ ln.Feature(name="sample_id", dtype=str).save(), ln.Feature(name="sample_name", dtype=str).save().with_config(optional=True), ln.Feature(name="sample_type", dtype=str).save(), ], ).save() # missing required "sample_type" column raises an error with pytest.raises(ValidationError): ln.curators.DataFrameCurator( df_missing_sample_type_column, schema=schema_optional_sample_name, ).validate() # missing optional column "sample_name" is fine ln.curators.DataFrameCurator( df_missing_sample_name_column, schema=schema_optional_sample_name ).validate() # clean up ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_schema_not_saved(df): """Attempting to validate an unsaved Schema must error.""" feature = ln.Feature(name="cell_type", dtype=str).save() schema = ln.Schema(features=[feature]) with pytest.raises(ValueError) as excinfo: ln.curators.DataFrameCurator(df, schema) assert excinfo.exconly() == ( "ValueError: Schema must be saved before curation. Please save it using '.save()'." 
) def test_schema_artifact_annotated(df): """A passed Artifact should be annotated with a Schema if successfully curated.""" af = ln.Artifact.from_dataframe(df, key="test.parquet").save() schema = ln.Schema( name="sample schema", features=[ln.Feature(name="sample_id", dtype="str").save()], ).save() curator = ln.curators.DataFrameCurator(af, schema) curator.validate() curator.save_artifact() af_queried = ln.Artifact.filter(key="test.parquet").one() assert af_queried.schema is not None # clean up af.delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_schema_optionals(): schema = ln.Schema( name="my-schema", features=[ ln.Feature(name="sample_id", dtype=str).save(), ln.Feature(name="sample_name", dtype=str).save().with_config(optional=True), ln.Feature(name="sample_type", dtype=str).save(), ], ).save() assert schema.optionals.get().to_list("name") == [ "sample_name", ] # set sample_type to optional with pytest.raises( TypeError, match=re.escape("features must be a list of Feature records!"), ): schema.optionals.set("test") schema.optionals.set([ln.Feature.get(name="sample_type")]) assert schema.optionals.get().to_list("name") == ["sample_type"] # add sample_name to optionals with pytest.raises( TypeError, match=re.escape("features must be a list of Feature records!"), ): schema.optionals.add("test") schema.optionals.add(ln.Feature.get(name="sample_name")) assert schema.optionals.get().to_list("name") == ["sample_name", "sample_type"] # clean up ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_schema_ordered_set(df): # create features with a different order so that sample_id is not the first ln.Feature(name="sample_name", dtype=str).save() ln.Feature(name="sample_type", dtype=str).save() ln.Feature(name="sample_id", dtype=str).save() # create an ordered schema with sample_id as the first feature schema = ln.Schema( name="my-schema", features=[ ln.Feature(name="sample_id", dtype=str).save(), ln.Feature(name="sample_name", dtype=str).save(), ln.Feature(name="sample_type", dtype=str).save(), ], ordered_set=True, ).save() assert ln.curators.DataFrameCurator(df, schema=schema).validate() is None # clean up ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) @pytest.mark.parametrize("minimal_set", [True, False]) def test_schema_minimal_set_var_allowed(minimal_set): """Independent of the value of minimal_set, invalid ensembl gene IDs are allowed.""" adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") adata.var_names = [adata.var_names[0], adata.var_names[1], "NOT_VALID_ENSEMBL"] var_schema = ln.Schema( itype=bt.Gene.ensembl_gene_id, minimal_set=minimal_set, ).save() schema = ln.Schema(otype="AnnData", slots={"var.T": var_schema}).save() curator = ln.curators.AnnDataCurator(adata, schema) curator.validate() # clean up schema.delete(permanent=True) def test_schema_maximal_set_var(): """If maximal_set is True, invalid ensembl gene IDs are not allowed.""" adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") adata.var_names = [adata.var_names[0], adata.var_names[1], "NOT_VALID_ENSEMBL"] var_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id, maximal_set=True).save() schema = ln.Schema(otype="AnnData", slots={"var.T": var_schema}).save() curator = ln.curators.AnnDataCurator(adata, schema) with pytest.raises(ValidationError) as error: curator.validate() assert error.exconly() == ( "lamindb.errors.ValidationError: 1 term not 
validated in feature 'columns' in slot 'var.T': 'NOT_VALID_ENSEMBL'\n" " → fix typos, remove non-existent values, or save terms via: curator.slots['var.T'].cat.add_new_from('columns')" ) # clean up schema.delete(permanent=True) def test_feature_dtype_path(): df = pd.DataFrame( { "sample": ["Sample_X", "Sample_Y", "Sample_Y"], "fastq_1": [ "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R1_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R1_001.fastq.gz", ], "fastq_2": [ "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R2_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R2_001.fastq.gz", ], "expected_cells": [5000, 5000, 5000], } ) nextflow_schema = ln.Schema( name="nf-core/scrnaseq pipeline - params.input schema", description="https://github.com/nf-core/scrnaseq/blob/4.0.0/assets/schema_input.json", features=[ ln.Feature( name="sample", dtype="str", nullable=False, description="Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (_).", ).save(), ln.Feature( name="fastq_1", dtype="path", nullable=False, description="Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension “.fastq.gz” or “.fq.gz”.", ).save(), ln.Feature( name="fastq_2", dtype="path", description="Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension “.fastq.gz” or “.fq.gz”.", ).save(), ln.Feature( name="expected_cells", dtype=int, description="Number of cells expected for a sample. Must be an integer. If multiple rows are provided for the same sample, this must be the same number for all rows, i.e. the total number of expected cells for the sample.", ).save(), ln.Feature( name="seq_center", dtype=str, description="Sequencing center for the sample. If multiple rows are provided for the same sample, this must be the same string for all rows. 
Samples sequenced at different centers are considered different samples and must have different identifiers.", ).save(), ln.Feature( name="sample_type", dtype=str, description='"atac", "gex"', ).save(), ln.Feature( name="feature_type", dtype=str, description='"gex", "vdj", "ab", "crispr", "cmo"', ).save(), ], ).save() nextflow_schema.optionals.set( [ ln.Feature.get(name="expected_cells"), ln.Feature.get(name="seq_center"), ln.Feature.get(name="sample_type"), ln.Feature.get(name="feature_type"), ] ) curator = ln.curators.DataFrameCurator(df, schema=nextflow_schema) assert curator.validate() is None # clean up nextflow_schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_cat_filters_specific_source_uid(df_disease, disease_ontology_old): """Specific source_uid passed to the `cat_filters`""" feature = ln.Feature( name="disease", dtype=bt.Disease, cat_filters={"source__uid": disease_ontology_old.uid}, ).save() schema = ln.Schema([feature], name="test schema").save() curator = ln.curators.DataFrameCurator(df_disease, schema) try: curator.validate() except ln.errors.ValidationError as error: assert ( "2 terms not validated in feature 'disease': 'HDAC4-related haploinsufficiency syndrome', 'SAMD9L-related spectrum and myeloid neoplasm risk'" in str(error) ) schema.delete(permanent=True) feature.delete(permanent=True) def test_cat_filters_specific_source(df_disease, disease_ontology_old): """Specific Source record passed to the `cat_filters`""" feature = ln.Feature( name="disease", dtype=bt.Disease, cat_filters={"source": disease_ontology_old}, ).save() schema = ln.Schema([feature], name="test schema").save() curator = ln.curators.DataFrameCurator(df_disease, schema) try: curator.validate() except ln.errors.ValidationError as error: assert ( "2 terms not validated in feature 'disease': 'HDAC4-related haploinsufficiency syndrome', 'SAMD9L-related spectrum and myeloid neoplasm risk'" in str(error) ) schema.delete(permanent=True) feature.delete(permanent=True) def test_cat_filters_multiple_relation_filters(df_disease, disease_ontology_old): """Multiple relation filters in cat_filters""" # TODO: needs to also work if both filters are from the same related model!!! 
feature = ln.Feature( name="disease", dtype=bt.Disease, cat_filters={ "source__uid": disease_ontology_old.uid, "created_by__handle": ln.setup.settings.user.handle, }, ).save() schema = ln.Schema([feature], name="test schema").save() curator = ln.curators.DataFrameCurator(df_disease, schema) try: curator.validate() except ln.errors.ValidationError as error: assert ( "2 terms not validated in feature 'disease': 'HDAC4-related haploinsufficiency syndrome', 'SAMD9L-related spectrum and myeloid neoplasm risk'" in str(error) ) schema.delete(permanent=True) feature.delete(permanent=True) def test_curate_columns(df): """Test that columns can be curated.""" schema = ln.Schema( name="sample schema", features=[ ln.Feature(name="sample_id", dtype="str").save(), ln.Feature(name="sample_name", dtype="str").save(), ln.Feature(name="sample_type", dtype="str").save(), ], ).save() # make one column name invalid df.rename(columns={"sample_name": "sample_name_name"}, inplace=True) curator = ln.curators.DataFrameCurator(df, schema) try: curator.validate() except ln.errors.ValidationError as error: assert "column 'sample_name' not in dataframe" in str(error) # now fix the column df.rename(columns={"sample_name_name": "sample_name"}, inplace=True) curator.validate() schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_wrong_datatype(df): feature = ln.Feature(name="sample_id", dtype=ln.ULabel).save() schema = ln.Schema(features=[feature]).save() curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises(ln.errors.ValidationError) as excinfo: curator.validate() assert "expected series 'sample_id' to have type category, got object" in str( excinfo.value ) assert ( "Hint: Consider setting `feature.coerce = True` to attempt coercing values during validation to the required dtype." 
in str(excinfo.value) ) schema.delete(permanent=True) feature.delete(permanent=True) def test_hash_index_feature(df): df_index = df.set_index("sample_id") sample_name = ln.Feature(name="sample_name", dtype="str").save() sample_name.uid = "OpQAD5Ifu89t" sample_name.save() sample_type = ln.Feature(name="sample_type", dtype="str").save() sample_type.uid = "7I4u69RiCAVy" sample_type.save() sample_id = ln.Feature(name="sample_id", dtype="str").save() sample_id.uid = "uValv1YfEQib" sample_id.save() schema_index = ln.Schema( name="sample schema with index", features=[ sample_name, sample_type, ], index=sample_id, ).save() assert schema_index.hash == "drtQMP4N4xEebS49DO-9Jw" schema = ln.Schema( name="sample schema", features=[ sample_id, sample_name, sample_type, ], ).save() assert schema.hash == "Z_dmk1WendD15s2FyBW1HA" artifact = ln.Artifact.from_dataframe( df_index, key="curated_df.parquet", schema=schema_index ).save() assert artifact.schemas.all().one() == schema_index # clean up artifact.delete(permanent=True) schema_index.delete(permanent=True) schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_add_new_from_subtype(df): """Test that add_new_from works with subtypes.""" sample_type = ln.Record(name="SampleType", is_type=True).save() ln.Record(name="Type A", type=sample_type).save() schema = ln.Schema( name="sample schema", features=[ ln.Feature(name="sample_id", dtype="str").save(), ln.Feature(name="sample_name", dtype="str").save(), ln.Feature(name="sample_type", dtype=sample_type).save(), ], coerce=True, ).save() curator = ln.curators.DataFrameCurator(df, schema) try: curator.validate() except ln.errors.ValidationError as error: assert "1 term not validated in feature 'sample_type': 'Type B'" in str(error) # add new from subtype curator.cat.non_validated["sample_type"] curator.cat.add_new_from("sample_type") curator.validate() assert sample_type.records.to_list("name") == ["Type A", "Type B"] # clean up schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) ln.Record.filter().update(type=None) ln.Record.filter().delete(permanent=True) def test_index_feature_exclusion_from_categoricals(df): df_indexed = df.set_index("sample_id") sample_type_feature = ln.Feature(name="sample_type", dtype="cat[ULabel]").save() sample_id_feature = ln.Feature(name="sample_id", dtype="cat[ULabel]").save() # schema with sample_id as index (not in features) schema = ln.Schema(features=[sample_type_feature], index=sample_id_feature).save() curator = ln.curators.DataFrameCurator(df_indexed, schema) # Verify that only sample_type is in categoricals, not sample_id (index) categoricals_names = [ f.name for f in curator._atomic_curator._cat_manager._categoricals ] assert "sample_type" in categoricals_names assert "sample_id" not in categoricals_names # Verify the cat_vectors do not include the index feature cat_vector_keys = list(curator.cat._cat_vectors.keys()) assert "sample_type" in cat_vector_keys assert "sample_id" not in cat_vector_keys assert "columns" in cat_vector_keys # clean up schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) ================================================ FILE: tests/core/test_data_migrations.py ================================================ """Tests for PostgreSQL data migrations.""" import os import lamindb as ln import pytest @pytest.mark.skipif( os.getenv("LAMINDB_TEST_DB_VENDOR") != "postgresql", reason="PostgreSQL-specific migration test", ) def test_migrate_auxiliary_fields_postgres(): """Test PostgreSQL 
migration of auxiliary fields for models. This test verifies that migrate_auxiliary_fields_postgres correctly migrates: **Artifact:** - _save_completed from _aux['af']['0'] **Run:** - cli_args from _aux['af']['0'] **Feature:** - default_value from _aux['af']['0'] - nullable from _aux['af']['1'] (default: True) - coerce from _aux['af']['2'] (default: False) - For type features, all values are set to NULL **Schema:** - coerce from _aux['af']['0'] - flexible from _aux['af']['2'] (or computes from n_members) - Converts negative n_members to NULL - For type schemas, all values are set to NULL - Preserves '1' (optionals) and '3' (index_feature_uid) in _aux """ from django.db import connection from lamindb.models.schema import migrate_auxiliary_fields_postgres # === Setup test data === # Create a Transform and Run for testing transform = ln.Transform(key="test_migration_transform").save() run = ln.Run(transform=transform).save() # Create an Artifact for testing artifact = ln.Artifact(".gitignore", key="test_migration_artifact").save() # Create Features for testing (type and regular) type_feature = ln.Feature( name="TestMigrationTypeFeat", dtype=str, is_type=True ).save() regular_feature = ln.Feature(name="test_migration_regular_feat", dtype=str).save() # Create Schemas for testing (type and regular) type_schema = ln.Schema(name="TestMigrationTypeSchema", is_type=True).save() feature_for_schema1 = ln.Feature( name="test_migration_schema_feat1", dtype=str ).save() feature_for_schema2 = ln.Feature( name="test_migration_schema_feat2", dtype=str ).save() regular_schema = ln.Schema( name="TestMigrationRegularSchema", features=[feature_for_schema1, feature_for_schema2], coerce=True, flexible=True, ).save() # === Add _save_completed column temporarily (removed in migration 0173) === with connection.cursor() as cursor: cursor.execute( """ DO $$ BEGIN IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'lamindb_artifact' AND column_name = '_save_completed' ) THEN ALTER TABLE lamindb_artifact ADD COLUMN _save_completed BOOLEAN; END IF; END $$; """ ) # === Set old-style _aux data to simulate pre-migration state === with connection.cursor() as cursor: # Artifact: set _aux with af containing _save_completed value cursor.execute( """ UPDATE lamindb_artifact SET _aux = '{"af": {"0": true}}'::jsonb, _save_completed = NULL WHERE id = %s """, [artifact.id], ) # Run: set _aux with af containing cli_args value cursor.execute( """ UPDATE lamindb_run SET _aux = '{"af": {"0": "--verbose --debug"}}'::jsonb, cli_args = NULL WHERE id = %s """, [run.id], ) # Feature (type): set _aux with af keys that should result in NULL values cursor.execute( """ UPDATE lamindb_feature SET _aux = '{"af": {"0": "default_val", "1": false, "2": true}}'::jsonb, default_value = NULL, nullable = NULL, coerce = NULL WHERE id = %s """, [type_feature.id], ) # Feature (regular): set _aux with af keys for migration cursor.execute( """ UPDATE lamindb_feature SET _aux = '{"af": {"0": "my_default", "1": false, "2": true}}'::jsonb, default_value = NULL, nullable = NULL, coerce = NULL WHERE id = %s """, [regular_feature.id], ) # Schema (type): set _aux with af keys that should be cleaned cursor.execute( """ UPDATE lamindb_schema SET _aux = '{"af": {"0": true, "2": false}}'::jsonb, coerce = NULL, flexible = NULL WHERE id = %s """, [type_schema.id], ) # Schema (regular): set _aux with af keys including optionals (key "1") cursor.execute( """ UPDATE lamindb_schema SET _aux = '{"af": {"0": true, "1": ["uid1", "uid2"], "2": 
true}}'::jsonb, coerce = NULL, flexible = NULL WHERE id = %s """, [regular_schema.id], ) # === Run the migration function === with connection.schema_editor() as schema_editor: migrate_auxiliary_fields_postgres(schema_editor) # === Refresh all objects from database === run.refresh_from_db() type_feature.refresh_from_db() regular_feature.refresh_from_db() type_schema.refresh_from_db() regular_schema.refresh_from_db() # === Verify Artifact migration === with connection.cursor() as cursor: cursor.execute( "SELECT _save_completed, _aux FROM lamindb_artifact WHERE id = %s", [artifact.id], ) row = cursor.fetchone() assert row[0] is True # _save_completed from _aux['af']['0'] # _aux should have 'af' removed (was only key) assert row[1] is None or "af" not in ( row[1] if isinstance(row[1], dict) else {} ) # === Verify Run migration === assert run.cli_args == "--verbose --debug" # from _aux['af']['0'] # _aux should have 'af' removed assert run._aux is None or "af" not in run._aux # === Verify Feature (type) migration === # Type features should have all values set to NULL assert type_feature.default_value is None assert type_feature.nullable is None assert type_feature.coerce is None # _aux should have 'af' removed assert type_feature._aux is None or "af" not in type_feature._aux # === Verify Feature (regular) migration === assert regular_feature.default_value == "my_default" # from _aux['af']['0'] assert regular_feature.nullable is False # from _aux['af']['1'] assert regular_feature.coerce is True # from _aux['af']['2'] # _aux should have 'af' removed assert regular_feature._aux is None or "af" not in regular_feature._aux # === Verify Schema (type) migration === assert type_schema.coerce is None assert type_schema.flexible is None assert type_schema.n_members is None # _aux should either be None or not have '0' and '2' keys in 'af' if type_schema._aux is not None and "af" in type_schema._aux: assert "0" not in type_schema._aux["af"] assert "2" not in type_schema._aux["af"] # === Verify Schema (regular) migration === assert regular_schema.coerce is True # from _aux['af']['0'] assert regular_schema.flexible is True # from _aux['af']['2'] # _aux should preserve key '1' (optionals) assert regular_schema._aux is not None assert "af" in regular_schema._aux assert "1" in regular_schema._aux["af"] assert regular_schema._aux["af"]["1"] == ["uid1", "uid2"] # Keys '0' and '2' should be removed assert "0" not in regular_schema._aux["af"] assert "2" not in regular_schema._aux["af"] # === Clean up: remove temporary column and delete records === with connection.cursor() as cursor: cursor.execute( """ DO $$ BEGIN IF EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_name = 'lamindb_artifact' AND column_name = '_save_completed' ) THEN ALTER TABLE lamindb_artifact DROP COLUMN _save_completed; END IF; END $$; """ ) regular_schema.delete(permanent=True) type_schema.delete(permanent=True) feature_for_schema1.delete(permanent=True) feature_for_schema2.delete(permanent=True) regular_feature.delete(permanent=True) type_feature.delete(permanent=True) artifact.delete(permanent=True) run.delete(permanent=True) transform.delete(permanent=True) ================================================ FILE: tests/core/test_db.py ================================================ import lamindb as ln def test_create_to_load(): transform = ln.Transform(version="0", key="test", kind="pipeline") transform.save() run = ln.Run(transform=transform) run.save() ln.Storage.get(root=str(ln.setup.settings.storage.root)) 
================================================ FILE: tests/core/test_delete.py ================================================ import bionty as bt import lamindb as ln import pytest @pytest.mark.parametrize("permanent", [True, False]) def test_delete_qs(permanent): """Test deletion behavior for small (1) and large (>=2) querysets. Small querysets delete individually, large ones trigger bulk delete.""" ln.settings.creation.search_names = False labels = [ln.Record(name=f"label_{i}") for i in range(3)] ln.settings.creation.search_names = True ln.save(labels) ln.Record.filter(name__startswith="label_").delete(permanent=permanent) assert ln.Record.filter(name__startswith="label_", branch_id=-1).count() == ( 0 if permanent else 3 ) assert ln.ULabel.filter(name__startswith="label_").count() == 0 def test_recreate_soft_deleted_record(): # testing soft delete and recreate with postgres (sqlite is tested in curators/test_records.py) # soft delete a record, then recreate it with some changes record = bt.Ethnicity.from_source(ontology_id="HANCESTRO:0006").save() assert record.branch_id == 1 record.delete() assert record.branch_id == -1 # now recreate the same record from ontology_id with a different description # there's a unique constraint on ontology_id, so this should recover the trashed record record = bt.Ethnicity.from_source(ontology_id="HANCESTRO:0006") record.description = "new description" record.save() # now this record is recovered from the trash with the new description assert record.branch_id == 1 assert record.description == "new description" bt.Ethnicity.objects.filter().delete() ================================================ FILE: tests/core/test_feature.py ================================================ import bionty as bt import lamindb as ln import pandas as pd import pytest from lamindb.errors import ValidationError from lamindb.models.feature import serialize_pandas_dtype from pandas.api.types import is_string_dtype @pytest.fixture(scope="module") def dict_data(): return { "dict_feat1": 42, "dict_feat2": 3.14, "dict_feat3": "somestring", # string (ambiguous cat ? str) "dict_feat4": True, "dict_feat5": [1, 2, 3], "dict_feat6": ["a", "b", "c"], # list[str] (ambiguous list[cat ? 
str]) "dict_feat7": {"key": "value"}, } def test_feature_init(): # positional args not supported with pytest.raises(ValueError): ln.Feature("x") # dtype required unless is_type=True with pytest.raises(ValidationError): ln.Feature(name="feat") # is OK if also is_type is passed ln.Feature(name="Feat", is_type=True) # invalid dtype string with pytest.raises(ValueError): ln.Feature(name="feat", dtype="x") # categorical dtype must specify valid types with pytest.raises(ValidationError): ln.Feature(name="feat", dtype="cat[1]") # ensure feat1 does not exist if feat1 := ln.Feature.filter(name="feat1").one_or_none() is not None: feat1.delete(permanent=True) feat1 = ln.Feature(name="feat", dtype="str").save() # duplicate name with different dtype should fail with pytest.raises(ValidationError) as error: ln.Feature(name="feat", dtype=ln.ULabel) assert ( error.exconly() == "lamindb.errors.ValidationError: Feature feat already exists with dtype str, you passed cat[ULabel]" ) feat1.delete(permanent=True) # string and list syntax for categorical dtypes should be equivalent and work feat2 = ln.Feature(name="feat2", dtype="str", description="feat2").save() feat2_again = ln.Feature(name="feat2", dtype="str", description="feat2").save() assert feat2 == feat2_again feat2.delete(permanent=True) # categorical dtype with union of registries using string syntax must be valid feature = ln.Feature(name="feat1", dtype="cat[Record|bionty.Gene]") assert feature._dtype_str == "cat[Record|bionty.Gene]" # categorical dtype with union of registries using objects must be valid feature = ln.Feature(name="feat1", dtype=[ln.Record, bt.Gene]) assert feature._dtype_str == "cat[Record|bionty.Gene]" # dtype with field name before bracket filters must be valid feature = ln.Feature( name="gene_feature", dtype="cat[bionty.Gene.ensembl_gene_id[organism='human']]" ) print(feature._dtype_str) assert "bionty.Gene" in feature._dtype_str assert "ensembl_gene_id" in feature._dtype_str assert "organism='human'" in feature._dtype_str # @pytest.mark.skipif( # os.getenv("LAMINDB_TEST_DB_VENDOR") == "sqlite", reason="Postgres-only" # ) # def test_cannot_mutate_dtype(): # feature = ln.Feature(name="feature", dtype=str).save() # feature._dtype_str = int # with pytest.raises(django.db.utils.IntegrityError) as error: # feature.save() # assert "dtype field is immutable and cannot be changed" in error.exconly() # feature.delete(permanent=True) # def test_cat_filters_dtype(): # feature = ln.Feature( # name="disease", # dtype=bt.Disease, # cat_filters={ # "source__uid": "4a3ejKuf" # }, # uid corresponds to disease_ontology_old.uid # ).save() # assert feature._dtype_str == "cat[bionty.Disease[source__uid='4a3ejKuf']]" # feature.delete(permanent=True) def test_cat_filters_empty_filter(): # empty filter values should be rejected with pytest.raises(ValidationError) as error: ln.Feature(name="feat_empty", dtype=bt.Disease, cat_filters={"source__uid": ""}) assert ( "lamindb.errors.ValidationError: Empty value in filter source__uid" in error.exconly() ) def test_cat_filters_invalid_field_name(): # invalid filter field names should be rejected source = bt.Source( name="", description="", organism="", entity="", version="" ).save() with pytest.raises(ValidationError) as error: ln.Feature( name="feat_invalid_attr", dtype=bt.Disease, cat_filters={"source__invalid_field": source}, ) assert ( "lamindb.errors.ValidationError: SQLRecord Source has no attribute 'invalid_field' in filter source__invalid_field" in error.exconly() ) source.delete(permanent=True) def 
test_cat_filters_artifact_schema_filter(): schema_feature = ln.Feature(name="schema_filter_column", dtype=str).save() schema = ln.Schema(name="schema_filter_schema", features=[schema_feature]).save() try: feature = ln.Feature( name="artifact_input", dtype=ln.Artifact, cat_filters={"schema": schema}, ) assert feature._dtype_str == f"cat[Artifact[schema__uid='{schema.uid}']]" finally: schema.delete(permanent=True) schema_feature.delete(permanent=True) def test_feature_from_df(): df = pd.DataFrame( { "feat1": [1, 2, 3], "feat2": [3.1, 4.2, 5.3], "feat3": pd.Categorical(["cond1", "cond2", "cond2"]), "feat4": ["id1", "id2", "id3"], "rando_feature": ["rando1", "rando2", "rando3"], } ) if feat1 := ln.Feature.filter(name="feat1").one_or_none() is not None: feat1.delete(permanent=True) features = ln.Feature.from_dataframe(df.iloc[:, :4]).save() artifact = ln.Artifact.from_dataframe(df, description="test").save() # test for deprecated add_feature_set schema = ln.Schema(features).save() artifact.features._add_schema(schema, slot="columns") features = artifact.features.slots["columns"].features.all() assert len(features) == len(df.columns[:4]) [col for col in df.columns if is_string_dtype(df[col])] categoricals = { col: df[col] for col in df.columns if isinstance(df[col], pd.CategoricalDtype) } for feature in features: if feature.name in categoricals: assert feature._dtype_str == "cat" else: orig_type = df[feature.name].dtype assert feature._dtype_str == serialize_pandas_dtype(orig_type) for feature in features: feature.save() labels = [ln.Record(name=name) for name in df["feat3"].unique()] ln.save(labels) feature = ln.Feature.get(name="feat3") with pytest.raises(ValidationError) as err: artifact.labels.add(labels, feature=feature) assert ( err.exconly() == "lamindb.errors.ValidationError: Cannot manually annotate a feature measured *within* the dataset. Please use a Curator." 
) extfeature = ln.Feature(name="extfeat", dtype="str").save() with pytest.raises(ValidationError) as err: artifact.labels.add(labels, feature=extfeature) assert ( err.exconly() == f"lamindb.errors.ValidationError: Feature {extfeature.name} needs dtype='cat' for label annotation, currently has dtype='str'" ) # clean up artifact.delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Record.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_feature_from_dict(dict_data): # defaults to str for ambiguous types features = ln.Feature.from_dict(dict_data) assert len(features) == len(dict_data) assert features[0]._dtype_str == "int" assert features[1]._dtype_str == "float" assert features[2]._dtype_str == "str" assert features[3]._dtype_str == "bool" assert features[4]._dtype_str == "list[int]" assert features[5]._dtype_str == "list[str]" assert features[6]._dtype_str == "dict" # Wrong field with pytest.raises(ValueError) as e: ln.Feature.from_dict(dict_data, field=ln.Record.name) assert "field must be a Feature FieldAttr" in str(e.value) # Explicit field features_with_field = ln.Feature.from_dict(dict_data, field=ln.Feature.name) assert len(features_with_field) == len(dict_data) def test_feature_from_dict_type(dict_data): feature_type = ln.Feature(name="Testdata_feature_type", is_type=True).save() features = ln.Feature.from_dict(dict_data, type=feature_type).save() for feature in features: assert feature.type.name == "Testdata_feature_type" ln.Feature.filter(type__isnull=False).delete(permanent=True) feature_type.delete(permanent=True) def test_feature_query_by_dtype(): """Test querying Feature by dtype (deprecated) and _dtype_str.""" str_feat = ln.Feature(name="test_str_feat", dtype=str).save() int_feat = ln.Feature(name="test_int_feat", dtype=int).save() try: # Test querying by _dtype_str (current way) str_features = ln.Feature.filter(_dtype_str="str", name="test_str_feat") assert str_features.count() == 1 assert str_features.first() == str_feat str_features = ln.Feature.filter(dtype_as_str="str", name="test_str_feat") assert str_features.count() == 1 assert str_features.first() == str_feat # Test querying by dtype (deprecated) - should work but issue warning with pytest.warns( DeprecationWarning, match="Querying Feature by `dtype` is deprecated.*Notice the new dtype encoding format", ): str_features_deprecated = ln.Feature.filter( dtype="str", name="test_str_feat" ) assert str_features_deprecated.count() == 1 assert str_features_deprecated.first() == str_feat finally: # Clean up str_feat.delete(permanent=True) int_feat.delete(permanent=True) ================================================ FILE: tests/core/test_feature_dtype.py ================================================ import datetime import bionty as bt import lamindb as ln import pandas as pd import pytest from lamindb import Record from lamindb.errors import ValidationError from lamindb.models.feature import ( dtype_as_object, parse_dtype, parse_filter_string, resolve_relation_filters, serialize_dtype, ) @pytest.fixture def organism(): organism = bt.Organism(name="test_organism") organism.uid = "testuid2" organism.save() return organism # ----------------------------------------------------------------------------- # serializing dtypes # ----------------------------------------------------------------------------- def test_serialize_basic_dtypes(): assert serialize_dtype(int) == "int" assert serialize_dtype(float) == "float" assert serialize_dtype(str) == "str" assert serialize_dtype(bool) 
== "bool" assert serialize_dtype(dict) == "dict" # assert serialize_dtype(bytes) == "bytes" # not yet supported assert serialize_dtype(datetime.datetime) == "datetime" assert serialize_dtype(datetime.date) == "date" def test_serialize_basic_list_dtypes(): assert serialize_dtype(list[int]) == "list[int]" assert serialize_dtype(list[float]) == "list[float]" assert serialize_dtype(list[str]) == "list[str]" assert serialize_dtype(list[bool]) == "list[bool]" assert serialize_dtype(list[dict]) == "list[dict]" assert serialize_dtype(list[datetime.datetime]) == "list[datetime]" assert serialize_dtype(list[datetime.date]) == "list[date]" def test_seralize_pandas_numpy_dtypes(): series = pd.Series([1, 4, 0, 10, 9], dtype="uint") assert series.dtype.name == "uint64" assert serialize_dtype(series.dtype) == "int" def test_serialize_user(ccaplog): # correct way through Python object and serialize_dtype() feature = ln.Feature(name="user_feat", dtype=ln.User) assert feature._dtype_str == "cat[User]" # legacy way through parse_dtype() feature = ln.Feature(name="user_feat", dtype="cat[User]") assert ( "rather than passing a string 'cat[User]' to dtype, consider passing a Python object" in ccaplog.text ) assert feature._dtype_str == "cat[User]" def test_serialize_record_objects(): insitute_type = ln.Record(name="InstituteA", is_type=True) with pytest.raises(ln.errors.InvalidArgument) as error: serialize_dtype(insitute_type) assert ( f"Cannot serialize unsaved objects. Save {insitute_type} via `.save()`." in error.exconly() ) insitute_type.save() lab_type = ln.Record(name="LabB", type=insitute_type, is_type=True).save() sample_type = ln.Record(name="Sample", type=lab_type, is_type=True).save() # New UID-based format: cat[Record[uid]] instead of cat[Record[Parent[Child]]] serialized_str = f"cat[Record[{sample_type.uid}]]" feature = ln.Feature(name="sample_feature", dtype=sample_type).save() assert feature._dtype_str == serialized_str assert feature.dtype == "cat[Record[InstituteA[LabB[Sample]]]]" feature.delete(permanent=True) assert serialize_dtype(sample_type) == serialized_str with pytest.raises(ln.errors.IntegrityError) as error: parse_dtype("cat[Record[Sample]]", check_exists=True, old_format=True) assert ( "No Record type found matching subtypes ['Sample'] for field `.name`" in error.exconly() ) sample = ln.Record(name="sample").save() with pytest.raises(ln.errors.InvalidArgument) as error: parse_dtype(f"cat[Record[{sample.uid}]]", check_exists=True) assert ( f"The resolved Record 'sample' (uid='{sample.uid}') is not a type: is_type is False." in error.exconly() ) with pytest.raises(ln.errors.InvalidArgument) as error: serialize_dtype(sample) assert ( "Cannot serialize non-type Record 'sample'. Only types (is_type=True) are allowed in dtypes." 
in error.exconly() ) sample_type.delete(permanent=True) lab_type.delete(permanent=True) insitute_type.delete(permanent=True) sample.delete(permanent=True) def test_serialize_union_of_registries(): serialized_str = "cat[Record|bionty.Gene]" assert serialize_dtype([ln.Record, bt.Gene]) == serialized_str serialized_str = "cat[bionty.CellType|bionty.CellLine]" assert serialize_dtype([bt.CellType, bt.CellLine]) == serialized_str def test_serialize_with_field_information(): serialized_str = "cat[bionty.Gene.ensembl_gene_id]" assert serialize_dtype(bt.Gene.ensembl_gene_id) == serialized_str serialized_str = "cat[bionty.CellType.uid|bionty.CellLine.uid]" assert serialize_dtype([bt.CellType.uid, bt.CellLine.uid]) == serialized_str # ----------------------------------------------------------------------------- # parsing serialized dtypes # ----------------------------------------------------------------------------- def test_simple_record_with_subtype_and_field(): # Create a Record type to get its UID customer_type = ln.Record(name="Customer", is_type=True).save() dtype_str = f"cat[Record[{customer_type.uid}].name]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, "record_uid": customer_type.uid, } customer_type.delete(permanent=True) def test_multiple_records_with_subtypes_and_fields(): # Create Record types to get their UIDs customer_type = ln.Record(name="Customer", is_type=True).save() supplier_type = ln.Record(name="Supplier", is_type=True).save() dtype_str = ( f"cat[Record[{customer_type.uid}].name|Record[{supplier_type.uid}].name]" ) result = parse_dtype(dtype_str) assert len(result) == 2 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, "record_uid": customer_type.uid, } assert result[1] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, "record_uid": supplier_type.uid, } customer_type.delete(permanent=True) supplier_type.delete(permanent=True) def test_bionty_celltype_with_field(): dtype_str = "cat[bionty.CellType.ontology_id]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "bionty.CellType", "filter_str": "", "field_str": "ontology_id", "registry": bt.CellType, "field": bt.CellType.ontology_id, } def test_bionty_perturbations_with_field(): dtype_str = "cat[bionty.CellType.uid|bionty.CellLine.uid]" result = parse_dtype(dtype_str) assert len(result) == 2 assert result[0] == { "registry_str": "bionty.CellType", "filter_str": "", "field_str": "uid", "registry": bt.CellType, "field": bt.CellType.uid, } assert result[1] == { "registry_str": "bionty.CellLine", "filter_str": "", "field_str": "uid", "registry": bt.CellLine, "field": bt.CellLine.uid, } def test_invalid_registry(): dtype_str = "cat[InvalidRegistry.field]" with pytest.raises(ValidationError) as exc_info: parse_dtype(dtype_str) assert "invalid dtype" in str(exc_info.value) def test_empty_category(): dtype_str = "cat[]" result = parse_dtype(dtype_str) assert result == [] def test_url_dtype_is_supported(): assert parse_dtype("url") == [] feature = ln.Feature(name="website", dtype="url") assert feature._dtype_str == "url" def test_malformed_categorical(): dtype_str = "cat ? 
str" with pytest.raises(ValueError) as err: parse_dtype(dtype_str) assert err.exconly().startswith( f"ValueError: dtype is '{dtype_str}' but has to be one of" ) dtype_str = "cat[Record[Customer.name" with pytest.raises(ValueError) as err: parse_dtype(dtype_str) assert err.exconly().startswith( f"ValueError: dtype is '{dtype_str}' but has to be one of" ) def test_simple_registry_without_field(): dtype_str = "cat[Record]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, } def test_registry_with_subtype_no_field(): # Create a Record type to get its UID customer_type = ln.Record(name="Customer", is_type=True).save() dtype_str = f"cat[Record[{customer_type.uid}]]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, "record_uid": customer_type.uid, } customer_type.delete(permanent=True) def test_list_of_dtypes(): # Create a Record type to get its UID customer_type = ln.Record(name="Customer", is_type=True).save() dtype_str = f"list[cat[Record[{customer_type.uid}]]]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, "record_uid": customer_type.uid, "list": True, } assert serialize_dtype(list[bt.CellLine]) == "list[cat[bionty.CellLine]]" customer_type.delete(permanent=True) def test_registry_with_filter(): dtype_str = "cat[bionty.Gene.ensembl_gene_id[source__id='abcd']]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "bionty.Gene", "filter_str": "source__id='abcd'", "field_str": "ensembl_gene_id", "registry": bt.Gene, "field": bt.Gene.ensembl_gene_id, } def test_nested_cat_dtypes(): # Create Record types - the deepest type is UScustomer customer_type = ln.Record(name="Customer", is_type=True).save() uscustomer_type = ln.Record( name="UScustomer", type=customer_type, is_type=True ).save() dtype_str = f"cat[Record[{uscustomer_type.uid}].name]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "name", "registry": Record, "field": Record.name, "record_uid": uscustomer_type.uid, } uscustomer_type.delete(permanent=True) customer_type.delete(permanent=True) def test_nested_cat_with_filter(): # Create Record types - the deepest type is UScustomer # Note: filters in bracket content are not currently supported in UID format # This test may need adjustment based on how filters are handled customer_type = ln.Record(name="Customer", is_type=True).save() uscustomer_type = ln.Record( name="UScustomer", type=customer_type, is_type=True ).save() dtype_str = f"cat[Record[{uscustomer_type.uid}].description]" result = parse_dtype(dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "Record", "filter_str": "", "field_str": "description", "registry": Record, "field": Record.description, "record_uid": uscustomer_type.uid, } uscustomer_type.delete(permanent=True) customer_type.delete(permanent=True) # ----------------------------------------------------------------------------- # parsing django filter expressions # ----------------------------------------------------------------------------- def test_feature_dtype(): feature = ln.Feature( name="disease", dtype=bt.Disease, 
cat_filters={ "source__uid": "4a3ejKuf" }, # uid corresponds to disease_ontology_old.uid ).save() result = parse_dtype(feature._dtype_str) assert len(result) == 1 assert result[0] == { "registry_str": "bionty.Disease", "filter_str": "source__uid='4a3ejKuf'", "field_str": "name", "registry": bt.Disease, "field": bt.Disease.name, } feature.delete(permanent=True) def test_cat_filters_incompatible_with_union_dtypes(): with pytest.raises(ValidationError) as exc_info: ln.Feature( name="test_feature", dtype="cat[Record|bionty.CellType]", cat_filters={"source": "test"}, ) assert ( "cat_filters are incompatible with union dtypes: 'cat[Record|bionty.CellType]'" in str(exc_info.value) ) def test_cat_filters_incompatible_with_nested_dtypes(): record = ln.Record(name="Customer", is_type=True).save() with pytest.raises(ValidationError) as exc_info: ln.Feature( name="test_feature", dtype=record, cat_filters={"source": "test"}, ) assert ( f"cat_filters are incompatible with nested dtypes: 'cat[Record[{record.uid}]]'" in str(exc_info.value) ) record.delete(permanent=True) def test_parse_filter_string_basic(): result = parse_filter_string("parent__id=123, category__name=electronics") expected = { "parent__id": ("parent", "id", "123"), "category__name": ("category", "name", "electronics"), } assert result == expected def test_parse_filter_string_direct_fields(): result = parse_filter_string("name=test, status=active") expected = {"name": ("name", None, "test"), "status": ("status", None, "active")} assert result == expected def test_parse_filter_string_empty(): with pytest.raises(ValueError) as e: parse_filter_string("") assert "missing '=' sign" in str(e) def test_parse_filter_string_malformed(): with pytest.raises(ValueError) as e: parse_filter_string("malformed_filter") assert "missing '=' sign" in str(e) def test_parse_filter_string_missing_key(): with pytest.raises(ValueError) as e: parse_filter_string("=someval") assert "empty key" in str(e) def test_parse_filter_string_missing_value(): with pytest.raises(ValueError) as e: parse_filter_string("somekey=") assert "empty val" in str(e) def test_resolve_direct_fields(): parsed = {"name": ("name", None, "test"), "status": ("status", None, "active")} result = resolve_relation_filters(parsed, bt.Gene) assert result == {"name": "test", "status": "active"} def test_resolve_relation_filter_with_uid(): source = bt.Source( name="test_name", description="test_description", organism="human", entity="bionty.Gene", version="2026-01-01", ) source.uid = "testuid1" source.save() parsed = {"source__uid": ("source", "uid", "testuid1")} result = resolve_relation_filters(parsed, bt.Gene) print(result) assert result == {"source": source} source.delete(permanent=True) def test_resolve_relation_filter_with_name(organism): parsed = {"organism__name": ("organism", "name", "test_organism")} result = resolve_relation_filters(parsed, bt.Gene) assert result == {"organism": organism} organism.delete(permanent=True) def test_resolve_multiple_relation_filters(organism): source = bt.Source( name="test_name", description="test_description", organism="human", entity="bionty.Gene", version="2026-01-01", ) source.uid = "testuid1" source.save() parsed = { "organism__name": ("organism", "name", "test_organism"), "source__uid": ("source", "uid", "testuid1"), } result = resolve_relation_filters(parsed, bt.Gene) assert result == {"organism": organism, "source": source} source.delete(permanent=True) organism.delete(permanent=True) def test_resolve_nested_filter(organism): parsed = 
{"organism__name__contains": ("organism", "name__contains", "test_orga")} result = resolve_relation_filters(parsed, bt.Gene) assert result == {"organism": organism} organism.delete(permanent=True) def test_resolve_relation_filter_failed_resolution(): parsed = {"organism__name": ("organism", "name", "nonexistent")} with pytest.raises(bt.Organism.DoesNotExist): resolve_relation_filters(parsed, bt.Gene) def test_resolve_relation_filter_duplicate(): parsed = { "source__uid": ("source", "uid", "testuid1"), "source__name": ("source", "name", "test_name"), } with pytest.raises(bt.Source.DoesNotExist): resolve_relation_filters(parsed, bt.Gene) # ----------------------------------------------------------------------------- # backward compatibility for old format strings # ----------------------------------------------------------------------------- def test_convert_old_format_ulabel_string(): """Test converting old format ULabel string to object.""" # Create a ULabel type perturbation = ln.ULabel(name="Perturbation", is_type=True).save() # Convert old format string dtype_obj = dtype_as_object("cat[ULabel[Perturbation]]", old_format=True) # Should return the ULabel object assert dtype_obj == perturbation assert hasattr(dtype_obj, "uid") # Clean up perturbation.delete(permanent=True) def test_convert_old_format_record_string(): """Test converting old format Record string to object.""" # Create a Record type sample_type = ln.Record(name="Sample", is_type=True).save() # Convert old format string dtype_obj = dtype_as_object("cat[Record[Sample]]", old_format=True) # Should return the Record object assert dtype_obj == sample_type assert hasattr(dtype_obj, "uid") # Clean up sample_type.delete(permanent=True) def test_convert_old_format_nested_record_string(): """Test converting old format nested Record string to object.""" # Create nested Record types lab_type = ln.Record(name="LabA", is_type=True).save() experiment_type = ln.Record(name="Experiment", type=lab_type, is_type=True).save() # Convert old format string dtype_obj = dtype_as_object("cat[Record[LabA[Experiment]]]", old_format=True) # Should return the nested Record object assert dtype_obj == experiment_type assert hasattr(dtype_obj, "uid") # Clean up experiment_type.delete(permanent=True) lab_type.delete(permanent=True) def test_convert_old_format_list_string(): """Test converting old format list string to object.""" # Create a ULabel type perturbation = ln.ULabel(name="Perturbation", is_type=True).save() # Convert old format string with list wrapper dtype_obj = dtype_as_object("list[cat[ULabel[Perturbation]]]", old_format=True) # Should return list[ULabel] type assert hasattr(dtype_obj, "__origin__") assert dtype_obj.__origin__ is list # Get the inner type from typing import get_args inner_type = get_args(dtype_obj)[0] assert inner_type == perturbation # Clean up perturbation.delete(permanent=True) def test_feature_constructor_with_old_format_string(ccaplog): """Test Feature constructor with old format string raises deprecation warning.""" # Create a ULabel type perturbation = ln.ULabel(name="Perturbation", is_type=True).save() # Create feature with old format string feature = ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]") assert ( "rather than passing a string 'cat[ULabel[Perturbation]]' to dtype, consider passing a Python object" in ccaplog.text ) # Should have converted to UID format assert feature._dtype_str is not None assert "ULabel[" in feature._dtype_str # Should contain UID, not name assert "Perturbation" not in 
feature._dtype_str assert perturbation.uid in feature._dtype_str # Clean up perturbation.delete(permanent=True) def test_feature_constructor_with_old_format_nested_string(ccaplog): """Test Feature constructor with old format nested string.""" # Create nested Record types lab_type = ln.Record(name="LabA", is_type=True).save() experiment_type = ln.Record(name="Experiment", type=lab_type, is_type=True).save() # Create feature with old format nested string feature = ln.Feature(name="experiment", dtype="cat[Record[LabA[Experiment]]]") assert ( "rather than passing a string 'cat[Record[LabA[Experiment]]]' to dtype, consider passing a Python object" in ccaplog.text ) # Should have converted to UID format assert feature._dtype_str is not None assert "Record[" in feature._dtype_str # Should contain UID, not names assert "LabA" not in feature._dtype_str assert "Experiment" not in feature._dtype_str assert experiment_type.uid in feature._dtype_str # Clean up experiment_type.delete(permanent=True) lab_type.delete(permanent=True) def test_bare_cat_dtype_backward_compatibility(): """Test that bare 'cat' dtype is accepted for backward compatibility.""" # Test parse_dtype accepts "cat" and returns empty list result = parse_dtype("cat") assert result == [] # Test Feature constructor with bare "cat" dtype issues deprecation warning with pytest.warns(DeprecationWarning, match="dtype `cat` is deprecated"): feature = ln.Feature(name="test_bare_cat", dtype="cat") assert feature._dtype_str == "cat" def test_migrate_dtype_to_uid_format(): """Test migrate_dtype_to_uid_format() function for migration.""" from django.db import connection from lamindb.models.feature import migrate_dtype_to_uid_format # Create Record types for testing lab_type = ln.Record(name="LabA", is_type=True).save() experiment_type = ln.Record(name="Experiment", type=lab_type, is_type=True).save() perturbation = ln.ULabel(name="Perturbation", is_type=True).save() # Create features with old format strings in _dtype_str feature1 = ln.Feature(name="test_record_old_format", dtype="str").save() feature2 = ln.Feature(name="test_ulabel_old_format", dtype="str").save() feature3 = ln.Feature(name="test_list_record_old_format", dtype="str").save() feature4 = ln.Feature(name="test_list_ulabel_old_format", dtype="str").save() # Manually set old format strings using raw SQL old_format_record = "cat[Record[LabA[Experiment]]]" old_format_ulabel = "cat[ULabel[Perturbation]]" old_format_list_record = "list[cat[Record[LabA[Experiment]]]]" old_format_list_ulabel = "list[cat[ULabel[Perturbation]]]" with connection.cursor() as cursor: cursor.execute( "UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s", [old_format_record, feature1.id], ) cursor.execute( "UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s", [old_format_ulabel, feature2.id], ) cursor.execute( "UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s", [old_format_list_record, feature3.id], ) cursor.execute( "UPDATE lamindb_feature SET _dtype_str = %s WHERE id = %s", [old_format_list_ulabel, feature4.id], ) # Refresh features from database feature1.refresh_from_db() feature2.refresh_from_db() feature3.refresh_from_db() feature4.refresh_from_db() # Verify old format is present assert feature1._dtype_str == old_format_record assert feature2._dtype_str == old_format_ulabel assert feature3._dtype_str == old_format_list_record assert feature4._dtype_str == old_format_list_ulabel # Run migration function migrate_dtype_to_uid_format(connection, input_field="_dtype_str") # Refresh features from 
database feature1.refresh_from_db() feature2.refresh_from_db() feature3.refresh_from_db() feature4.refresh_from_db() # Verify conversion to UID format assert feature1._dtype_str == f"cat[Record[{experiment_type.uid}]]" assert feature2._dtype_str == f"cat[ULabel[{perturbation.uid}]]" assert feature3._dtype_str == f"list[cat[Record[{experiment_type.uid}]]]" assert feature4._dtype_str == f"list[cat[ULabel[{perturbation.uid}]]]" # Verify old names are not in the converted strings assert "LabA" not in feature1._dtype_str assert "Experiment" not in feature1._dtype_str assert "Perturbation" not in feature2._dtype_str assert "LabA" not in feature3._dtype_str assert "Experiment" not in feature3._dtype_str assert "Perturbation" not in feature4._dtype_str # Verify UIDs are present assert experiment_type.uid in feature1._dtype_str assert perturbation.uid in feature2._dtype_str assert experiment_type.uid in feature3._dtype_str assert perturbation.uid in feature4._dtype_str # Clean up feature1.delete(permanent=True) feature2.delete(permanent=True) feature3.delete(permanent=True) feature4.delete(permanent=True) experiment_type.delete(permanent=True) lab_type.delete(permanent=True) perturbation.delete(permanent=True) ================================================ FILE: tests/core/test_from_values.py ================================================ import bionty as bt import lamindb as ln import pandas as pd import pytest @pytest.fixture(scope="module") def df(): return pd.DataFrame( ( ["T cell", "CL:0000084"], ["hepatocyte", "CL:0000182"], ["my new cell type", ""], ), columns=["cell_type", "cell_type_id"], ) def test_from_values_name(df): bt.CellType.filter().delete(permanent=True) assert df["cell_type"].tolist() == ["T cell", "hepatocyte", "my new cell type"] # create records from bionty result = bt.CellType.from_values(df.cell_type, "name") ids = [i.ontology_id for i in result] assert len(result) == 2 assert set(ids) == {"CL:0000084", "CL:0000182"} assert result[0].source.entity == "bionty.CellType" # wrong field type with pytest.raises(TypeError): result = bt.CellType.from_values(df.cell_type, field=bt.CellType) def test_from_values_ontology_id(df): assert df["cell_type_id"].tolist() == ["CL:0000084", "CL:0000182", ""] result = bt.CellType.from_values(df.cell_type_id, "ontology_id") names = {i.name for i in result} assert len(result) == 2 assert names == {"T cell", "hepatocyte"} assert result[0].source.entity == "bionty.CellType" def test_from_values_multiple_match(): records = bt.Gene.from_values(["ABC1", "PDCD1"], bt.Gene.symbol, organism="human") assert len(records) == 3 def test_get_or_create_records(): names = ["record" + str(i) for i in range(25)] labels = [ln.Record(name=name) for name in names] ln.save(labels) # more than 20 existing values labels = ln.Record.from_values(names, field="name") assert len(labels) == 25 def test_from_values_synonyms_aware(): bt.CellType.from_source(name="T cell").save() # existing validated values records = bt.CellType.from_values(["T cell"], "name") assert len(records) == 1 assert records[0].name == "T cell" assert isinstance(records[0].source, bt.Source) # existing validated values and synonyms records = bt.CellType.from_values(["T cell", "T-cell"], "name") assert len(records) == 1 assert records[0].name == "T cell" assert isinstance(records[0].source, bt.Source) # bionty values and synonyms records = bt.CellType.from_values(["B-cell", "B cell"], "name") assert len(records) == 1 assert records[0].name == "B cell" assert isinstance(records[0].source, bt.Source) 
# all possibilities of validated values records = bt.CellType.from_values( ["T cell", "T-cell", "t cell", "B cell", "B-cell"], "name" ) assert len(records) == 2 names = [r.name for r in records] assert set(names) == {"T cell", "B cell"} assert isinstance(records[0].source, bt.Source) assert isinstance(records[1].source, bt.Source) # non-validated values records = bt.CellType.from_values(["T cell", "mycell"], "name") assert len(records) == 1 assert records[0].name == "T cell" assert isinstance(records[0].source, bt.Source) assert records[0].ontology_id == "CL:0000084" bt.CellType.filter().delete(permanent=True) def test_standardize(): # only name field can be standardized results = bt.Gene.from_values( ["HES4", "TNFRSF4"], field=bt.Gene.ensembl_gene_id, organism="human" ) assert len(results) == 0 results = bt.Gene.from_values( ["HES4", "TNFRSF4"], field=bt.Gene.symbol, organism="human" ) assert len(results) == 2 def test_from_values_no_source(): # remove source of ExperimentalFactor source = bt.Source.filter(entity="bionty.ExperimentalFactor").first() source.delete(permanent=True) assert not bt.ExperimentalFactor.from_values(["scrnaseq"]) source.save() ================================================ FILE: tests/core/test_has_parents.py ================================================ import bionty as bt import lamindb as ln def test_view_parents(): label1 = ln.Record(name="label1") label2 = ln.Record(name="label2") label1.save() label2.save() label1.parents.add(label2) label1.view_parents(ln.Record.name, distance=1) label1.delete(permanent=True) label2.delete(permanent=True) def test_query_parents_children(): label1 = ln.Record(name="label1").save() label2 = ln.Record(name="label2").save() label3 = ln.Record(name="label3").save() label1.children.add(label2) label2.children.add(label3) parents = label3.query_parents() assert len(parents) == 2 assert label1 in parents and label2 in parents children = label1.query_children() assert len(children) == 2 assert label2 in children and label3 in children label1.delete(permanent=True) label2.delete(permanent=True) label3.delete(permanent=True) def test_view_lineage_circular(): import pandas as pd transform = ln.Transform(key="test").save() run = ln.Run(transform=transform).save() artifact = ln.Artifact.from_dataframe( pd.DataFrame({"a": [1, 2, 3]}), description="test artifact", run=run ).save() run.input_artifacts.add(artifact) artifact.view_lineage() artifact.delete(permanent=True) transform.delete(permanent=True) def test_view_parents_connected_instance(): ct = bt.CellType.connect("laminlabs/cellxgene").first() if ct and hasattr(ct, "parents"): ct.view_parents(distance=2, with_children=True) def test_query_relatives_connected_instance(): ct = bt.CellType.connect("laminlabs/cellxgene").filter(name="T cell").first() if ct: parents = ct.query_parents() assert parents.db == "laminlabs/cellxgene" children = ct.query_children() assert children.db == "laminlabs/cellxgene" def test_view_lineage_connected_instance(): af = ln.Artifact.connect("laminlabs/cellxgene").first() if af and af.run: af.view_lineage() ================================================ FILE: tests/core/test_has_type.py ================================================ import os import lamindb as ln import pytest from django.db import IntegrityError @pytest.mark.parametrize( "model_class,extra_kwargs", [ (ln.Record, {}), (ln.Feature, {"dtype": "str"}), (ln.Schema, {"itype": ln.Feature}), (ln.Project, {}), (ln.Reference, {}), (ln.ULabel, {}), ], ) def test_invalid_type(model_class, 
extra_kwargs): # also see test_invalid_type_record_with_schema in test_record.py model_name = model_class.__name__.lower() no_type = model_class(name="no_type", **extra_kwargs).save() if model_name == "schema": extra_kwargs["is_type"] = True # to avoid triggering hash look up with pytest.raises(ValueError) as error: model_class(name="WithInvalidType", type=no_type, **extra_kwargs).save() assert error.exconly().startswith( f"ValueError: You can only assign a {model_name} with `is_type=True` as `type` to another {model_name}" ) # test at the database level if os.getenv("LAMINDB_TEST_DB_VENDOR") != "sqlite": no_type.is_type = True with pytest.raises(IntegrityError) as error: model_class(name="WithInvalidType", type=no_type, **extra_kwargs).save() assert f"{model_name}_type_is_valid_fk" in error.exconly() no_type.delete(permanent=True) @pytest.mark.skipif( os.getenv("LAMINDB_TEST_DB_VENDOR") == "sqlite", reason="Postgres-only" ) @pytest.mark.parametrize("model_class", [ln.Record, ln.ULabel]) def test_prevent_type_cycle(model_class): type_a = model_class(name="TypeA", is_type=True).save() type_b = model_class(name="TypeB", is_type=True).save() # Set A's parent to B type_a.type = type_b type_a.save() # A → B, this is fine # Try to set B's parent to A (would create cycle B → A → B) type_b.type = type_a with pytest.raises(Exception) as exc_info: type_b.save() assert "cycle" in str(exc_info.value).lower() # Try to set type to itself type_a.type = type_a with pytest.raises(Exception) as exc_info: type_a.save() assert "cycle" in str(exc_info.value).lower() type_a.delete(permanent=True) type_b.delete(permanent=True) @pytest.mark.parametrize("model_class", [ln.Record, ln.ULabel, ln.Project]) def test_query_sub_types_super_types_instances(model_class): model_name = model_class.__name__.lower() # Create type hierarchy type1 = model_class(name="Type1", is_type=True).save() type2 = model_class(name="Type2", is_type=True, type=type1).save() type3 = model_class(name="Type3", is_type=True, type=type2).save() # Create instances instance1 = model_class(name=f"{model_name}1", type=type1).save() instance2 = model_class(name=f"{model_name}2", type=type3).save() instance3 = model_class(name=f"{model_name}3", type=type3).save() # Get the query method dynamically query_method = getattr(type1, f"query_{model_name}s") # Children assert getattr(type1, model_name + "s").count() == 2 # direct instances assert query_method().count() == 5 # Super types super_types = instance3.query_types() assert len(super_types) == 3 assert super_types[0] == type3 assert super_types[1] == type2 assert super_types[2] == type1 # Move type2 to trash type2.delete() assert query_method().count() == 1 # Cleanup instance1.delete(permanent=True) instance2.delete(permanent=True) instance3.delete(permanent=True) type3.delete(permanent=True) type2.delete(permanent=True) type1.delete(permanent=True) ================================================ FILE: tests/core/test_integrity.py ================================================ import lamindb_setup as ln_setup def test_migrate_check(): assert ln_setup.migrate.check() def test_system_check(): ln_setup.django("check") ================================================ FILE: tests/core/test_is_versioned.py ================================================ import lamindb as ln import pandas as pd import pytest from lamindb.models._is_versioned import ( _adjust_is_latest_when_deleting_is_versioned, bump_version, set_version, ) @pytest.fixture(scope="module") def df1(): return pd.DataFrame({"feat1": [1, 2]}) 
@pytest.fixture(scope="module") def df2(): return pd.DataFrame({"feat1": [2, 3]}) def test_set_version(): # all remaining lines are covered in notebooks with pytest.raises(ValueError): set_version(None, "weird-version") assert set_version(None, "1.2") == "2" assert set_version(None, "0") == "1" assert set_version(None, "1") == "2" assert set_version("1.2.3", "0") == "1.2.3" assert set_version("1.2.3") == "1.2.3" def test_bump_version(): current_version_major_only = "2" current_version_major_minor = "2.1" weird_version = "weird-version" with pytest.raises(ValueError): bump_version(weird_version) assert bump_version(weird_version, behavior="ignore") == "?" assert bump_version(current_version_major_only, bump_type="major") == "3" assert bump_version(current_version_major_only, bump_type="minor") == "2.1" assert bump_version(current_version_major_minor, bump_type="major") == "3" assert bump_version(current_version_major_minor, bump_type="minor") == "2.2" def test_add_to_version_family(df1, df2): artifact1 = ln.Artifact.from_dataframe(df1, description="test1").save() artifact2 = ln.Artifact.from_dataframe(df2, description="test2").save() assert ( artifact1.uid[: artifact1._len_stem_uid] != artifact2.uid[: artifact2._len_stem_uid] ) artifact2._add_to_version_family(artifact1) assert ( artifact1.uid[: artifact1._len_stem_uid] == artifact2.uid[: artifact2._len_stem_uid] ) assert ( artifact1.path.name[: artifact1._len_stem_uid] == artifact2.path.name[: artifact2._len_stem_uid] ) artifact1.delete(permanent=True) artifact2.delete(permanent=True) def test_transform_versioning_based_on_key(): transform1 = ln.Transform( key="test-pipeline", version="1.0", source_code="1", kind="pipeline", ).save() assert transform1.is_latest assert transform1.version_tag == "1.0" assert transform1.version == "1.0" with pytest.raises(ValueError) as e: transform2 = ln.Transform( key="test-pipeline", version="1.0", source_code="2", kind="pipeline", ).save() assert ( e.exconly() == "ValueError: Please change the version tag or leave it `None`, '1.0' is already taken" ) transform2 = ln.Transform( key="test-pipeline", # do not pass the version tag, which corresponds to: version=None source_code="2", kind="pipeline", ).save() assert transform2.version_tag is None assert transform2.version == transform2.uid[-4:] # version falls back to uid suffix assert transform2.is_latest assert transform2.hash != transform1.hash assert not ln.Transform.get(key="test-pipeline", version="1.0").is_latest transform3 = ln.Transform( key="test-pipeline", version="abcd", # mimic commit hash source_code="3", kind="pipeline", ).save() assert transform3.version_tag == "abcd" assert transform3.version == "abcd" assert transform3.is_latest assert transform3.hash != transform2.hash assert not ln.Transform.get(key="test-pipeline", source_code="2").is_latest def test_transform_versioning_based_on_revises(): # build one version family transform_v1 = ln.Transform(key="Introduction").save() assert transform_v1.is_latest assert transform_v1.version_tag is None # pass the latest version transform_v2 = ln.Transform( key="Introduction v2", revises=transform_v1, version="2" ).save() assert not transform_v1.is_latest assert transform_v2.is_latest assert transform_v2.uid.endswith("0001") assert transform_v2.version_tag == "2" assert transform_v2.version == "2" # consciously *not* pass the latest version to revises but the previous # it automatically retrieves the latest version transform_v3 = ln.Transform(key="Introduction", revises=transform_v1).save() assert 
transform_v3.uid.endswith("0002") assert not ln.Transform.get(key="Introduction v2", version="2").is_latest assert transform_v3.is_latest # no source code code was yet saved, returning existing transform with same key transform_v4 = ln.Transform(key="Introduction").save() assert transform_v4 == transform_v3 assert len(ln.Transform.filter(key="Introduction")) == 2 assert len(ln.Transform.filter(key="Introduction").filter(is_latest=True)) == 1 assert ln.Transform.get(key="Introduction") == transform_v3 assert ln.Transform.filter(key="Introduction").get(is_latest=True) == transform_v3 # test get assert ln.Transform.get(transform_v3.uid) == transform_v3 assert ln.Transform.get(transform_v3.id) == transform_v3 assert ln.Transform.get(transform_v3.uid[:-4]) == transform_v3 # test empty QuerySet assert ( ln.Transform.filter(key="IntroductionNotExists") .filter(is_latest=True) .one_or_none() is None ) # test soft delete transform_v3.delete() assert transform_v2.is_latest # test hard delete transform_v2.delete(permanent=True) assert ( transform_v1_retrieved := ln.Transform.get(transform_v3.uid[:-4]) ) == transform_v1 assert transform_v1_retrieved.is_latest # test soft delete on the last existing version does not change is_latest transform_v1_retrieved.delete() assert ( transform_v1_retrieved := ln.Transform.get(transform_v1.uid) ) == transform_v1 assert transform_v1_retrieved.is_latest # fully delete transform_v1.delete(permanent=True) # last object that exists is in the trash assert ln.Transform.get(transform_v3.uid[:-4]) == transform_v3 assert transform_v3.branch_id == -1 transform_v3.delete(permanent=True) def test_transform_versioning_across_branches_preserves_main_latest(): main_branch = ln.Branch.get(name="main") ln.setup.switch(main_branch.name) branch = ln.Branch(name="test_versioning_branch_latest").save() transform_v1 = ln.Transform( key="test-branch-aware-is-latest", source_code="main-v1", kind="pipeline", ).save() try: ln.setup.switch(branch.name) transform_v2 = ln.Transform( key="test-branch-aware-is-latest", revises=transform_v1, source_code="feature-v2", kind="pipeline", ).save() transform_v1.refresh_from_db() assert transform_v1.is_latest assert transform_v2.is_latest # Passing an older revises still increments from the family max uid. 
transform_v3 = ln.Transform( key="test-branch-aware-is-latest", revises=transform_v1, source_code="feature-v3", kind="pipeline", ).save() transform_v2.refresh_from_db() transform_v1.refresh_from_db() assert transform_v3.uid.endswith("0002") assert not transform_v2.is_latest assert transform_v3.is_latest assert transform_v1.is_latest finally: ln.setup.switch(main_branch.name) for uid in (transform_v1.uid[:-4],): for record in ln.Transform.objects.filter(uid__startswith=uid): record.delete(permanent=True) branch.delete(permanent=True) def test_path_rename(): # this is related to renames inside _add_to_version_family with open("test_new_path.txt", "w") as f: f.write("test_new_path") old_path = ln.UPath("s3://lamindata/.lamindb/test_new_path.txt") old_path.upload_from("./test_new_path.txt") assert old_path.exists() new_path = old_path.rename(old_path.with_name("test_new_path2.txt")) assert new_path.exists() assert new_path.as_posix() == "s3://lamindata/.lamindb/test_new_path2.txt" assert not old_path.exists() new_path.unlink() ln.UPath("./test_new_path.txt").unlink() def test_version_backward_compatibility(): """Test that queries using version= still work (backward compatibility).""" # Create transforms with different versions and source_code to avoid deduplication transform1 = ln.Transform( key="test-backward-compat", version="1.0", kind="pipeline", source_code="code1", ).save() transform2 = ln.Transform( key="test-backward-compat", version="2.0", kind="pipeline", source_code="code2", ).save() # Test that we can query using version= (old API) found = ln.Transform.get(key="test-backward-compat", version="1.0") assert found == transform1 assert found.version_tag == "1.0" assert found.version == "1.0" found = ln.Transform.get(key="test-backward-compat", version="2.0") assert found == transform2 assert found.version_tag == "2.0" assert found.version == "2.0" # Test filter with version= results = ln.Transform.filter(key="test-backward-compat", version="1.0") assert len(results) == 1 assert results.first() == transform1 # Test with Artifact artifact1 = ln.Artifact.from_dataframe( pd.DataFrame({"col1": [1, 2]}), key="test-artifact.parquet", version="1.0" ).save() artifact2 = ln.Artifact.from_dataframe( pd.DataFrame({"col1": [3, 4]}), key="test-artifact.parquet", version="2.0" ).save() found_artifact = ln.Artifact.get(key="test-artifact.parquet", version="1.0") assert found_artifact == artifact1 assert found_artifact.version_tag == "1.0" assert found_artifact.version == "1.0" found_artifact = ln.Artifact.get(key="test-artifact.parquet", version="2.0") assert found_artifact == artifact2 assert found_artifact.version_tag == "2.0" assert found_artifact.version == "2.0" # Cleanup transform1.delete(permanent=True) transform2.delete(permanent=True) artifact1.delete(permanent=True) artifact2.delete(permanent=True) def test_adjust_is_latest_when_deleting_is_versioned(): """Direct unit test for _adjust_is_latest_when_deleting_is_versioned (covers multiple promoted).""" # Build two version families, each with v1 (older) and v2 (latest) v1a = ln.Transform(key="Adjust latest family A").save() v2a = ln.Transform(revises=v1a, key="Adjust latest family A").save() v1b = ln.Transform(key="Adjust latest family B").save() v2b = ln.Transform(revises=v1b, key="Adjust latest family B").save() assert v2a.is_latest and v2b.is_latest assert not v1a.is_latest and not v1b.is_latest # Delete both latest → two promoted (covers "new latest ... 
versions: [...]" branch) promoted = _adjust_is_latest_when_deleting_is_versioned([v2a, v2b]) assert len(promoted) == 2 assert set(promoted) == {v1a.pk, v1b.pk} v1a.refresh_from_db() v1b.refresh_from_db() assert v1a.is_latest and v1b.is_latest # Edge case: empty list returns [] assert _adjust_is_latest_when_deleting_is_versioned([]) == [] # Clean up v2a.delete(permanent=True) v2b.delete(permanent=True) v1a.delete(permanent=True) v1b.delete(permanent=True) ================================================ FILE: tests/core/test_label_manager.py ================================================ from pathlib import Path import bionty as bt import lamindb as ln import pytest from _dataset_fixtures import ( # noqa get_mini_csv, ) from lamindb.errors import ValidationError from lamindb.models.artifact import add_labels @pytest.fixture(scope="module") def adata(): adata = ln.examples.datasets.anndata_with_obs() # add another column adata.obs["cell_type_by_expert"] = adata.obs["cell_type"] adata.obs.loc["obs0", "cell_type_by_expert"] = "B cell" return adata def test_labels_add(adata): label = ln.Record(name="Experiment 1") artifact = ln.Artifact.from_anndata(adata, description="test").save() experiment = ln.Feature(name="experiment", dtype=ln.Record) with pytest.raises(ValueError) as error: artifact.labels.add("experiment_1", experiment) assert ( error.exconly() == "ValueError: Please pass a record (a `SQLRecord` object), not a string, e.g.," " via: label = ln.Record(name='experiment_1')" ) with pytest.raises(ValidationError) as error: artifact.labels.add(label, experiment) assert "not validated. If it looks correct: record.save()" in error.exconly() label.save() with pytest.raises(TypeError) as error: artifact.labels.add(label, "experiment 1") with pytest.raises(ValidationError) as error: artifact.labels.add(label, feature=experiment) assert ( error.exconly() == "lamindb.errors.ValidationError: Feature not validated. 
If it looks" " correct: ln.Feature(name='experiment', type='cat[Record]').save()" ) experiment.save() # try to pass list of length zero artifact.labels.add([], feature=experiment) # now pass a single label artifact.labels.add(label, feature=experiment) # check that the feature was updated with type = "Record" feature = ln.Feature.get(name="experiment") assert feature._dtype_str == "cat[Record]" with pytest.raises(TypeError): experiments = artifact.labels.get("experiment") # check that the label is there, it's exactly one label with name "Experiment 1" experiments = artifact.labels.get(experiment) assert experiments.one().name == "Experiment 1" # try adding the same label again, nothing should happen artifact.labels.add(label, feature=experiment) # check that the label is there, it's exactly one label with name "Experiment 1" experiments = artifact.labels.get(experiment) assert experiments.get().name == "Experiment 1" # running from_values to load validated label records under the hood experiment = ln.Feature(name="experiment_with_reg", dtype="cat[Record]").save() ln.Record(name="Experiment 2").save() artifact.labels.add("Experiment 2", experiment) experiments = artifact.labels.get(experiment) assert experiments.get().name == "Experiment 2" # now, try adding a new label project = ln.Record(name="project 1").save() ln.Feature(name="project", dtype=ln.Record).save() features = ln.Feature.lookup() artifact.labels.add(project, feature=features.project) # check that the label is there, it's exactly one label with name "Experiment 1" projects = artifact.labels.get(features.project) assert projects.get().name == "project 1" # test add_from adata2 = adata.copy() adata2.uns["mutated"] = True artifact2 = ln.Artifact(adata2, description="My new artifact").save() artifact2.labels.add_from(artifact) experiments = artifact2.labels.get(experiment) assert experiments.get().name == "Experiment 2" artifact2.delete(permanent=True) artifact.delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) ln.Record.filter().delete(permanent=True) def test_labels_add_using_anndata(adata): organism = bt.Organism.from_source(name="mouse") cell_types = [bt.CellType(name=name) for name in adata.obs["cell_type"].unique()] ln.save(cell_types) inspector = bt.CellType.inspect(adata.obs["cell_type_by_expert"].unique()) ln.save([bt.CellType(name=name) for name in inspector.non_validated]) cell_types_from_expert = bt.CellType.from_values( adata.obs["cell_type_by_expert"].unique() ) actual_tissues = [bt.Tissue(name=name) for name in adata.obs["tissue"].unique()] organoid = ln.Record(name="organoid") tissues = actual_tissues + [organoid] ln.save(tissues) # clean up DB state organism_feature = ln.Feature.filter(name="organism").one_or_none() if organism_feature is not None: organism_feature.delete(permanent=True) artifact = ln.Artifact.filter(description="Mini adata").one_or_none() if artifact is not None: artifact.delete(permanent=True, storage=True) ln.Schema.filter().delete(permanent=True) # try to construct without registering metadata features artifact = ln.Artifact.from_anndata(adata, description="Mini adata") if not artifact._state.adding: artifact.delete(permanent=True) # make sure we get a fresh one artifact = ln.Artifact.from_anndata(adata, description="Mini adata") # add feature set without saving file feature_name_feature = ln.Feature(name="feature name", dtype="cat[Record]").save() schema = ln.Schema(features=[feature_name_feature]) with pytest.raises(ValueError) as 
error: artifact.features._add_schema(schema, slot="random") assert ( error.exconly() == "ValueError: Please save the artifact or collection before adding a feature" " set!" ) # now register features we want to validate # (we are not interested in cell_type_id, here) ln.Feature(name="cell_type", dtype=bt.CellType).save() ln.Feature(name="disease", dtype=ln.Record).save() ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save() artifact = ln.Artifact.from_anndata(adata, description="Mini adata") ln.Feature(name="organism", dtype=bt.Organism).save() features = ln.Feature.lookup() with pytest.raises(ValueError) as error: artifact.labels.add(organism, feature=features.organism) assert ( error.exconly() == "ValueError: Please save the artifact/collection before adding a label!" ) artifact.save() # now, we add organism and run checks features = ln.Feature.lookup() with pytest.raises(ln.errors.ValidationError): artifact.labels.add(organism, feature=features.organism) organism.save() artifact.labels.add(organism, feature=features.organism) organism_link = artifact.links_organism.first() assert organism_link.organism.name == "mouse" assert organism_link.feature.name == "organism" feature = ln.Feature.get(name="organism") assert feature._dtype_str == "cat[bionty.Organism]" # now we add cell types & tissues and run checks ln.Feature(name="cell_type", dtype=bt.CellType).save() ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save() add_labels(artifact, cell_types, feature=features.cell_type, from_curator=True) add_labels( artifact, cell_types_from_expert, feature=features.cell_type_by_expert, from_curator=True, ) feature_tissue_simple = ln.Feature(name="tissue_simple", dtype=bt.Tissue).save() with pytest.raises(ValidationError) as err: add_labels(artifact, tissues, feature=feature_tissue_simple, from_curator=True) assert ( err.exconly() == "lamindb.errors.ValidationError: Label type Record is not valid for Feature(name='tissue_simple', dtype='cat[bionty.Tissue]'), consider a feature with dtype='cat[bionty.Tissue|Record]'" ) tissue = ln.Feature(name="tissue", dtype="cat[bionty.Tissue|Record]").save() add_labels(artifact, tissues, feature=tissue, from_curator=True) feature = ln.Feature.get(name="cell_type") assert feature._dtype_str == "cat[bionty.CellType]" feature = ln.Feature.get(name="cell_type_by_expert") assert feature._dtype_str == "cat[bionty.CellType]" feature = ln.Feature.get(name="tissue") assert feature._dtype_str == "cat[bionty.Tissue|Record]" diseases = [ln.Record(name=name) for name in adata.obs["disease"].unique()] ln.save(diseases) add_labels(artifact, diseases, feature=features.disease, from_curator=True) # now, let's add another feature to ext experiment_1 = ln.Record(name="experiment_1").save() ln.Feature(name="experiment", dtype=ln.Record).save() features = ln.Feature.lookup() artifact.labels.add(experiment_1, feature=features.experiment) assert set(artifact.labels.get(features.experiment).to_list("name")) == { "experiment_1" } assert set(artifact.labels.get(features.disease).to_list("name")) == { "chronic kidney disease", "Alzheimer disease", "liver lymphoma", "cardiac ventricle disorder", } assert set(artifact.labels.get(features.organism).to_list("name")) == {"mouse"} assert set( artifact.labels.get(features.tissue)["bionty.Tissue"].to_list("name") ) == { "liver", "heart", "kidney", "brain", } assert set(artifact.labels.get(features.tissue)["Record"].to_list("name")) == { "organoid", } # currently, we can't stratify the two cases below assert 
set(artifact.labels.get(features.cell_type).to_list("name")) == { "T cell", "my new cell type", "hepatocyte", "hematopoietic stem cell", "B cell", } assert set(artifact.labels.get(features.cell_type, flat_names=True)) == { "T cell", "my new cell type", "hepatocyte", "hematopoietic stem cell", "B cell", } assert set(artifact.labels.get(features.cell_type_by_expert).to_list("name")) == { "T cell", "my new cell type", "hepatocyte", "hematopoietic stem cell", "B cell", } assert experiment_1 in artifact.records.all() # call describe artifact.describe() # clean up artifact.delete(permanent=True) bt.Gene.filter().delete(permanent=True) bt.Organism.filter().delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) bt.CellType.filter().delete(permanent=True) bt.Tissue.filter().delete(permanent=True) bt.Disease.filter().delete(permanent=True) ln.Record.filter().delete(permanent=True) def test_labels_get(get_mini_csv: Path): # noqa: F811 artifact = ln.Artifact(get_mini_csv, description="test") # feature doesn't exist with pytest.raises(TypeError): artifact.labels.get("x") # type: ignore # no linked labels feature_name_feature = ln.Feature(name="feature name", dtype=ln.ULabel).save() schema = ln.Schema(features=[feature_name_feature]).save() artifact.save() # test for deprecated add_schema artifact.features._add_schema(schema, slot="random") assert artifact.schemas.first() == schema artifact.delete(permanent=True, storage=True) schema.delete(permanent=True) feature_name_feature.delete(permanent=True) @pytest.fixture def get_test_artifacts(): with open("./default_storage_unit_core/test-inherit1", "w") as f: f.write("artifact1") with open("./default_storage_unit_core/test-inherit2", "w") as f: f.write("artifact2") artifact1 = ln.Artifact("./default_storage_unit_core/test-inherit1") artifact1.save() artifact2 = ln.Artifact("./default_storage_unit_core/test-inherit2") artifact2.save() yield artifact1, artifact2 artifact1.delete(permanent=True, storage=True) artifact2.delete(permanent=True, storage=True) def test_add_from(get_test_artifacts): artifact1, artifact2 = get_test_artifacts label_names = [f"Project {i}" for i in range(3)] records = [ln.Record(name=label_name) for label_name in label_names] ln.save(records) cell_line_names = [f"Cell line {i}" for i in range(3)] cell_lines = [bt.CellLine(name=name) for name in cell_line_names] ln.save(cell_lines) # pass a list of length 0 artifact2.labels.add([]) # now actually pass the labels artifact2.labels.add(records) # here test add without passing a feature artifact2.labels.add(cell_lines) assert artifact2.cell_lines.count() == len(cell_lines) assert artifact1.records.exists() is False artifact1.labels.add_from(artifact2) assert artifact1.records.count() == artifact2.records.count() assert artifact1.cell_lines.count() == artifact2.cell_lines.count() artifact2.cell_lines.remove(*cell_lines) artifact1.cell_lines.remove(*cell_lines) artifact2.records.remove(*records) artifact1.records.remove(*records) for record in records: record.delete(permanent=True) for cell_line in cell_lines: cell_line.delete(permanent=True) ================================================ FILE: tests/core/test_load.py ================================================ from pathlib import Path import anndata as ad import lamindb as ln import pandas as pd import pytest # ruff: noqa: F811 from _dataset_fixtures import get_small_mdata, get_small_sdata # noqa @pytest.fixture(scope="module") def zip_file(): filepath = Path("test.zip") with 
open(filepath, "w") as f: f.write("some") yield filepath filepath.unlink() @pytest.fixture(scope="module") def html_filepath(): filepath = Path("./tmp.html") with open(filepath, "w") as f: f.write("

Test

") yield filepath filepath.unlink() @pytest.fixture(scope="module") def json_filepath(): filepath = Path("./tmp.json") with open(filepath, "w") as f: f.write('{"a": 1}') yield filepath filepath.unlink() @pytest.fixture(scope="module") def csv_filepath(): filepath = Path("./tmp.csv") with open(filepath, "w") as f: f.write("a,b\n1,2") yield filepath filepath.unlink() @pytest.fixture(scope="module") def tsv_filepath(): filepath = Path("./tmp.tsv") with open(filepath, "w") as f: f.write("a\tb\n1\t2") yield filepath filepath.unlink() @pytest.fixture(scope="module") def parquet_filepath(): filepath = Path("./tmp.parquet") df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) df.to_parquet(filepath) yield filepath filepath.unlink() @pytest.fixture(scope="module") def yaml_filepath(): filepath = Path("./tmp.yaml") with open(filepath, "w") as f: f.write("a: 1\nb: 2") yield filepath filepath.unlink() @pytest.fixture(scope="module") def image_filepath(): filepath = Path("./tmp.png") with open(filepath, "w") as f: f.write("mock image") yield filepath filepath.unlink() @pytest.fixture(scope="module") def svg_filepath(): filepath = Path("./tmp.svg") with open(filepath, "w") as f: f.write("") yield filepath filepath.unlink() @pytest.fixture(scope="module") def rds_filepath(): filepath = Path("./tmp.rds") with open(filepath, "w") as f: f.write("mock rds") yield filepath filepath.unlink() @pytest.fixture(scope="module") def local_anndata_filepath(): return ln.examples.datasets.anndata_file_pbmc68k_test().resolve() @pytest.fixture(scope="module") def adata(local_anndata_filepath): return ad.read_h5ad(local_anndata_filepath) def test_load_anndata(local_anndata_filepath, adata): artifact = ln.Artifact(local_anndata_filepath, description="test") assert local_anndata_filepath == artifact._local_filepath assert local_anndata_filepath == artifact.path assert local_anndata_filepath == artifact.cache() artifact = ln.Artifact.from_anndata(adata, description="test") assert artifact._memory_rep is adata assert artifact.load() is adata assert artifact._local_filepath.resolve() == artifact.cache() == artifact.path def test_load_mudata(get_small_mdata): artifact = ln.Artifact.from_mudata(get_small_mdata, description="test") assert artifact._memory_rep is get_small_mdata assert artifact.load() is get_small_mdata assert artifact._local_filepath.resolve() == artifact.cache() == artifact.path def test_load_spatialdata(get_small_sdata): artifact = ln.Artifact.from_spatialdata(get_small_sdata, description="test") assert artifact._memory_rep is get_small_sdata assert artifact.load() is get_small_sdata assert artifact._local_filepath.resolve() == artifact.cache() == artifact.path def load_blobs__repr__(): example_blobs_sdata = ln.examples.datasets.spatialdata_blobs() blobs_af = ln.Artifact.from_spatialdata( example_blobs_sdata, key="example_blobs.zarr" ).save() example_blobs_sdata = blobs_af.load() # Must exist and not throw errors assert example_blobs_sdata.__repr__ def test_load_html(html_filepath): artifact = ln.Artifact(html_filepath, key=str(html_filepath)) artifact.load() def test_load_json(json_filepath): artifact = ln.Artifact(json_filepath, key=str(json_filepath)) dictionary = artifact.load() assert dictionary["a"] == 1 def test_no_loader(zip_file): artifact = ln.Artifact(zip_file, key=str(zip_file)) with pytest.raises(NotImplementedError): artifact.load() def test_load_csv(csv_filepath): artifact = ln.Artifact(csv_filepath, key=str(csv_filepath)) df = artifact.load() assert df.iloc[0, 0] == 1 assert df.iloc[0, 1] == 2 def 
test_load_tsv(tsv_filepath): artifact = ln.Artifact(tsv_filepath, key=str(tsv_filepath)) df = artifact.load() assert df.iloc[0, 0] == 1 assert df.iloc[0, 1] == 2 def test_load_parquet(parquet_filepath): artifact = ln.Artifact(parquet_filepath, key=str(parquet_filepath)) df = artifact.load() assert df.iloc[0, 0] == 1 assert df.iloc[1, 1] == 4 def test_load_yaml(yaml_filepath): artifact = ln.Artifact(yaml_filepath, key=str(yaml_filepath)) data = artifact.load() assert data["a"] == 1 assert data["b"] == 2 def test_load_image(image_filepath): artifact = ln.Artifact(image_filepath, key=str(image_filepath)) result = artifact.load() assert Path(result).name == image_filepath.name def test_load_svg(svg_filepath): artifact = ln.Artifact(svg_filepath, key=str(svg_filepath)) result = artifact.load() assert Path(result).name == svg_filepath.name def test_load_rds(rds_filepath, ccaplog): artifact = ln.Artifact(rds_filepath, key=str(rds_filepath)) result = artifact.load() assert "Please use `laminr` to load `.rds` files" in ccaplog.text assert Path(result).name == rds_filepath.name ================================================ FILE: tests/core/test_manager.py ================================================ import lamindb as ln def test_manager_list(): label = ln.Record(name="manager label") label.save() label_names = [f"Record {i}" for i in range(3)] labels = [ln.Record(name=name) for name in label_names] ln.save(labels) label.parents.set(labels) assert len(label.parents.to_list()) == 3 assert "Record 1" in label.parents.to_list("name") label.delete(permanent=True) for label in labels: label.delete(permanent=True) ================================================ FILE: tests/core/test_merge.py ================================================ """Tests for ln.setup.merge.""" import lamindb as ln import pytest def test_merge_branch_into_main(): """Merge a branch into main: create branch, add ULabel, switch to main, merge.""" branch = ln.Branch(name="test_merge_branch").save() assert branch.status == "standalone" ln.setup.switch(branch.name) assert ln.setup.settings.branch == branch assert ln.setup.settings.branch.name == "test_merge_branch" ulabel = ln.ULabel(name="test_merge_record").save() assert ulabel.branch == branch assert ulabel.created_on == branch # created_on set to creation branch ln.setup.switch("main") assert ln.setup.settings.branch.name == "main" assert ln.setup.settings.branch.status == "standalone" assert ln.ULabel.filter(name="test_merge_record").count() == 0 ln.setup.merge("test_merge_branch") assert ln.ULabel.filter(name="test_merge_record").count() == 1 ulabel = ln.ULabel.get(name="test_merge_record") assert ulabel.branch.name == "main" # created_on still points to the branch on which the record was created assert ulabel.created_on == branch assert ulabel.created_on.name == "test_merge_branch" # merged branch has status "merged" branch.refresh_from_db() assert branch.status == "merged" # this is a merge call to check that branch.describe() works because it # has a custom describe method branch.describe(return_str=True) # Clean up ulabel.delete(permanent=True) branch.delete(permanent=True) ln.setup.switch("main") def test_branch_status_values(): """Branch status maps codes onto standalone/draft/review/merged/closed.""" main_branch = ln.Branch.get(name="main") assert main_branch.status == "standalone" archive_branch = ln.Branch.get(name="archive") assert archive_branch.status == "standalone" trash_branch = ln.Branch.get(name="trash") assert trash_branch.status == "standalone" # 
User-created branch is standalone by default. branch = ln.Branch(name="test_status_branch").save() assert branch.status == "standalone" branch.status = "draft" branch.save() branch.refresh_from_db() assert branch.status == "draft" branch.status = "review" branch.save() branch.refresh_from_db() assert branch.status == "review" branch.status = "closed" branch.save() branch.refresh_from_db() assert branch.status == "closed" branch.delete(permanent=True) def test_draft_review_and_close_merge_request_status(): branch = ln.Branch(name="test_mr_draft_review_close").save() assert branch.status == "standalone" branch.status = "draft" branch.save() branch.refresh_from_db() assert branch.status == "draft" branch.status = "review" branch.save() branch.refresh_from_db() assert branch.status == "review" branch.status = "closed" branch.save() branch.refresh_from_db() assert branch.status == "closed" branch.delete(permanent=True) def test_merge_nonexistent_branch_raises(): """Merge a non-existent branch raises ObjectDoesNotExist.""" with pytest.raises(ln.errors.ObjectDoesNotExist) as exc_info: ln.setup.merge("nonexistent_branch_xyz") assert "not found" in str(exc_info.value).lower() def test_merge_reconciles_is_latest_for_versioned_records(): main_branch = ln.Branch.get(name="main") ln.setup.switch(main_branch.name) transform_v1 = ln.Transform( key="test-merge-is-latest", source_code="main-v1", kind="pipeline", ).save() branch = ln.Branch(name="test_merge_latest_branch").save() ln.setup.switch(branch.name) transform_v2 = ln.Transform( key="test-merge-is-latest", revises=transform_v1, source_code="feature-v2", kind="pipeline", ).save() transform_v1.refresh_from_db() assert transform_v1.is_latest assert transform_v2.is_latest ln.setup.switch(main_branch.name) ln.setup.merge(branch.name) family = ln.Transform.objects.filter( uid__startswith=transform_v1.uid[:-4], branch_id=1 ) assert family.filter(is_latest=True).count() == 1 assert family.get(is_latest=True).uid == transform_v2.uid for record in family: record.delete(permanent=True) branch.delete(permanent=True) def test_merge_updates_recordblock_branch(): main_branch = ln.Branch.get(name="main") ln.setup.switch(main_branch.name) source_branch = ln.Branch(name="test_merge_recordblock_branch").save() ln.setup.switch(source_branch.name) record = ln.Record(name="recordblock-merge-record").save() block = ln.models.RecordBlock( record=record, content="recordblock merge content", kind="readme", branch=source_branch, created_on=source_branch, ).save() assert block.branch == source_branch assert block.created_on == source_branch ln.setup.switch(main_branch.name) ln.setup.merge(source_branch.name) block.refresh_from_db() assert block.branch.name == "main" assert block.created_on == source_branch record.delete(permanent=True) source_branch.delete(permanent=True) ================================================ FILE: tests/core/test_nbconvert.py ================================================ import os def test_nbconvert(): exit_code = os.system( # noqa: S605 "jupyter nbconvert --to notebook --inplace --execute ./tests/core/notebooks/load_schema.ipynb" ) assert exit_code == 0 ================================================ FILE: tests/core/test_notebooks.py ================================================ import os import subprocess from pathlib import Path import lamindb as ln import nbproject_test notebook_dir = Path(__file__).parent / "notebooks/" notebook_dir_duplicate = Path(__file__).parent / "notebooks/duplicate/" def test_all_notebooks(): 
nbproject_test.execute_notebooks(notebook_dir) nbproject_test.execute_notebooks(notebook_dir_duplicate) def test_run_after_rename_no_uid(): notebook_path = ( notebook_dir / "with-title-initialized-consecutive-finish-not-last-cell.ipynb" ) result = subprocess.run( # noqa: S602 f"jupyter nbconvert --to notebook --inplace --execute {notebook_path}", shell=True, capture_output=True, ) print(result.stdout.decode()) print(result.stderr.decode()) assert result.returncode == 0 uid = ln.Transform.get( key="with-title-initialized-consecutive-finish-not-last-cell.ipynb" ).uid # now, assume the user renames the notebook new_path = notebook_path.with_name("no-uid-renamed.ipynb") os.system(f"cp {notebook_path} {new_path}") # noqa: S605 result = subprocess.run( # noqa: S602 f"jupyter nbconvert --to notebook --inplace --execute {new_path}", shell=True, capture_output=True, ) print(result.stdout.decode()) print(result.stderr.decode()) assert result.returncode == 0 assert ln.Transform.get(key="no-uid-renamed.ipynb").uid == uid # new_path.unlink() ================================================ FILE: tests/core/test_querydb.py ================================================ import lamindb as ln import pytest def test_DB_multiple_instances(): """Accessing multiple instances simultaneously must work.""" cxg_db = ln.DB("laminlabs/cellxgene") lamindata_db = ln.DB("laminlabs/lamindata") qs1 = cxg_db.Artifact.filter(suffix=".h5ad") qs2 = lamindata_db.Artifact.filter(suffix=".zarr") assert qs1._db != qs2._db def test_DB_bionty(): """Querying a record from bionty must work.""" cxg_db = ln.DB("laminlabs/cellxgene") assert len(cxg_db.bionty.Gene.filter(symbol__startswith="TP53")) > 0 def test_DB_missing_module(): """Attempting to access an attribute that comes from a missing module must error.""" site_assets_db = ln.DB("laminlabs/lamin-site-assets") # instance without bionty with pytest.raises(AttributeError) as e: site_assets_db.bionty.Gene.first() assert ( "Schema 'bionty' not available in instance 'laminlabs/lamin-site-assets'." in str(e.value) ) def test_DB_instantiate_class(): """Attempting to instantiate a class must error.""" cxg_db = ln.DB("laminlabs/cellxgene") with pytest.raises(TypeError) as e: cxg_db.Artifact() assert ( "Cannot instantiate Artifact from DB. Use Artifact.filter(), Artifact.get(), etc. to query records." 
in str(e.value) ) @pytest.mark.parametrize( "attr,expected_msg", [ ("artifacts", "Registry 'artifacts' not found"), ("foo", "Registry 'foo' not found"), ("celltype", "Registry 'celltype' not found"), ], ) def test_DB_rejects_invalid_attributes(attr, expected_msg): """Accessing invalid attributes must fail.""" cxg_db = ln.DB("laminlabs/cellxgene") with pytest.raises(AttributeError) as e: getattr(cxg_db, attr) assert expected_msg in str(e.value) def test_DB_cache(): """Subsequent accesses must return cached wrapper.""" cxg_db = ln.DB("laminlabs/cellxgene") artifact1 = cxg_db.Artifact artifact2 = cxg_db.Artifact assert artifact1 is artifact2 def test_queryset_caching(): """Calling `.filter()` multiple times should return different results.""" cxg_db = ln.DB("laminlabs/cellxgene") res_1 = cxg_db.Artifact.filter().first() res_2 = cxg_db.Artifact.filter().last() assert res_1 != res_2 def test_DB_dir(): """__dir__ must return discovered registries.""" cxg = ln.DB("laminlabs/cellxgene") dir_result = dir(cxg) assert "Artifact" in dir_result assert "Collection" in dir_result assert "Gene" not in dir_result assert "bionty" in dir_result ================================================ FILE: tests/core/test_queryset.py ================================================ import re import textwrap from contextlib import contextmanager import bionty as bt import lamindb as ln import pytest from django.core.exceptions import FieldError from lamindb.base.users import current_user_id from lamindb.errors import InvalidArgument from lamindb.models import ArtifactSet, BasicQuerySet, QuerySet # please also see the test_curate_df.py tests def test_to_dataframe(): project_label = ln.Record(name="project").save() project_names = [f"Project {i}" for i in range(3)] labels = ln.Record.from_values(project_names, create=True).save() project_label.children.add(*labels) df = ln.Record.to_dataframe(include="parents__name") assert df.columns[2] == "parents__name" assert df["parents__name"].iloc[0] == {project_label.name} df = ln.Record.to_dataframe(include=["parents__name", "parents__created_by_id"]) assert df.columns[3] == "parents__created_by_id" assert df["parents__name"].iloc[0] == {project_label.name} assert set(df["parents__created_by_id"].iloc[0]) == {current_user_id()} # for other models feature_names = [f"Feature {i}" for i in range(3)] features = [ln.Feature(name=name, dtype=int) for name in feature_names] ln.save(features) schema = ln.Schema(features, name="my schema").save() schema.features.set(features) df = ln.Schema.filter(name="my schema").to_dataframe(include="features__name") assert df.columns[2] == "features__name" # order is not conserved assert set(df["features__name"].iloc[0]) == set(feature_names) # pass a list df = ln.Schema.filter(name="my schema").to_dataframe( include=["features__name", "features__created_by_id"] ) assert df.columns[3] == "features__created_by_id" assert set(df["features__name"].iloc[0]) == set(feature_names) assert set(df["features__created_by_id"].iloc[0]) == {current_user_id()} # inner join parents on features df = ln.Schema.filter().to_dataframe( include=["features__name", "features__created_by_id"] ) assert set(df["features__name"].iloc[0]) == set(feature_names) assert set(df["features__created_by_id"].iloc[0]) == {current_user_id()} # raise error for non many-to-many df = ln.Record.filter(name="Project 0").to_dataframe(include="created_by__name") assert df["created_by__name"].iloc[0] == ln.setup.settings.user.name # do not return fields with no data in the registry # does not 
make sense in Alex's opinion # too much magic; got removed in https://github.com/laminlabs/lamindb/pull/2238 # df = ( # ln.Artifact.connect("laminlabs/cellxgene") # .filter(suffix=".h5ad") # .to_dataframe(include=["tissues__name", "pathways__name"]) # ) # assert "tissues__name" in df.columns # assert "pathways__name" not in df.columns # assert df.shape[0] > 0 # clean up project_label.delete(permanent=True) for label in labels: label.delete(permanent=True) schema.delete(permanent=True) for feature in features: feature.delete(permanent=True) # call it from a non-select-derived queryset qs = ln.User.objects.all() assert qs.to_dataframe().iloc[0]["handle"] == ln.setup.settings.user.handle def test_complex_df_with_features(): # should not fail ln.Artifact.connect("laminlabs/lamindata").to_dataframe(include="features") ln.Run.connect("laminlabs/lamindata").to_dataframe(include="features") ln.Artifact.connect("laminlabs/lamindata").to_dataframe(features="queryset") def test_run_to_dataframe_includes_json_features(): transform = ln.Transform(key="test_run_to_dataframe_includes_json_features").save() run = ln.Run(transform=transform).save() feature = ln.Feature(name="run_json_feature", dtype=str).save() run.features.set_values({"run_json_feature": "hello"}) df = ln.Run.filter(id=run.id).to_dataframe(include="features") assert "run_json_feature" in df.columns assert df["run_json_feature"].iloc[0] == "hello" run.delete(permanent=True) transform.delete(permanent=True) feature.delete(permanent=True) def test_one_first(): qs = ln.User.objects.all() assert qs.one().handle == ln.setup.settings.user.handle assert qs.first().handle == ln.setup.settings.user.handle assert qs.one_or_none().handle == ln.setup.settings.user.handle description = textwrap.dedent("""\ User Simple fields """).strip() assert qs.describe(return_str=True).startswith(description) qs = ln.User.filter(handle="test") with pytest.raises(ln.errors.ObjectDoesNotExist): qs.one() qs = bt.Source.filter() with pytest.raises(ln.errors.MultipleObjectsReturned): qs.one() with pytest.raises(ln.errors.MultipleObjectsReturned): qs.one_or_none() def test_filter_related_field_name(): with pytest.raises( FieldError, match=re.escape( "Invalid lookup 'somelabel' for records. Did you mean records__name?" ), ): ln.Artifact.filter(records="somelabel") def test_filter_unknown_field(): with pytest.raises(InvalidArgument) as e: ln.Artifact.filter(nonexistent="value") assert "You can query either by available fields" in str(e) def test_filter_status_field(): transform = ln.Transform(key="test_filter_status_field").save() run = ln.Run(transform).save() run._status_code = 0 run.save(update_fields=["_status_code"]) assert ln.Run.filter(status="completed").count() >= 1 branch = ln.Branch(name="test_filter_status_branch").save() branch.status = "review" branch.save() assert ln.Branch.filter(status="review").count() >= 1 project = ln.Project(name="test_filter_status_project").save() project._status_code = 2 project.save(update_fields=["_status_code"]) assert ln.Project.filter(status=2).count() >= 1 run.delete(permanent=True) transform.delete(permanent=True) project.delete(permanent=True) branch.delete() def test_get_id_type_error(): with pytest.raises( ValueError, match=re.escape("Field 'id' expected a number but got 'abc'.") ): ln.Artifact.get(id="abc") def test_get_related_field_name(): with pytest.raises( FieldError, match=re.escape( "Invalid lookup 'somelabel' for records. Did you mean records__name?" 
), ): ln.Artifact.get(records="somelabel") def test_get_unknown_field(): with pytest.raises(FieldError) as e: ln.Artifact.get(nonexistent="value") assert "Unknown field 'nonexistent'. Available fields:" in str(e) def test_search(): label_names = [f"Record {i}" for i in range(3)] labels = [ln.Record(name=name) for name in label_names] ln.save(labels) qs = ln.Record.filter(name__startswith="Record") assert qs.search("Record 1")[0].name == "Record 1" assert qs.search("Record 1", field=ln.Record.name)[0].name == "Record 1" for label in labels: label.delete(permanent=True) def test_lookup(): qs = ln.User.filter(handle="testuser1") # pass str to field lookup = qs.lookup(field="handle") assert lookup.testuser1.handle == "testuser1" # pass StrField to field lookup = qs.lookup(field=ln.User.handle) assert lookup.testuser1.handle == "testuser1" # manager, default field qsm = ln.User.filter(handle="testuser1") lookup = qsm.lookup() assert lookup.testuser1.handle == "testuser1" def test_inspect(): qs = ln.User.filter(handle="testuser1") assert qs.inspect(["user1", "user2"], "name")["validated"] == [] assert ln.User.inspect(["user1", "user2"], "name")["validated"] == [] assert ln.User.inspect(["user1", "user2"], ln.User.name)["validated"] == [] assert ln.User.inspect("user1", "name")["validated"] == [] def test_validate(): qs = ln.User.filter(handle="testuser1") assert qs.validate(["testuser1", "Test User1"], "handle").tolist() == [True, False] assert ln.User.validate(["testuser1", "Test User1"], "handle").tolist() == [ True, False, ] assert ln.User.validate(["testuser1", "Test User1"], ln.User.handle).tolist() == [ True, False, ] # returns True assert ln.User.validate("testuser1", ln.User.handle) def test_standardize(): qs = ln.User.filter(handle="testuser1") assert qs.standardize(["user1", "user2"]) == ["user1", "user2"] def test_get_doesnotexist_error(): non_existent_label = "some-label-name" with pytest.raises(ln.errors.ObjectDoesNotExist) as excinfo: ln.Record.get(non_existent_label) error_message = str(excinfo.value) assert f"No record found with uid '{non_existent_label}'" in error_message assert ( f"Did you forget a keyword as in Record.get(name='{non_existent_label}')?" 
in error_message ) @contextmanager def set_branch(branch: ln.Branch): try: ln.setup.settings.branch = branch yield branch finally: ln.setup.settings._branch = None ln.setup.settings._branch_path.unlink(missing_ok=True) def test_get_filter_branch(): branch = ln.Branch(name="test_branch").save() artifact = ln.Artifact.from_dataframe( ln.User.to_dataframe(), key="df_test_get.parquet" ) artifact.branch = branch artifact.save() # switch to branch "test_branch" with set_branch(branch): # errors if doesn't find or multiple records found ln.Artifact.get(key="df_test_get.parquet") assert ln.Artifact.filter(key="df_test_get.parquet").count() == 1 # back to main branch with pytest.raises(ln.errors.ObjectDoesNotExist): ln.Artifact.get(key="df_test_get.parquet") assert ln.Artifact.filter(key="df_test_get.parquet").count() == 0 # test by passing branch directly assert ( ln.Artifact.filter( branch=branch, key="df_test_get.parquet", ).count() == 1 ) assert ( ln.Artifact.filter(branch_id=branch.id, key="df_test_get.parquet").count() == 1 ) assert ( ln.Artifact.filter(ln.Q(branch=branch), key="df_test_get.parquet").count() == 1 ) assert ( ln.Artifact.filter(ln.Q(branch_id=branch.id), key="df_test_get.parquet").count() == 1 ) # errors if doesn't find or multiple records found ln.Artifact.get(key="df_test_get.parquet", branch=branch) ln.Artifact.get(key="df_test_get.parquet", branch_id=branch.id) ln.Artifact.get(key="df_test_get.parquet", branch__in=[branch]) ln.Artifact.get(key="df_test_get.parquet", branch_id__in=[branch.id]) ln.Artifact.get(key="df_test_get.parquet", branch=None) ln.Artifact.get(key="df_test_get.parquet", branch_id=None) ln.Artifact.get(artifact.id) ln.Artifact.get(id=artifact.id) ln.Artifact.get(id__in=[artifact.id]) ln.Artifact.get(artifact.uid[:5]) ln.Artifact.get(uid=artifact.uid) ln.Artifact.get(uid__in=[artifact.uid]) ln.Artifact.get(hash=artifact.hash) ln.Artifact.get(hash__in=[artifact.hash]) artifact.delete(permanent=True) branch.delete() def test_to_class(): qs = ln.Artifact.filter() assert isinstance(qs, QuerySet) assert isinstance(qs, ArtifactSet) qs_copy = qs._to_non_basic(copy=True) assert isinstance(qs_copy, QuerySet) assert isinstance(qs_copy, ArtifactSet) qs_basic = qs._to_basic(copy=True) assert isinstance(qs_basic, BasicQuerySet) assert isinstance(qs_basic, ArtifactSet) assert not isinstance(qs_basic, QuerySet) qs_basic._to_non_basic(copy=False) assert isinstance(qs_basic, QuerySet) assert isinstance(qs_basic, ArtifactSet) def test_queryset_soft_delete_error(): with pytest.raises(ValueError): ln.Storage.filter().delete(permanent=False) with pytest.raises(ValueError): ln.Branch.filter().delete(permanent=False) def test_encode_lamindb_fields_as_columns(): from lamindb.models.query_set import encode_lamindb_fields_as_columns assert encode_lamindb_fields_as_columns( ln.Artifact, ["uid", "name", "created_by", "key", "tissues"] ) == { "uid": "__lamindb_artifact_uid__", "created_by": "__lamindb_artifact_created_by__", "key": "__lamindb_artifact_key__", } assert encode_lamindb_fields_as_columns( ln.Record, ["uid", "name", "created_by", "key", "tissues"] ) == { "uid": "__lamindb_record_uid__", "name": "__lamindb_record_name__", "created_by": "__lamindb_record_created_by__", } # def test_connect_public_clone_instance(): # # become an anonymous user # ln_setup.logout() # try: # from django.db import connections # connections.databases.pop("laminlabs/arc-virtual-cell-atlas", None) # qs = ln.Artifact.connect("laminlabs/arc-virtual-cell-atlas") # assert qs.db == 
"laminlabs/arc-virtual-cell-atlas" # # Verify the connection is SQLite, not Postgres # assert ( # "sqlite" # in connections.databases["laminlabs/arc-virtual-cell-atlas"]["ENGINE"] # ) # # Verify we can actually query it # result = qs.filter().first() # assert result is not None # finally: # # log back in to ensure that other tests do not break # login_testuser2(session=None) # login_testuser1(session=None) # ln_setup.connect("lamindb-unit-tests-core") ================================================ FILE: tests/core/test_record_basics.py ================================================ import os import re from datetime import date, datetime import bionty as bt import lamindb as ln import pandas as pd import pytest from django.db import IntegrityError from lamindb.errors import FieldValidationError from lamindb.models.record import IMPORTS_UID, SCHEMA_IMPORTS_UID def test_record_docstring_examples(): # create a feature if you don't yet have one gc_content = ln.Feature(name="gc_content", dtype=float).save() # create a record to track a sample sample1 = ln.Record(name="Sample 1", features={"gc_content": 0.5}).save() # describe the record sample1.describe() # create a flexible record type to track experiments experiment_type = ln.Record(name="Experiment", is_type=True).save() experiment1 = ln.Record(name="Experiment 1", type=experiment_type).save() # create a feature to link experiments experiment = ln.Feature(name="experiment", dtype=experiment_type).save() # create a record type to track samples that's constrained with a schema schema = ln.Schema( [experiment, gc_content.with_config(optional=True)], name="sample_schema" ).save() sample_sheet = ln.Record(name="Sample Sheet", is_type=True, schema=schema).save() # group the sample1 record under the sample sheet sample1.type = sample_sheet sample1.save() # reset the feature values for the record including the experiment sample1.features.set_values( { "gc_content": 0.5, "experiment": "Experiment 1", # automatically resolves by name, also accepts the experiment1 object } ) # Export all records under a type to a dataframe df = experiment_type.to_dataframe() assert "Experiment 1" in df["__lamindb_record_name__"].values # If you try to set incomplete features in a record in a sheet, you'll get a validation error sample2 = ln.Record(name="Sample 2", type=sample_sheet).save() with pytest.raises(ln.errors.ValidationError): sample2.features.set_values({"gc_content": 0.6}) # Query records by features assert ln.Record.filter(gc_content=0.5).one() == sample1 assert ln.Record.filter(gc_content__gt=0.4).one() == sample1 assert ln.Record.filter(type=sample_sheet).count() >= 1 # Clean up sample1.delete(permanent=True) sample2.delete(permanent=True) experiment1.delete(permanent=True) sample_sheet.delete(permanent=True) schema.delete(permanent=True) experiment_type.delete(permanent=True) gc_content.delete(permanent=True) experiment.delete(permanent=True) def test_record_initialization(): with pytest.raises( FieldValidationError, match=re.escape( "Only name, type, is_type, features, description, schema, reference, reference_type are valid keyword arguments" ), ): ln.Record(x=1) with pytest.raises(ValueError) as error: ln.Record(1) assert error.exconly() == "ValueError: Only one non-keyword arg allowed" def test_record_lazy_features_on_save(): score_feature = ln.Feature(name="lazy_score", dtype=float).save() record = ln.Record(name="lazy-record", features={"lazy_score": 0.7}).save() assert not hasattr(record, "_features") assert 
ln.Record.filter(lazy_score=0.7).one().id == record.id record.delete(permanent=True) score_feature.delete(permanent=True) def test_record_from_dataframe_bulk_save_paths(): score = ln.Feature(name="from-df-score", dtype=float).save() schema = ln.Schema([score], name="from-df-schema").save() sheet = ln.Record(name="from-df-sheet", is_type=True, schema=schema).save() df = pd.DataFrame( { "__lamindb_record_name__": ["from-df-a", "from-df-b"], "from-df-score": [1.0, 2.0], } ) records = ln.Record.from_dataframe(df, type=sheet) assert len(records) == 2 records.save() assert ln.Record.get(name="from-df-a").features.get_values()["from-df-score"] == 1.0 df2 = pd.DataFrame( { "__lamindb_record_name__": ["from-df-c"], "from-df-score": [3.0], } ) records_2 = ln.Record.from_dataframe(df2, type=sheet) records_2.save() assert ln.Record.get(name="from-df-c").features.get_values()["from-df-score"] == 3.0 ln.Record.filter(name__in=["from-df-a", "from-df-b", "from-df-c"]).delete( permanent=True ) ln.Record.filter(name="from-df-sheet").delete(permanent=True) schema.delete(permanent=True) score.delete(permanent=True) def test_record_from_dataframe_requires_named_type(): df = pd.DataFrame({"__lamindb_record_name__": ["x"], "score": [1.0]}) non_type_record = ln.Record(name="from-df-non-type").save() unnamed_type = ln.Record(name="from-df-temp-type", is_type=True) unnamed_type.name = None with pytest.raises(ValueError, match="is_type=True"): ln.Record.from_dataframe(df, type=non_type_record) with pytest.raises(ValueError, match="non-null `name`"): ln.Record.from_dataframe(df, type=unnamed_type) non_type_record.delete(permanent=True) def test_record_from_dataframe_with_string_type_creates_import_type(): score = ln.Feature(name="from-df-str-score", dtype=float).save() df = pd.DataFrame( { "__lamindb_record_name__": ["from-df-str-a", "from-df-str-b"], "from-df-str-score": [11.0, 12.0], } ) imports_type = ln.Record.filter(uid=IMPORTS_UID).one_or_none() original_imports_name = None if imports_type is not None: original_imports_name = imports_type.name imports_type.name = "from-df-renamed-imports-parent" imports_type.save() try: records = ln.Record.from_dataframe(df, type="from-df-str-type") created_type = ln.Record.get(name="from-df-str-type", is_type=True) imports_type = ln.Record.get(uid=IMPORTS_UID) assert len(records) == 2 assert records.type.id == created_type.id assert created_type.type_id == imports_type.id assert created_type.schema.type is not None assert created_type.schema.type.uid == SCHEMA_IMPORTS_UID assert created_type.schema_id is not None records.save() assert ( ln.Record.get(name="from-df-str-a").features.get_values()[ "from-df-str-score" ] == 11.0 ) finally: created_type = ln.Record.filter( name="from-df-str-type", is_type=True ).one_or_none() ln.Record.filter(name__in=["from-df-str-a", "from-df-str-b"]).delete( permanent=True ) ln.Record.filter(name="from-df-str-type").delete(permanent=True) if created_type is not None and created_type.schema_id is not None: ln.Schema.filter(id=created_type.schema_id).delete(permanent=True) if original_imports_name is not None: imports_type = ln.Record.get(uid=IMPORTS_UID) imports_type.name = original_imports_name imports_type.save() score.delete(permanent=True) def test_record_from_dataframe_with_string_type_duplicate_name_errors(): score = ln.Feature(name="from-df-dup-score", dtype=float).save() schema = ln.Schema([score], name="from-df-dup-schema").save() imports_type = ln.Record.filter(uid=IMPORTS_UID).one_or_none() if imports_type is None: imports_type = 
ln.Record(name="Imports", is_type=True) imports_type.uid = IMPORTS_UID imports_type = imports_type.save() ln.Record( name="from-df-dup-type", is_type=True, schema=schema, type=imports_type ).save() df = pd.DataFrame( { "__lamindb_record_name__": ["from-df-dup-a"], "from-df-dup-score": [21.0], } ) with pytest.raises(ValueError, match="already exists"): ln.Record.from_dataframe(df, type="from-df-dup-type") ln.Record.filter(name="from-df-dup-type").delete(permanent=True) schema.delete(permanent=True) score.delete(permanent=True) def test_feature_manager_raise_not_validated_values(): from lamindb.models._feature_manager import FeatureManager assert FeatureManager._raise_not_validated_values({}) is None with pytest.raises(ln.errors.ValidationError) as error: FeatureManager._raise_not_validated_values( { "Record": ("name", ["missing-record"]), "bionty.Gene": ("symbol", ["missing-gene"]), } ) message = str(error.value) assert "These values could not be validated" in message assert ( "records = ln.Record.from_values(['missing-record'], field='name', create=True).save()" in message ) assert ( "records = bionty.Gene.from_values(['missing-gene'], field='symbol').save()" in message ) def test_name_lookup(): my_type = ln.Record(name="MyType", is_type=True).save() label1 = ln.Record(name="label 1", type=my_type).save() label2 = ln.Record(name="label 1", type=my_type) assert label2 == label1 label2 = ln.Record(name="label 1") assert label2 != label1 label2.save() label3 = ln.Record(name="label 1") assert label3 == label2 label2.delete(permanent=True) label1.delete(permanent=True) my_type.delete(permanent=True) @pytest.mark.skipif( os.getenv("LAMINDB_TEST_DB_VENDOR") == "sqlite", reason="Postgres-only" ) def test_invalid_type_record_with_schema(): schema = ln.Schema(name="test_schema", itype=ln.Feature).save() record_type_with_schema = ln.Record( name="TypeWithSchema", is_type=True, schema=schema ).save() with pytest.raises(IntegrityError) as error: ln.Record(name="InvalidType", is_type=True, type=record_type_with_schema).save() assert "record_type_is_valid_fk" in error.exconly() record_type_with_schema.delete(permanent=True) schema.delete(permanent=True) # see test_artifact_features_add_remove_query in test_artifact_external_features_annotations.py for similar test for Artifacts (populate and query by features) def test_record_features_add_remove_values(): record_type1 = ln.Record(name="RecordType1", is_type=True).save() record_entity1 = ln.Record(name="entity1", type=record_type1).save() record_entity2 = ln.Record(name="entity2", type=record_type1).save() ulabel = ln.ULabel(name="test-ulabel").save() artifact = ln.Artifact(".gitignore", key="test-artifact").save() collection = ln.Collection(artifact, key="test-collection").save() transform = ln.Transform(key="test-transform").save() run = ln.Run(transform, name="test-run").save() feature_bool = ln.Feature(name="feature_bool", dtype=bool).save() feature_str = ln.Feature(name="feature_str", dtype=str).save() feature_list_str = ln.Feature(name="feature_list_str", dtype=list[str]).save() feature_int = ln.Feature(name="feature_int", dtype=int).save() feature_list_int = ln.Feature(name="feature_list_int", dtype=list[int]).save() feature_float = ln.Feature(name="feature_float", dtype=float).save() feature_list_float = ln.Feature(name="feature_list_float", dtype=list[float]).save() feature_num = ln.Feature(name="feature_num", dtype="num").save() feature_url = ln.Feature(name="feature_url", dtype="url").save() feature_list_num = 
ln.Feature(name="feature_list_num", dtype="list[num]").save() feature_datetime = ln.Feature(name="feature_datetime", dtype=datetime).save() feature_date = ln.Feature(name="feature_date", dtype=datetime.date).save() feature_dict = ln.Feature(name="feature_dict", dtype=dict).save() feature_type1 = ln.Feature(name="feature_type1", dtype=record_type1).save() feature_type1s = ln.Feature(name="feature_type1s", dtype=list[record_type1]).save() feature_user = ln.Feature(name="feature_user", dtype=ln.User).save() feature_ulabel = ln.Feature(name="feature_ulabel", dtype=ln.ULabel).save() feature_project = ln.Feature(name="feature_project", dtype=ln.Project).save() feature_artifact = ln.Feature(name="feature_artifact", dtype=ln.Artifact).save() feature_collection = ln.Feature( name="feature_collection", dtype=ln.Collection ).save() feature_run = ln.Feature(name="feature_run", dtype=ln.Run.uid).save() feature_cell_line = ln.Feature(name="feature_cell_line", dtype=bt.CellLine).save() feature_cell_lines = ln.Feature( name="feature_cell_lines", dtype=list[bt.CellLine] ).save() feature_cl_ontology_id = ln.Feature( name="feature_cl_ontology_id", dtype=bt.CellLine.ontology_id ).save() feature_gene = ln.Feature(name="feature_gene", dtype=bt.Gene).save() test_record = ln.Record(name="test_record").save() test_project = ln.Project(name="test_project").save() hek293 = bt.CellLine.from_source(name="HEK293").save() a549 = bt.CellLine.from_source(name="A-549").save() tmem276 = bt.Gene.from_source(symbol="Tmem276", organism="mouse").save() # test feature.dtype_as_object assert feature_bool.dtype_as_object is bool assert feature_str.dtype_as_object is str assert feature_list_str.dtype_as_object == list[str] assert feature_int.dtype_as_object is int assert feature_list_int.dtype_as_object == list[int] assert feature_float.dtype_as_object is float assert feature_list_float.dtype_as_object == list[float] assert feature_num.dtype_as_object is float assert feature_url.dtype_as_object is str assert feature_list_num.dtype_as_object == list[float] assert feature_datetime.dtype_as_object == datetime assert feature_date.dtype_as_object == date assert feature_dict.dtype_as_object is dict assert feature_type1.dtype_as_object == record_type1 assert feature_type1s.dtype_as_object == list[record_type1] assert feature_user.dtype_as_object == ln.User.handle assert feature_ulabel.dtype_as_object == ln.ULabel.name assert feature_project.dtype_as_object == ln.Project.name assert feature_artifact.dtype_as_object == ln.Artifact.key assert feature_collection.dtype_as_object == ln.Collection.key assert feature_run.dtype_as_object == ln.Run.uid assert feature_cell_line.dtype_as_object == bt.CellLine.name assert feature_cell_lines.dtype_as_object == list[bt.CellLine.name] assert feature_cl_ontology_id.dtype_as_object == bt.CellLine.ontology_id assert feature_gene.dtype_as_object == bt.Gene.symbol # no schema validation test_values = { "feature_bool": True, "feature_str": "00810702-0006", # this string value could be cast to datetime! don't change! 
"feature_list_str": ["a", "list", "of", "strings"], "feature_int": 42, "feature_list_int": [1, 2, 3], "feature_num": 3.14, "feature_url": "https://lamin.ai/docs", "feature_list_num": [2.71, 3.14, 1.61], "feature_float": 3.14, "feature_list_float": [2.71, 3.14, 1.61], "feature_datetime": datetime(2024, 1, 1, 12, 0, 0), "feature_date": date(2024, 1, 1), "feature_dict": {"key": "value", "number": 123, "list": [1, 2, 3]}, "feature_type1": "entity1", "feature_type1s": ["entity1", "entity2"], "feature_ulabel": "test-ulabel", "feature_user": ln.setup.settings.user.handle, "feature_project": "test_project", "feature_cell_line": "HEK293", "feature_cell_lines": ["HEK293", "A-549"], "feature_gene": "Tmem276", "feature_cl_ontology_id": "CVCL_0045", "feature_artifact": "test-artifact", "feature_collection": "test-collection", "feature_run": run.uid, } test_record.features.add_values(test_values) assert test_record.features.get_values() == test_values # --- Query by features (same data as above) --- # Equality assert ln.Record.filter(feature_str=test_values["feature_str"]).one() == test_record assert ln.Record.filter(feature_int=42).one() == test_record assert ln.Record.filter(feature_type1="entity1").one() == test_record assert ln.Record.filter(feature_cell_line="HEK293").one() == test_record assert ln.Record.filter(feature_url="https://lamin.ai/docs").one() == test_record assert ( ln.Record.filter(feature_str=test_values["feature_str"], feature_int=42).one() == test_record ) # Datetime and date (filter uses ISO strings as stored in JSON) assert ln.Record.filter(feature_datetime="2024-01-01T12:00:00").one() == test_record assert ln.Record.filter(feature_date="2024-01-01").one() == test_record # __contains (categorical) assert ln.Record.filter(feature_cell_line__contains="HEK").one() == test_record assert ln.Record.filter(feature_type1__contains="entity").one() == test_record # Invalid field with pytest.raises(ln.errors.InvalidArgument) as error: ln.Record.filter(feature_str_typo="x", feature_int=42).one() assert error.exconly().startswith( "lamindb.errors.InvalidArgument: You can query either by available fields:" ) # DoesNotExist (no Record named "nonexistent_entity" exists) with pytest.raises(ln.errors.ObjectDoesNotExist) as error: ln.Record.filter(feature_type1="nonexistent_entity").one() assert "Did not find" in error.exconly() # Combined filter (3 keys) assert ( ln.Record.filter( feature_str=test_values["feature_str"], feature_int=42, feature_type1="entity1", ).one() == test_record ) # Bionty: filter by record assert ln.Record.filter(feature_cell_line=hek293).one() == test_record # Bionty: filter by ontology_id string assert ln.Record.filter(feature_cl_ontology_id="CVCL_0045").one() == test_record # Bionty __contains (ontology_id) assert ( ln.Record.filter(feature_cl_ontology_id__contains="0045").one() == test_record ) # DoesNotExist (Record not found: feature_project) with pytest.raises(ln.errors.ObjectDoesNotExist) as error: ln.Record.filter(feature_project="nonexistent_project").one() assert "Did not find" in error.exconly() # __contains returns multiple (add second record, assert, then remove) value_record = ln.Record(name="query_test_value_record").save() value_record.features.add_values({"feature_type1": "entity2"}) assert len(ln.Record.filter(feature_type1__contains="entity")) == 2 value_record.features.remove_values("feature_type1") value_record.delete(permanent=True) # Numeric comparators __lt, __gt (int, float, num) assert ln.Record.filter(feature_int__lt=21).one_or_none() is None assert 
    len(ln.Record.filter(feature_int__gt=21)) >= 1
    # int __lt/__gt that would fail with string comparison (42 vs 5, 42 vs 100)
    assert ln.Record.filter(feature_int__lt=5).one_or_none() is None
    assert ln.Record.filter(feature_int__gt=100).one_or_none() is None
    # float/num __lt/__gt (numeric comparison on SQLite via json_extract + CAST)
    assert ln.Record.filter(feature_float__lt=5.0).one() == test_record
    assert ln.Record.filter(feature_float__gt=1.0).one() == test_record
    assert ln.Record.filter(feature_float__gt=10.0).one_or_none() is None
    assert ln.Record.filter(feature_num__lt=5.0).one() == test_record
    assert ln.Record.filter(feature_num__gt=1.0).one() == test_record
    assert ln.Record.filter(feature_num__gt=10.0).one_or_none() is None
    # Date and datetime comparators (ISO strings)
    assert ln.Record.filter(feature_date__lt="2024-01-02").one() == test_record
    assert ln.Record.filter(feature_date__gt="2023-12-31").one() == test_record
    assert ln.Record.filter(feature_date__gt="2024-01-02").one_or_none() is None
    assert (
        ln.Record.filter(feature_datetime__lt="2024-01-01T13:00:00").one()
        == test_record
    )
    assert (
        ln.Record.filter(feature_datetime__gt="2024-01-01T11:00:00").one()
        == test_record
    )
    assert (
        ln.Record.filter(feature_datetime__lt="2024-01-01T11:00:00").one_or_none()
        is None
    )
    # ManyToMany accessors
    assert set(test_record.linked_records.to_list()) == {record_entity1, record_entity2}
    assert test_record.linked_in_records.count() == 0
    assert set(record_entity1.linked_in_records.to_list()) == {test_record}
    assert set(record_entity2.linked_in_records.to_list()) == {test_record}
    assert record_entity1.linked_records.count() == 0
    assert record_entity2.linked_records.count() == 0
    # all empty sheet
    schema = ln.Schema(
        [
            feature_bool,
            feature_str,
            feature_int,
            feature_list_str,
            feature_list_int,
            feature_num,
            feature_url,
            feature_float,
            feature_list_float,
            feature_list_num,
            feature_datetime,
            feature_date,
            feature_dict,
            feature_type1,
            feature_type1s,
            feature_ulabel,
            feature_user,
            feature_project,
            feature_cell_line,
            feature_cell_lines,
            feature_cl_ontology_id,
            feature_gene,
            feature_artifact,
            feature_collection,
            feature_run,
        ],
        name="test_schema",
    ).save()
    sheet = ln.Record(name="Sheet", is_type=True, schema=schema).save()
    empty_record = ln.Record(name="empty_record", type=sheet).save()
    df_empty = sheet.to_dataframe()
    assert df_empty["feature_bool"].isnull().all()
    assert df_empty["feature_bool"].dtype.name == "boolean"
    assert df_empty["feature_str"].isnull().all()
    assert df_empty["feature_str"].dtype.name == "string"
    assert df_empty["feature_int"].isnull().all()
    assert df_empty["feature_int"].dtype.name == "Int64"
    assert df_empty["feature_float"].isnull().all()
    assert df_empty["feature_float"].dtype.name == "float64"
    assert df_empty["feature_num"].isnull().all()
    assert df_empty["feature_num"].dtype.name == "float64"
    assert df_empty["feature_url"].isnull().all()
    assert df_empty["feature_url"].dtype.name == "string"
    assert df_empty["feature_list_str"].isnull().all()
    assert df_empty["feature_list_str"].dtype.name == "object"
    assert df_empty["feature_list_int"].isnull().all()
    assert df_empty["feature_list_int"].dtype.name == "object"
    assert df_empty["feature_datetime"].isnull().all()
    assert df_empty["feature_datetime"].dtype.name == "datetime64[ns]"
    assert df_empty["feature_date"].isnull().all()
    assert df_empty["feature_date"].dtype.name == "object"
    assert df_empty["feature_dict"].isnull().all()
    assert df_empty["feature_dict"].dtype.name == "object"
    assert df_empty["feature_type1"].isnull().all()
    assert
df_empty["feature_type1"].dtype.name == "category" assert df_empty["feature_type1s"].isnull().all() assert df_empty["feature_type1s"].dtype.name == "object" assert df_empty["feature_ulabel"].isnull().all() assert df_empty["feature_ulabel"].dtype.name == "category" assert df_empty["feature_user"].isnull().all() assert df_empty["feature_user"].dtype.name == "category" assert df_empty["feature_project"].isnull().all() assert df_empty["feature_project"].dtype.name == "category" assert df_empty["feature_cell_line"].isnull().all() assert df_empty["feature_cell_line"].dtype.name == "category" assert df_empty["feature_cell_lines"].isnull().all() assert df_empty["feature_cell_lines"].dtype.name == "object" assert df_empty["feature_cl_ontology_id"].isnull().all() assert df_empty["feature_cl_ontology_id"].dtype.name == "category" assert df_empty["feature_artifact"].isnull().all() assert df_empty["feature_artifact"].dtype.name == "category" assert df_empty["feature_collection"].isnull().all() assert df_empty["feature_collection"].dtype.name == "category" assert df_empty["feature_run"].isnull().all() assert df_empty["feature_run"].dtype.name == "category" # remove empty record from sheet empty_record.type = None empty_record.save() # sheet with values test_record.type = sheet test_record.save() df = sheet.to_dataframe() target_result = { "feature_bool": True, "feature_str": "00810702-0006", # this string value could be cast to datetime! "feature_list_str": ["a", "list", "of", "strings"], "feature_int": 42, "feature_list_int": [1, 2, 3], "feature_float": 3.14, "feature_list_float": [2.71, 3.14, 1.61], "feature_num": 3.14, "feature_url": "https://lamin.ai/docs", "feature_list_num": [2.71, 3.14, 1.61], "feature_datetime": pd.Timestamp("2024-01-01 12:00:00"), "feature_date": date(2024, 1, 1), "feature_dict": {"key": "value", "list": [1, 2, 3], "number": 123}, "feature_type1": "entity1", "feature_ulabel": "test-ulabel", "feature_user": ln.setup.settings.user.handle, "feature_project": "test_project", "feature_cell_line": "HEK293", "feature_cl_ontology_id": "CVCL_0045", "feature_gene": "Tmem276", "feature_artifact": "test-artifact", "feature_collection": "test-collection", "feature_run": run.uid, "__lamindb_record_uid__": test_record.uid, "__lamindb_record_name__": "test_record", } result = df.to_dict(orient="records")[0] # need to handle categorical lists differently because # we don't yet respect ordering result_feature_type1s = result.pop("feature_type1s") assert set(result_feature_type1s) == {"entity1", "entity2"} assert isinstance(result_feature_type1s, list) result_feature_cell_lines = result.pop("feature_cell_lines") assert set(result_feature_cell_lines) == {"HEK293", "A-549"} assert isinstance(result_feature_cell_lines, list) assert result == target_result # export to artifact to trigger validation -- this will raise many errors if anything is inconsistent sheet_as_artifact = sheet.to_artifact() # could devise a test for get_values or features.describe() # but this is extensively tested elsewhere # print(sheet_as_artifact.features.get_values()) # assert sheet_as_artifact.features.get_values() sheet_as_artifact.delete(permanent=True) # add the empty record back to the sheet and export again empty_record.type = sheet empty_record.save() df = sheet.to_dataframe() sheet_as_artifact = sheet.to_artifact() sheet_as_artifact.delete(permanent=True) # test passing ISO-format date string for date test_record2 = ln.Record(name="test_record").save() # we could also test different ways of formatting but don't yet 
do that # in to_dataframe() we enforce ISO format already feature_date = ln.Feature.get(name="feature_date") feature_date.coerce = True # have to allow coercion because we're passing a string feature_date.save() test_values["feature_date"] = "2024-01-02" test_record2.features.add_values(test_values) test_record2.type = sheet test_record2.save() test_values["feature_date"] = date(2024, 1, 2) assert test_record2.features.get_values() == test_values assert test_record.features.get_values() != test_values # also test export to artifact again sheet_as_artifact = sheet.to_artifact() sheet_as_artifact.delete(permanent=True) test_record2.delete(permanent=True) empty_record.delete(permanent=True) # test move a value into the trash record_entity1.delete() test_values.pop("feature_type1") test_values["feature_type1s"] = ["entity2"] test_values["feature_date"] = date(2024, 1, 1) assert test_record.features.get_values() == test_values df = sheet.to_dataframe() result = df.to_dict(orient="records")[0] result_feature_type1s = result.pop("feature_type1s") assert set(result_feature_type1s) == {"entity2"} assert isinstance(result_feature_type1s, list) result_feature_cell_lines = result.pop("feature_cell_lines") assert set(result_feature_cell_lines) == {"HEK293", "A-549"} assert isinstance(result_feature_cell_lines, list) target_result.pop("feature_type1") assert pd.isna(result.pop("feature_type1")) assert result == target_result record_entity1.restore() test_values["feature_type1"] = "entity1" test_values["feature_type1s"] = ["entity1", "entity2"] # remove values test_record.features.remove_values("feature_int") test_values.pop("feature_int") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_date") test_values.pop("feature_date") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_type1") test_values.pop("feature_type1") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_type1s") test_values.pop("feature_type1s") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_ulabel") test_values.pop("feature_ulabel") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_cell_line") test_values.pop("feature_cell_line") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_user") test_values.pop("feature_user") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_artifact") test_values.pop("feature_artifact") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_collection") test_values.pop("feature_collection") assert test_record.features.get_values() == test_values test_record.features.remove_values("feature_run") test_values.pop("feature_run") assert test_record.features.get_values() == test_values # test passing None has no effect, does not lead to annotation sheet.schema = None sheet.save() schema.delete(permanent=True) test_record.features.add_values({"feature_int": None, "feature_type1": None}) assert test_record.features.get_values() == test_values # schema validation feature_str = ln.Feature.get(name="feature_str") feature_int = ln.Feature.get(name="feature_int") schema = ln.Schema([feature_str, feature_int], name="test_schema").save() test_form = ln.Record(name="TestForm", is_type=True, schema=schema).save() test_record_in_form = 
ln.Record(name="test_record_in_form", type=test_form).save() with pytest.raises(ln.errors.ValidationError) as error: test_record_in_form.features.add_values({"feature_type1": "entity1"}) assert "COLUMN_NOT_IN_DATAFRAME" in error.exconly() test_record_in_form.delete(permanent=True) test_form.delete(permanent=True) schema.delete(permanent=True) # test with list of strings schema = ln.Schema([feature_cell_lines], name="test_schema2").save() test_form = ln.Record(name="TestForm", is_type=True, schema=schema).save() test_record_in_form = ln.Record(name="test_record_in_form", type=test_form).save() test_record_in_form.features.add_values({"feature_cell_lines": ["HEK293", "A-549"]}) test_record_in_form.delete(permanent=True) test_form.delete(permanent=True) schema.delete(permanent=True) # test with list of records (rather than passing strings) schema = ln.Schema([feature_cell_lines], name="test_schema2").save() test_form = ln.Record(name="TestForm", is_type=True, schema=schema).save() test_record_in_form = ln.Record(name="test_record_in_form", type=test_form).save() test_record_in_form.features.add_values({"feature_cell_lines": [a549, hek293]}) test_record_in_form.delete(permanent=True) test_form.delete(permanent=True) schema.delete(permanent=True) # clean up rest test_record_id = test_record.id assert ln.models.RecordJson.filter(record_id=test_record_id).count() > 0 test_record.delete(permanent=True) # test CASCADE deletion of RecordJson assert ln.models.RecordJson.filter(record_id=test_record_id).count() == 0 sheet.delete(permanent=True) feature_str.delete(permanent=True) feature_list_str.delete(permanent=True) feature_int.delete(permanent=True) feature_list_int.delete(permanent=True) feature_datetime.delete(permanent=True) feature_date.delete(permanent=True) feature_type1.delete(permanent=True) feature_type1s.delete(permanent=True) feature_ulabel.delete(permanent=True) feature_user.delete(permanent=True) feature_project.delete(permanent=True) feature_dict.delete(permanent=True) feature_artifact.delete(permanent=True) feature_run.delete(permanent=True) feature_cell_lines.delete(permanent=True) record_entity1.delete(permanent=True) record_entity2.delete(permanent=True) record_type1.delete(permanent=True) test_project.delete(permanent=True) feature_cell_line.delete(permanent=True) feature_cl_ontology_id.delete(permanent=True) feature_collection.delete(permanent=True) feature_gene.delete(permanent=True) hek293.delete(permanent=True) a549.delete(permanent=True) tmem276.delete(permanent=True) ulabel.delete(permanent=True) collection.delete(permanent=True) artifact.delete(permanent=True) run.delete(permanent=True) transform.delete(permanent=True) feature_num.delete(permanent=True) feature_url.delete(permanent=True) def test_date_and_datetime_corruption(): feature_datetime = ln.Feature( name="feature_datetime", dtype=datetime, coerce=True ).save() feature_date = ln.Feature( name="feature_date", dtype=datetime.date, coerce=True ).save() schema = ln.Schema( [feature_datetime, feature_date], name="test_schema_date_datetime" ).save() test_sheet = ln.Record(name="TestSheet", is_type=True).save() record = ln.Record(name="test_record", type=test_sheet).save() # pass values with Z suffix test_values = { "feature_datetime": "2024-01-01T12:00:00Z", "feature_date": "2025-01-17", } record.features.add_values(test_values) date_value = ln.models.RecordJson.get(record=record, feature=feature_date) # manually corrupt the value date_value.value = "2025-01-17T00:00:00.000Z" date_value.save() assert 
    record.features.get_values() == {
        "feature_datetime": pd.Timestamp("2024-01-01 12:00:00", tz="UTC"),
        "feature_date": date(2025, 1, 17),
    }
    record.schema = schema
    record.save()
    df = test_sheet.to_dataframe()
    result = df.to_dict(orient="records")[0]
    # because in a dataframe we'll hit pandera and pandera expects naive
    # timestamps, to_dataframe() converts to naive by removing timezone info
    assert result["feature_datetime"] == pd.Timestamp("2024-01-01 12:00:00")
    assert result["feature_date"] == date(2025, 1, 17)
    record.delete(permanent=True)
    test_sheet.delete(permanent=True)
    schema.delete(permanent=True)
    feature_datetime.delete(permanent=True)
    feature_date.delete(permanent=True)


def test_only_list_type_features_and_field_qualifiers():
    # this test is necessary because the logic for adding link tables
    # to the query previously only fired when a non-list cat feature of the same type was present
    feature_cell_lines = ln.Feature(
        name="feature_cell_lines", dtype=list[bt.CellLine]
    ).save()
    feature_list_ontology_id = ln.Feature(
        name="feature_list_ontology_id", dtype=list[bt.Tissue.ontology_id]
    ).save()
    schema = ln.Schema(
        [feature_cell_lines, feature_list_ontology_id], name="test_schema2"
    ).save()
    # create a feature with the same name to test robustness w.r.t. this
    feature_type = ln.Feature(name="FeatureTypeX", is_type=True).save()
    feature_cell_lines_duplicate = ln.Feature(
        name="feature_cell_lines", dtype=bt.CellLine, type=feature_type
    ).save()
    test_sheet = ln.Record(name="TestSheet", is_type=True, schema=schema).save()
    record = ln.Record(name="test_record", type=test_sheet).save()
    hek293 = bt.CellLine.from_source(name="HEK293").save()
    a549 = bt.CellLine.from_source(name="A-549").save()
    uberon2369 = bt.Tissue.from_source(ontology_id="UBERON:0002369").save()
    uberon5172 = bt.Tissue.from_source(ontology_id="UBERON:0005172").save()
    test_values = {
        "feature_cell_lines": ["HEK293", "A-549"],
        "feature_list_ontology_id": ["UBERON:0002369", "UBERON:0005172"],
    }
    record.features.add_values(test_values)
    assert record.features.get_values() == test_values
    df = test_sheet.to_dataframe()
    result = df.to_dict(orient="records")[0]
    assert isinstance(result["feature_cell_lines"], list)
    assert isinstance(result["feature_list_ontology_id"], list)
    assert set(result["feature_cell_lines"]) == {"HEK293", "A-549"}
    assert set(result["feature_list_ontology_id"]) == {
        "UBERON:0002369",
        "UBERON:0005172",
    }
    # add another record
    record2 = ln.Record(name="test_record2", type=test_sheet).save()
    test_values2 = {
        "feature_cell_lines": ["HEK293"],
        "feature_list_ontology_id": ["UBERON:0005172"],
    }
    record2.features.add_values(test_values2)
    # trigger validation of the case that has two and a single record
    # this tests type casting in list-like values
    artifact = test_sheet.to_artifact()
    assert (
        len(artifact.schemas.first().members) == 2
    )  # this requires top most match filtering during validation
    record.delete(permanent=True)
    record2.delete(permanent=True)
    test_sheet.delete(permanent=True)
    inferred_schema = artifact.schemas.first()
    artifact.delete(permanent=True)
    inferred_schema.delete(permanent=True)
    schema.delete(permanent=True)
    feature_cell_lines.delete(permanent=True)
    feature_cell_lines_duplicate.delete(permanent=True)
    feature_type.delete(permanent=True)
    hek293.delete(permanent=True)
    a549.delete(permanent=True)
    uberon2369.delete(permanent=True)
    uberon5172.delete(permanent=True)


def test_record_feature_predicate_query():
    age = ln.Feature(name="pred_record_age", dtype=int).save()
    record_type = ln.Record(name="PredRecordType", is_type=True).save()
    record_a = ln.Record(name="pred_record_a", type=record_type).save()
    record_b = ln.Record(name="pred_record_b", type=record_type).save()
    record_a.features.add_values({"pred_record_age": 42})
    record_b.features.add_values({"pred_record_age": 10})
    assert ln.Record.filter(age > 40).one() == record_a
    assert ln.Record.filter(age <= 10).one() == record_b
    neq_results = ln.Record.filter(age != 42)
    assert record_b in neq_results
    assert record_a not in neq_results
    record_a.delete(permanent=True)
    record_b.delete(permanent=True)
    record_type.delete(permanent=True)
    age.delete(permanent=True)


def test_record_features_accept_feature_object_keys():
    feature_score = ln.Feature(name="record_feature_object_score", dtype=int).save()
    feature_tag = ln.Feature(name="record_feature_object_tag", dtype=str).save()
    record = ln.Record(name="record_feature_object_test").save()
    record.features.add_values({feature_score: 7, "record_feature_object_tag": "a"})
    assert record.features.get_values() == {
        "record_feature_object_score": 7,
        "record_feature_object_tag": "a",
    }
    # set_values should also accept Feature objects as dictionary keys.
    record.features.set_values({feature_tag: "b"})
    assert record.features.get_values() == {"record_feature_object_tag": "b"}
    record.features.add_values({feature_score: 9})
    assert record.features.get_values() == {
        "record_feature_object_score": 9,
        "record_feature_object_tag": "b",
    }
    # remove_values supports dictionary inputs with Feature keys.
    record.features.remove_values({feature_score: 9, feature_tag: None})
    assert record.features.get_values() == {}
    record.delete(permanent=True)
    feature_score.delete(permanent=True)
    feature_tag.delete(permanent=True)


================================================
FILE: tests/core/test_record_sheet_examples.py
================================================
import lamindb as ln
import pandas as pd

from lamindb.examples.fixtures.sheets import (
    populate_nextflow_sheet_with_samples,  # noqa: F401
    populate_sheets_compound_treatment,  # noqa: F401
)


def test_float_int_casting():
    # this test is only needed for as long as we let JS write data into RecordJson
    # for JS a 3 is a valid float even though any python json parser interprets it as an int
    feature_int = ln.Feature(name="feature_int", dtype=int).save()
    feature_float = ln.Feature(name="feature_float", dtype=float).save()
    test_schema = ln.Schema([feature_int, feature_float], name="test_schema").save()
    sheet = ln.Record(name="TestSheet", is_type=True, schema=test_schema).save()
    record = ln.Record(name="test_record", type=sheet).save()
    record.features.add_values({"feature_int": 5, "feature_float": 3.0})
    record_json = ln.models.RecordJson.get(record=record, feature=feature_float)
    record_json.value = 3
    record_json.save()
    df = sheet.to_dataframe()
    assert df["feature_int"].dtype.name == "int64"
    assert df["feature_float"].dtype.name == "float64"
    # this export call would error if we didn't have type casting
    artifact = sheet.to_artifact()
    related_schemas = list(artifact.schemas.all())
    artifact.schemas.clear()
    artifact.delete(permanent=True)
    record.delete(permanent=True)
    sheet.delete(permanent=True)
    for schema in related_schemas:
        schema.delete(permanent=True)
    # schema.delete(permanent=True), not necessary because already deleted above
    feature_float.delete(permanent=True)
    feature_int.delete(permanent=True)


def test_record_example_compound_treatment(
    populate_sheets_compound_treatment: tuple[ln.Record, ln.Record],  # noqa: F811
):
    treatments_sheet, sample_sheet1 = populate_sheets_compound_treatment
    dictionary
= ( ln.Record.filter(type=treatments_sheet) .to_dataframe()[["is_type", "name"]] .to_dict(orient="list") ) assert dictionary == { "is_type": [ False, False, ], "name": [ "treatment2", "treatment1", ], } dictionary = ( ln.Record.filter(type=treatments_sheet) .to_dataframe(features=True)[["compound", "concentration", "name"]] .to_dict(orient="list") ) assert dictionary == { "compound": [ "drug2", "drug1", ], "concentration": [ "4nM", "2nM", ], "name": [ "treatment2", "treatment1", ], } dictionary = ( ln.Record.filter(type=sample_sheet1) .to_dataframe(features=["cell_line", "treatment"])[ ["cell_line", "__lamindb_record_name__", "treatment"] ] .to_dict(orient="list") ) assert dictionary == { "cell_line": [ "HEK293T", "HEK293T", ], "__lamindb_record_name__": [ "sample2", "sample1", ], "treatment": [ "treatment2", "treatment1", ], } assert sample_sheet1.input_of_runs.count() == 0 df = sample_sheet1.to_dataframe() assert sample_sheet1.input_of_runs.count() == 1 assert df.index.name == "__lamindb_record_id__" dictionary = df[ [ "id", # a feature "uid", # a feature "name", # a feature "cell_line", "treatment", "preparation_date", "__lamindb_record_name__", ] ].to_dict(orient="list") assert dictionary == { "id": [1, 2], "uid": ["S1", "S2"], "name": ["Sample 1", "Sample 2"], "cell_line": [ "HEK293T", "HEK293T", ], "preparation_date": [ pd.to_datetime("2025-06-01T05:00:00"), pd.to_datetime("2025-06-01T06:00:00"), ], "treatment": [ "treatment1", "treatment2", ], "__lamindb_record_name__": [ "sample1", "sample2", ], } artifact = sample_sheet1.to_artifact() assert sample_sheet1.schema.members.to_list("name") == [ "id", "uid", "name", "treatment", "cell_line", "preparation_date", "project", ] assert artifact.run.input_records.count() == 1 assert artifact.transform.kind == "function" assert artifact.transform.key == "__lamindb_record_export__" # looks something like this: # id,uid,name,treatment,cell_line,preparation_date,__lamindb_record_uid__,__lamindb_record_name__ # 1,S1,Sample 1,treatment1,HEK293T,2025-06-01 05:00:00,iCwgKgZELoLtIoGy,sample1 # 2,S2,Sample 2,treatment2,HEK293T,2025-06-01 06:00:00,qvU9m7VF6fSdsqJs,sample2 assert len(artifact.load()) == 2 # two rows in the dataframe assert artifact.path.read_text().startswith("""\ id,uid,name,treatment,cell_line,preparation_date,project,__lamindb_record_uid__,__lamindb_record_name__ 1,S1,Sample 1,treatment1,HEK293T,2025-06-01 05:00:00,Project 1""") assert artifact.key == f"sheet_exports/{sample_sheet1.name}.csv" assert artifact.description.startswith(f"Export of sheet {sample_sheet1.uid}") assert artifact._state.adding is False assert ln.models.ArtifactRecord.filter(artifact=artifact).count() == 2 assert artifact.features.describe(return_str=True).endswith("""\ └── Dataset features └── columns (7) cell_line bionty.CellLine HEK293T id int name str preparation_date datetime project Project Project 1 treatment Record[Treatment] treatment1, treatment2 uid str""") # re-run the export which triggers hash lookup sample_sheet1.to_artifact() # soft-delete a record in the sheet sample_sheet1.records.first().delete() assert ln.Record.filter(type=sample_sheet1).count() == 1 df = sample_sheet1.to_dataframe() print(df) assert len(df) == 1 # one row in the dataframe artifact.delete(permanent=True) def test_nextflow_sheet_with_samples( populate_nextflow_sheet_with_samples: ln.Record, # noqa: F811 ): """Test the example fixture for nextflow sheet with samples.""" # This test is to ensure that the fixture works as expected # and that the data is correctly populated in 
the database. nextflow_sheet = populate_nextflow_sheet_with_samples df = nextflow_sheet.to_dataframe() assert df[ ["expected_cells", "fastq_1", "fastq_2", "sample", "__lamindb_record_name__"] ].to_dict(orient="list") == { "expected_cells": [ 5000, 5000, 5000, ], "fastq_1": [ "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R1_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R1_001.fastq.gz", ], "fastq_2": [ "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R2_001.fastq.gz", "https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R2_001.fastq.gz", ], "__lamindb_record_name__": [ None, None, None, ], "sample": [ "Sample_X", "Sample_Y", "Sample_Y", ], } assert nextflow_sheet.schema is not None artifact = nextflow_sheet.to_artifact() assert artifact.schema is nextflow_sheet.schema assert artifact._state.adding is False assert set(nextflow_sheet.schema.members.to_list("name")) == { "sample", "fastq_1", "fastq_2", "expected_cells", "seq_center", } assert set(artifact.features.slots["columns"].members.to_list("name")) == { "sample", "fastq_1", "fastq_2", "expected_cells", "seq_center", } assert artifact.path.read_text().startswith("""\ sample,fastq_1,fastq_2,expected_cells,seq_center,__lamindb_record_uid__,__lamindb_record_name__ Sample_X,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz,5000,,""") assert artifact.features.describe(return_str=True).endswith("""\ └── Dataset features └── columns (5) expected_cells int fastq_1 str fastq_2 str sample Record[BioSample] Sample_X, Sample_Y seq_center str""") related_schemas = list(artifact.schemas.all()) artifact.schemas.clear() for schema in related_schemas: schema.delete(permanent=True) artifact.delete(permanent=True) def test_record_soft_deleted_recreate(): """Test that a soft-deleted record can be recreated with changes.""" # testing soft delete and recreate with sqlite (postgres is tested in core/test_delete.py) # soft delete a record, then recreate it with some changes record = ln.Record(name="test_record").save() uid = record.uid assert record.branch_id == 1 record.delete() assert record.branch_id == -1 # now recreate the same record with the same uid but a different name record = ln.Record(name="test_record 2") record.uid = uid record.save() # now this record is recovered from the trash assert record.branch_id == 1 assert record.name == "test_record 2" ln.Record.objects.filter().delete() def test_annotate_with_user_feature(): """Test that annotating with a user feature works as expected.""" user_feature = ln.Feature(name="created_by", dtype=ln.User).save() schema = ln.Schema( name="test_schema_user_feature", features=[user_feature], coerce=True, ).save() sheet = ln.Record(name="A sheet with users", is_type=True, schema=schema).save() record = ln.Record(name="first user", type=sheet).save() user = ln.User(uid="abcdefgh", handle="test-user").save() ln.models.RecordUser(record=record, feature=user_feature, 
value=user).save() df = sheet.to_dataframe() assert df.index.name == "__lamindb_record_id__" assert df.columns.to_list() == [ "created_by", "__lamindb_record_uid__", "__lamindb_record_name__", ] assert df.iloc[0]["created_by"] == "test-user" # clean up record.type = None record.save() record.delete(permanent=True) sheet.delete(permanent=True) schema.delete(permanent=True) user_feature.delete(permanent=True) user.delete(permanent=True) def test_to_artifact_exports_all_records(): # create sheet with >100 records, the default limit for to_dataframe sheet = ln.Record(name="LargeSheet", is_type=True).save() for i in range(101): ln.Record(name=f"record_{i}", type=sheet).save() df = sheet.to_dataframe() assert len(df) == 101, f"Expected 101 records, got {len(df)}" sheet.records.all().delete(permanent=True) sheet.delete(permanent=True) def test_to_artifact_with_required_non_nullable_data_id_maximal_set_true(): feature_data_id = ln.Feature(name="data_id", dtype=str, nullable=False).save() schema = ln.Schema( [feature_data_id], name="schema_with_required_data_id", maximal_set=True, ).save() sheet = ln.Record(name="SheetWithDataId", is_type=True, schema=schema).save() # Name is intentionally omitted to mirror sheet records in real-world pipelines. record = ln.Record(type=sheet).save() record.features.add_values({"data_id": "D1"}) artifact = sheet.to_artifact() df = artifact.load() assert "data_id" in df.columns assert df["data_id"].to_list() == ["D1"] assert "__lamindb_record_name__" in df.columns assert df["__lamindb_record_name__"].isna().all() # clean up record.delete(permanent=True) sheet.delete(permanent=True) artifact.delete(permanent=True) schema.delete(permanent=True) feature_data_id.delete(permanent=True) ================================================ FILE: tests/core/test_rename_features_labels.py ================================================ import datetime import os import lamindb as ln import pandas as pd import pytest def test_rename_feature(ccaplog): df = pd.DataFrame({"old_name": [1, 2]}) ln.Feature(name="old_name", dtype=int).save() artifact = ln.Artifact.from_dataframe( df, key="test.parquet", schema="valid_features" ).save() feature = ln.Feature.get(name="old_name") # First rename feature.name = "new_name" feature.save() now1 = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0) assert ( "by renaming feature from 'old_name' to 'new_name' 1 artifact no longer matches the feature name in storage:" in ccaplog.text ) if os.getenv("LAMINDB_TEST_DB_VENDOR") != "sqlite": feature.refresh_from_db() assert feature.synonyms == "old_name" assert feature._aux["renamed"] == { now1.isoformat().replace("+00:00", "Z"): "old_name" } # Second rename feature.name = "newer_name" feature.save() now2 = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0) assert ( "by renaming feature from 'new_name' to 'newer_name' 1 artifact no longer matches the feature name in storage:" in ccaplog.text ) if os.getenv("LAMINDB_TEST_DB_VENDOR") != "sqlite": feature.refresh_from_db() assert feature.synonyms == "old_name|new_name" assert feature._aux["renamed"] == { now1.isoformat().replace("+00:00", "Z"): "old_name", now2.isoformat().replace("+00:00", "Z"): "new_name", } schema = artifact.schemas.first() artifact.delete(permanent=True) schema.delete(permanent=True) feature.delete(permanent=True) @pytest.mark.parametrize("model_class", [ln.ULabel, ln.Record]) def test_rename_label(model_class, ccaplog): df = pd.DataFrame( { "feature1": pd.Categorical(["label1", "label2"]), "feature2": 
pd.Categorical(["label2", "label2"]), } ) label1 = model_class(name="label1").save() label2 = model_class(name="label2").save() feature1 = ln.Feature(name="feature1", dtype=model_class).save() feature2 = ln.Feature(name="feature2", dtype=model_class).save() artifact = ln.Artifact.from_dataframe( df, key="test.parquet", schema="valid_features" ).save() label = model_class.get(name="label1") label.name = "label-renamed" label.save() assert ( "by renaming label from 'label1' to 'label-renamed' 1 artifact no longer matches the label name in storage:" in ccaplog.text ) schema = artifact.schemas.first() artifact.delete(permanent=True) schema.delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) label1.delete(permanent=True) label2.delete(permanent=True) ================================================ FILE: tests/core/test_run.py ================================================ import time import lamindb as ln import pytest def test_run(): with pytest.raises(ValueError) as error: ln.Run(1, 2) assert error.exconly() == "ValueError: Only one non-keyword arg allowed: transform" with pytest.raises(TypeError) as error: ln.Run() assert error.exconly() == "TypeError: Pass transform parameter" transform = ln.Transform(key="my_transform") with pytest.raises(ValueError) as error: ln.Run(transform) assert ( error.exconly() == "ValueError: Please save transform record before creating a run" ) transform.save() run = ln.Run(transform).save() assert run.status == "scheduled" assert run.reference is None assert run.reference_type is None run2 = ln.Run(transform, reference="test1", reference_type="test2").save() assert run2.reference == "test1" assert run2.reference_type == "test2" assert run.uid != run2.uid run.delete(permanent=True) report_artifact = ln.Artifact( "README.md", kind="__lamindb_run__", description="report of run2" ).save() run2.report = report_artifact environment = ln.Artifact( "CONTRIBUTING.md", kind="__lamindb_run__", description="requirements.txt" ).save() run2.environment = environment run2.save() # report/env artifacts will be cleaned up in background subprocess run2.delete(permanent=True) assert ln.Run.filter(uid=run2.uid).count() == 0 # report/env are still present in the database assert ln.Artifact.filter(uid=report_artifact.uid).count() == 1 assert ln.Artifact.filter(uid=environment.uid).count() == 1 transform.delete(permanent=True) assert ln.Run.filter(uid=run.uid).count() == 0 # wait for background cleanup subprocess to delete artifacts time.sleep(4) assert ln.Artifact.filter(uid=report_artifact.uid).count() == 0 assert ln.Artifact.filter(uid=environment.uid).count() == 0 def test_bulk_permanent_run_delete(tmp_path): transform = ln.Transform(key="Bulk run delete transform").save() n_runs = 2 report_files = [tmp_path / f"report_{i}.txt" for i in range(n_runs)] for i, path in enumerate(report_files): path.write_text(f"content {i}") report_artifacts = [ ln.Artifact(path, kind="__lamindb_run__", description=f"report {i}").save() for i, path in enumerate(report_files) ] runs = [ln.Run(transform, report=af).save() for af in report_artifacts] run_ids = [r.id for r in runs] ln.settings.verbosity = "debug" ln.Run.filter(id__in=run_ids).order_by("created_at").delete(permanent=True) assert ln.Run.filter(id__in=run_ids).count() == 0 assert ln.Artifact.filter(uid=report_artifacts[0].uid).count() == 1 transform.delete(permanent=True) # wait for background cleanup subprocess to delete artifacts time.sleep(4) assert 
ln.Artifact.filter(uid=report_artifacts[0].uid).count() == 0 clean_up_logs = ln.setup.settings.cache_dir / f"run_cleanup_logs_{runs[0].uid}.txt" assert f"deleted artifact {report_artifacts[0].id}" in clean_up_logs.read_text() ================================================ FILE: tests/core/test_save.py ================================================ # ruff: noqa: F811 import lamindb as ln import pytest from _dataset_fixtures import ( # noqa get_mini_csv, ) from lamindb.models.save import prepare_error_message, store_artifacts def test_bulk_save_and_update(): label_names = [f"Record {i} new" for i in range(3)] labels = [ln.Record(name=name) for name in label_names] # test bulk creation of new records ln.save(labels) assert len(ln.Record.filter(name__in=label_names).distinct()) == 3 labels[0].name = "Record 0 updated" # test bulk update of existing records ln.save(labels) assert len(ln.Record.filter(name__in=label_names).distinct()) == 2 assert ln.Record.get(name="Record 0 updated") def test_prepare_error_message(get_mini_csv): artifact = ln.Artifact(get_mini_csv, description="test") exception = Exception("exception") error = prepare_error_message([], [artifact], exception) assert error.startswith( "The following entries have been successfully uploaded and committed to the database" ) error = prepare_error_message([artifact], [], exception) assert error.startswith("No entries were uploaded or committed to the database") def test_save_data_object(get_mini_csv): artifact = ln.Artifact(get_mini_csv, description="test") artifact.save() assert artifact.path.exists() artifact.delete(permanent=True, storage=True) def test_store_artifacts_acid(get_mini_csv): artifact = ln.Artifact(get_mini_csv, description="test") artifact._clear_storagekey = "test.csv" # errors on check_and_attempt_clearing with pytest.raises(FileNotFoundError): artifact.save() with pytest.raises(RuntimeError) as error: store_artifacts([artifact], using_key=None) assert str(error.exconly()).startswith( "RuntimeError: The following entries have been successfully uploaded" ) artifact.delete(permanent=True) def test_save_parents(): import bionty as bt bt.CellType.from_values(["B cell", "T cell"]).save() assert bt.CellType.get(name="B cell").parents.to_dataframe().shape[0] == 1 bt.CellType.filter().delete(permanent=True) def test_save_batch_size(): label_names = [f"Record {i} batch_size" for i in range(3)] labels = [ln.Record(name=name) for name in label_names] # test bulk creation of new records with batch size ln.save(labels, batch_size=2) assert ln.Record.filter(name__in=label_names).distinct().count() == 3 def test_bulk_save_lazy_record_features(): cell_type = ln.Record(name="lazy-cell-type", is_type=True).save() ln.Record(name="lazy-b-cell", type=cell_type).save() ln.Record(name="lazy-t-cell", type=cell_type).save() score_feature = ln.Feature(name="lazy-bulk-score", dtype=float).save() cell_feature = ln.Feature(name="lazy-bulk-cell", dtype=cell_type).save() schema = ln.Schema([score_feature, cell_feature], name="lazy-bulk-schema").save() sheet = ln.Record(name="lazy-sheet", is_type=True, schema=schema).save() records = [ ln.Record( name="lazy-sample-1", type=sheet, features={"lazy-bulk-score": 0.1, "lazy-bulk-cell": "lazy-b-cell"}, ), ln.Record( name="lazy-sample-2", type=sheet, features={"lazy-bulk-score": 0.2, "lazy-bulk-cell": "lazy-t-cell"}, ), ] ln.save(records) sample_1 = ln.Record.get(name="lazy-sample-1") sample_2 = ln.Record.get(name="lazy-sample-2") sample_1_values = sample_1.features.get_values() sample_2_values = 
sample_2.features.get_values() assert sample_1_values["lazy-bulk-score"] == 0.1 assert sample_2_values["lazy-bulk-score"] == 0.2 assert sample_1_values["lazy-bulk-cell"] == "lazy-b-cell" assert sample_2_values["lazy-bulk-cell"] == "lazy-t-cell" assert not hasattr(records[0], "_features") assert not hasattr(records[1], "_features") ln.Record.filter(name__in=["lazy-sample-1", "lazy-sample-2"]).delete(permanent=True) ln.Record.filter(name="lazy-sheet").delete(permanent=True) ln.Record.filter(name__in=["lazy-b-cell", "lazy-t-cell"]).delete(permanent=True) ln.Record.filter(name="lazy-cell-type").delete(permanent=True) schema.delete(permanent=True) score_feature.delete(permanent=True) cell_feature.delete(permanent=True) def test_bulk_save_lazy_record_features_requires_same_schema(): feature_a = ln.Feature(name="lazy-schema-a", dtype=float).save() feature_b = ln.Feature(name="lazy-schema-b", dtype=float).save() schema_a = ln.Schema([feature_a], name="lazy-schema-a").save() schema_b = ln.Schema([feature_b], name="lazy-schema-b").save() type_a = ln.Record(name="lazy-type-a", is_type=True, schema=schema_a).save() type_b = ln.Record(name="lazy-type-b", is_type=True, schema=schema_b).save() records = [ ln.Record(name="lazy-mixed-1", type=type_a, features={"lazy-schema-a": 1.0}), ln.Record(name="lazy-mixed-2", type=type_b, features={"lazy-schema-b": 2.0}), ] with pytest.raises( ln.errors.ValidationError, match="same type schema", ): ln.save(records) ln.Record.filter(name__in=["lazy-mixed-1", "lazy-mixed-2"]).delete(permanent=True) ln.Record.filter(name__in=["lazy-type-a", "lazy-type-b"]).delete(permanent=True) schema_a.delete(permanent=True) schema_b.delete(permanent=True) feature_a.delete(permanent=True) feature_b.delete(permanent=True) def test_bulk_save_lazy_record_features_requires_schema(): unschematized_type = ln.Record(name="lazy-no-schema-type", is_type=True).save() records = [ ln.Record( name="lazy-no-schema-1", type=unschematized_type, features={"foo": 1.0} ) ] with pytest.raises( ln.errors.ValidationError, match="same non-null type schema", ): ln.save(records) ln.Record.filter(name="lazy-no-schema-1").delete(permanent=True) ln.Record.filter(name="lazy-no-schema-type").delete(permanent=True) def test_bulk_resave_trashed_records(): import bionty as bt # first create records from public source records = bt.Ethnicity.from_values(["asian", "white"]).save() assert len(records) == 2 # parents are also created ethnicities = bt.Ethnicity.filter() assert ethnicities.count() > 2 # soft delete the records including parent ethnicities.delete() # then create them again from public source # the new records will now have the same uids as they are hashed from the ontology_ids assert bt.Ethnicity.filter().count() == 0 new_records = bt.Ethnicity.from_values(["asian", "white", "african"]) assert new_records[0].branch_id == 1 assert new_records[0].uid == records[0].uid # after saving, the trashed records should be restored new_records.save() assert new_records[0].branch_id == 1 ethnicities = bt.Ethnicity.filter() # the parent should also be restored assert ethnicities.count() > 3 # clean up ethnicities.delete(permanent=True) ================================================ FILE: tests/core/test_schema.py ================================================ import bionty as bt import lamindb as ln import pandas as pd import pytest from django.db.utils import IntegrityError from lamindb.errors import FieldValidationError, InvalidArgument, ValidationError from lamindb.models.schema import get_related_name, 
validate_features @pytest.fixture(scope="module") def df(): return pd.DataFrame( { "feat1": [1, 2, 3], "feat2": [3, 4, 5], "feat3": ["cond1", "cond2", "cond2"], "feat4": ["id1", "id2", "id3"], } ) def test_schema_from_values(): gene_symbols = ["TCF7", "MYC"] bt.Gene.filter(symbol__in=gene_symbols).delete(permanent=True) with pytest.raises(ValidationError) as error: schema = ln.Schema.from_values( gene_symbols, bt.Gene.symbol, dtype=int, organism="human" ) assert error.exconly().startswith( "lamindb.errors.ValidationError: These values could not be validated:" ) ln.save(bt.Gene.from_values(gene_symbols, "symbol", organism="human")) schema = ln.Schema.from_values(gene_symbols, bt.Gene.symbol, organism="human") # below should be a queryset and not a list assert set(schema.members) == set( bt.Gene.from_values(gene_symbols, "symbol", organism="human") ) assert schema.dtype == "num" # this is NUMBER_TYPE schema = ln.Schema.from_values( gene_symbols, bt.Gene.symbol, dtype=int, organism="human" ) assert schema._state.adding assert schema.dtype == "int" assert schema.itype == "bionty.Gene" schema.save() assert set(schema.members) == set(schema.genes.all()) id = schema.id # test that the schema is retrieved from the database # in case it already exists schema = ln.Schema.from_values( gene_symbols, bt.Gene.symbol, dtype=int, organism="human" ) assert not schema._state.adding assert id == schema.id schema.delete(permanent=True) # edge cases with pytest.raises(ValueError): schema = ln.Schema.from_values([]) with pytest.raises(TypeError): ln.Schema.from_values(["a"], field="name") with pytest.raises(ValidationError): schema = ln.Schema.from_values( ["weird_name"], field=ln.Feature.name, dtype="float" ) def test_schema_from_records(df): features = ln.Feature.from_dataframe(df) with pytest.raises(ValueError) as error: schema = ln.Schema(features) assert ( error.exconly() == "ValueError: Can only construct feature sets from validated features" ) ln.save(features) schema = ln.Schema(features) assert schema.id is None assert schema._state.adding assert schema.dtype is None assert schema.itype == "Feature" schema.save() # test that the schema is retrieved from the database # in case it already exists schema = ln.Schema(features) assert not schema._state.adding assert schema.id is not None schema.delete(permanent=True) # edge case with pytest.raises(ValueError): positional_arg = 1 ln.Schema(features, positional_arg) def test_schema_from_df(df): # test using type human = bt.Organism.from_source(name="human").save() genes = [bt.Gene(symbol=name, organism=human) for name in df.columns] ln.save(genes) with pytest.raises(ValueError) as error: ln.Schema.from_dataframe(df, field=bt.Gene.symbol) assert error.exconly().startswith("ValueError: data types are heterogeneous:") schema = ln.Schema.from_dataframe(df[["feat1", "feat2"]], field=bt.Gene.symbol) for gene in genes: gene.delete(permanent=True) # now for the features registry features = ln.Feature.from_dataframe(df) ln.save(features) schema = ln.Schema.from_dataframe(df).save() assert schema.dtype is None ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) def test_get_related_name(): with pytest.raises(ValueError): get_related_name(ln.Transform) def test_validate_features(): with pytest.raises(ValueError): validate_features([]) with pytest.raises(TypeError): validate_features(["feature"]) with pytest.raises(TypeError): validate_features({"feature"}) transform = ln.Transform(key="test").save() # This is just a type check with 
pytest.raises(TypeError) as error: validate_features([transform, ln.Run(transform)]) assert error.exconly() == "TypeError: schema can only contain a single type" transform.delete(permanent=True) def test_kwargs(): with pytest.raises(FieldValidationError): ln.Schema(x="1", features=[]) def test_edge_cases(): feature = ln.Feature(name="rna", dtype="float") ln.save([feature]) with pytest.raises(ValueError) as error: ln.Schema(feature) assert ( error.exconly() == "ValueError: Please pass a ListLike of features, not a single feature" ) feature.delete(permanent=True) @pytest.fixture(scope="module") def mini_immuno_schema_flexible(): schema = ln.examples.datasets.mini_immuno.define_mini_immuno_schema_flexible() yield schema ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) bt.Gene.filter().delete(permanent=True) ln.Record.filter(type__isnull=False).delete(permanent=True) ln.Record.filter().delete(permanent=True) bt.CellType.filter().delete(permanent=True) def test_schema_update_implicit_through_name_equality( mini_immuno_schema_flexible: ln.Schema, ccaplog, ): df = pd.DataFrame({"a": [1]}) artifact = ln.Artifact.from_dataframe(df, key="test_artifact.parquet").save() artifact.schema = mini_immuno_schema_flexible artifact.save() orig_hash = mini_immuno_schema_flexible.hash warning_message = "you updated the schema hash and might invalidate datasets that were previously validated with this schema:" # different numbers of features ------------------------------------------- schema = ln.Schema( name="Mini immuno schema", features=[ ln.Feature.get(name="perturbation"), ln.Feature.get(name="donor"), ], ).save() assert schema.hash != orig_hash assert ccaplog.text.count(warning_message) == 1 # change is flexible (an auxiliary field) -------------------------------- schema = ln.Schema( name="Mini immuno schema", features=[ ln.Feature.get(name="perturbation"), ln.Feature.get(name="cell_type_by_model"), ln.Feature.get(name="assay_oid"), ln.Feature.get(name="donor"), ln.Feature.get(name="concentration"), ln.Feature.get(name="treatment_time_h"), ], flexible=True, ).save() assert schema.hash == orig_hash # restored original hash assert ccaplog.text.count(warning_message) == 2 # warning raised schema = ln.Schema( name="Mini immuno schema", features=[ ln.Feature.get(name="perturbation"), ln.Feature.get(name="cell_type_by_model"), ln.Feature.get(name="assay_oid"), ln.Feature.get(name="donor"), ln.Feature.get(name="concentration"), ln.Feature.get(name="treatment_time_h"), ], flexible=False, ).save() assert schema.hash != orig_hash assert ccaplog.text.count(warning_message) == 3 # warning raised ln.examples.datasets.mini_immuno.define_mini_immuno_schema_flexible() artifact.delete(permanent=True) # restore original hash -------------------------------- schema = ln.Schema( name="Mini immuno schema", features=[ ln.Feature.get(name="perturbation"), ln.Feature.get(name="cell_type_by_model"), ln.Feature.get(name="assay_oid"), ln.Feature.get(name="donor"), ln.Feature.get(name="concentration"), ln.Feature.get(name="treatment_time_h"), ], flexible=True, ).save() assert schema.hash == orig_hash # restored original hash def test_schema_update( mini_immuno_schema_flexible: ln.Schema, ccaplog, ): df = pd.DataFrame({"a": [1]}) artifact = ln.Artifact.from_dataframe(df, key="test_artifact.parquet").save() artifact.schema = mini_immuno_schema_flexible artifact.save() # store original hash orig_hash = mini_immuno_schema_flexible.hash warning_message = "you updated the schema hash and might 
invalidate datasets that were previously validated with this schema:" # add a feature ------------------------------------------- feature_to_add = ln.Feature(name="sample_note", dtype=str).save() assert mini_immuno_schema_flexible.n_members == 6 mini_immuno_schema_flexible.features.add(feature_to_add) mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash != orig_hash assert mini_immuno_schema_flexible.n_members == 7 assert ccaplog.text.count(warning_message) == 1 # remove the feature again mini_immuno_schema_flexible.features.remove(feature_to_add) mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash == orig_hash assert ccaplog.text.count(warning_message) == 2 assert mini_immuno_schema_flexible.n_members == 6 feature_to_add.delete(permanent=True) # change is flexible (an auxiliary field) -------------------------------- assert mini_immuno_schema_flexible.flexible mini_immuno_schema_flexible.flexible = False mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash != orig_hash assert ccaplog.text.count(warning_message) == 3 # restore original setting mini_immuno_schema_flexible.flexible = True mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash == orig_hash assert ccaplog.text.count(warning_message) == 4 # change coerce (formerly auxiliary field, now Django field) -------------------------------- assert not mini_immuno_schema_flexible.coerce mini_immuno_schema_flexible.coerce = True mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash != orig_hash assert ccaplog.text.count(warning_message) == 5 # restore original setting mini_immuno_schema_flexible.coerce = False mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash == orig_hash assert ccaplog.text.count(warning_message) == 6 # add an index -------------------------------- index_feature = ln.Feature(name="immuno_sample", dtype=str).save() mini_immuno_schema_flexible.index = index_feature mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash != orig_hash assert mini_immuno_schema_flexible.n_members == 7 assert ccaplog.text.count(warning_message) == 7 # remove the index mini_immuno_schema_flexible.index = None mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.n_members == 6 assert mini_immuno_schema_flexible.hash == orig_hash assert ccaplog.text.count(warning_message) == 8 index_feature.delete(permanent=True) # make a feature optional -------------------------------- required_feature = mini_immuno_schema_flexible.features.first() mini_immuno_schema_flexible.optionals.add(required_feature) mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash != orig_hash assert ccaplog.text.count(warning_message) == 9 # make it required again mini_immuno_schema_flexible.optionals.remove(required_feature) mini_immuno_schema_flexible.save() assert mini_immuno_schema_flexible.hash == orig_hash assert ccaplog.text.count(warning_message) == 10 artifact.delete(permanent=True) def test_schema_mutations_feature_removal( mini_immuno_schema_flexible: ln.Schema, ccaplog ): feature1 = ln.Feature.get(name="perturbation") feature2 = ln.Feature.get(name="cell_type_by_model") dummy_artifact = ln.Artifact(".gitignore", key=".gitignore").save() # define the schema the first time schema = ln.Schema(name="My test schema X", features=[feature1, feature2]).save() assert schema.features.count() == 2 dummy_artifact.schema = schema # pretend artifact was validated with this schema 
dummy_artifact.save()
    # define the schema a second time, with fewer features
    schema1 = ln.Schema(name="My test schema X", features=[feature2]).save()
    # retrieves same schema because of name equality
    assert ccaplog.text.count("you're removing these features:") == 1
    assert (
        ccaplog.text.count("you updated the schema hash and might invalidate datasets")
        == 1
    )
    assert schema1 == schema
    assert schema1.features.count() == 1
    dummy_artifact.delete(permanent=True)
    schema.delete(permanent=True)


def test_schema_add_remove_optional_features(mini_immuno_schema_flexible: ln.Schema):
    schema = mini_immuno_schema_flexible
    initial_hash = schema.hash
    feature_project = ln.Feature(name="project", dtype=ln.Project).save()
    schema.add_optional_features([feature_project])
    assert schema.hash != initial_hash
    schema.remove_optional_features([feature_project])
    assert schema.hash == initial_hash


def test_schema_components(mini_immuno_schema_flexible: ln.Schema):
    obs_schema = mini_immuno_schema_flexible
    var_schema = ln.Schema(
        name="scRNA_seq_var_schema",
        itype=bt.Gene.ensembl_gene_id,
        dtype="num",
    ).save()
    # test recreation of schema based on name lookup
    var_schema2 = ln.Schema(
        name="scRNA_seq_var_schema",
        itype=bt.Gene.ensembl_gene_id,
        dtype="num",
    ).save()
    assert var_schema == var_schema2
    with pytest.raises(InvalidArgument) as error:
        ln.Schema(
            name="mini_immuno_anndata_schema",
            slots={"obs": obs_schema, "var": var_schema},
        ).save()
    assert str(error.value) == "Please pass otype != None for composite schemas"
    anndata_schema = ln.Schema(
        name="mini_immuno_anndata_schema",
        otype="AnnData",
        slots={"obs": obs_schema, "var": var_schema},
    ).save()
    var_schema2 = ln.Schema(
        name="symbol_var_schema",
        itype=bt.Gene.symbol,
        dtype="num",
    ).save()
    # try adding another schema under slot "var"
    # we want to trigger the unique constraint on slot
    with pytest.raises(IntegrityError) as error:
        anndata_schema.components.add(  # type: ignore
            var_schema2, through_defaults={"slot": "var"}
        )
    assert "unique" in str(error.value).lower()
    anndata_schema.delete(permanent=True)
    var_schema2.delete(permanent=True)
    var_schema.delete(permanent=True)


def test_mini_immuno_schema_flexible(mini_immuno_schema_flexible):
    schema = ln.Schema(
        name="Mini immuno schema",
        features=[
            ln.Feature.get(name="perturbation"),
            ln.Feature.get(name="cell_type_by_model"),
            ln.Feature.get(name="assay_oid"),
            ln.Feature.get(name="donor"),
            ln.Feature.get(name="concentration"),
            ln.Feature.get(name="treatment_time_h"),
        ],
        flexible=True,  # _additional_ columns in a dataframe are validated & annotated
    )
    assert schema.name == "Mini immuno schema"
    assert schema.itype == "Feature"
    assert (
        schema._list_for_hashing[:6]
        == [
            "b=Feature",
            "c=True",
            "d=False",
            "e=False",
            "f=True",
            "h=6",
            "j=HASH_OF_FEATURE_UIDS",  # this last hash is not deterministic in a unit test
        ][:6]
    )


def test_schema_recovery_based_on_hash(mini_immuno_schema_flexible: ln.Schema):
    feature1 = ln.Feature.get(name="perturbation")
    feature2 = ln.Feature.get(name="cell_type_by_model")
    schema = ln.Schema(features=[feature1, feature2]).save()
    schema2 = ln.Schema(features=[feature1, feature2])
    assert schema == schema2
    schema.delete()
    schema2 = ln.Schema(features=[feature1, feature2])
    assert schema != schema2
    schema.delete(permanent=True)


def test_schemas_dataframe():
    # test on the Python level after record creation -- no saving!
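    # the assertions below inspect `_list_for_hashing`, the ordered component list
    # from which the schema hash is derived; identical definitions therefore yield
    # identical hashes, e.g. (sketch, not executed here):
    #     s1 = ln.Schema(name="valid_features", itype=ln.Feature)
    #     s2 = ln.Schema(name="valid_features", itype=ln.Feature)
    #     assert s1.hash == s2.hash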
schema = ln.Schema(name="valid_features", itype=ln.Feature) assert schema.name == "valid_features" assert schema.itype == "Feature" assert schema._list_for_hashing == [ "b=Feature", "c=True", "d=False", "e=False", ] assert schema.hash == "kMi7B_N88uu-YnbTLDU-DA" # test the convenience function schema = ln.examples.schemas.valid_features() assert schema.uid == "0000000000000000" assert schema.name == "valid_features" assert schema.itype == "Feature" assert schema.hash == "kMi7B_N88uu-YnbTLDU-DA" def test_schemas_anndata(): # test on the Python level after record creation -- no saving! obs_schema = ln.examples.schemas.valid_features() varT_schema = ln.Schema( name="valid_ensembl_gene_ids", itype=bt.Gene.ensembl_gene_id ) assert varT_schema._list_for_hashing == [ "a=num", "b=bionty.Gene.ensembl_gene_id", "c=True", "d=False", "e=False", ] assert varT_schema.name == "valid_ensembl_gene_ids" assert varT_schema.itype == "bionty.Gene.ensembl_gene_id" assert varT_schema.hash == "1gocc_TJ1RU2bMwDRK-WUA" schema = ln.Schema( name="anndata_ensembl_gene_ids_and_valid_features_in_obs", otype="AnnData", slots={"obs": obs_schema, "var.T": varT_schema.save()}, ) assert schema._list_for_hashing == [ "a=num", "c=True", "d=False", "e=False", "l=GPZ-TzvKRhdC1PQAhlFiow", ] assert schema.name == "anndata_ensembl_gene_ids_and_valid_features_in_obs" assert schema.itype is None assert schema.hash == "aqGWHvyY49W_PHELUMiBMw" # test the convenience function schema = ln.examples.schemas.anndata_ensembl_gene_ids_and_valid_features_in_obs() assert schema.uid == "0000000000000002" assert schema.name == "anndata_ensembl_gene_ids_and_valid_features_in_obs" assert schema.itype is None assert schema.hash == "aqGWHvyY49W_PHELUMiBMw" varT_schema = schema.slots["var.T"] assert varT_schema.uid == "0000000000000001" assert varT_schema.name == "valid_ensembl_gene_ids" assert varT_schema.itype == "bionty.Gene.ensembl_gene_id" assert varT_schema.hash == "1gocc_TJ1RU2bMwDRK-WUA" schema.delete(permanent=True) def test_schema_already_saved_aux(): """When attempting to save a Schema that was already saved before which populated `_aux` fields, we expect the Schema to be returned with the same `_aux` fields. 
Test for https://github.com/laminlabs/lamindb/issues/2887 """ var_schema = ln.Schema( name="test var", index=ln.Feature( name="var_index", dtype=bt.Gene.ensembl_gene_id, cat_filters={ "source": bt.Source.get( entity="bionty.Gene", currently_used=True, organism="human" ) }, ).save(), itype=ln.Feature, dtype="DataFrame", minimal_set=True, coerce=True, ).save() schema = ln.Schema( name="AnnData schema", otype="AnnData", minimal_set=True, coerce=True, slots={"var": var_schema}, ).save() # _aux["af"] now only contains key "3" (index_feature_uid) since coerce and flexible are Django fields assert len(schema.slots["var"]._aux["af"].keys()) == 1 assert "3" in schema.slots["var"]._aux["af"] # index_feature_uid # coerce and flexible are now proper Django fields assert schema.slots["var"].coerce is True assert schema.slots["var"].flexible is False # Attempting to save the same schema again should return the Schema with the same fields var_schema_2 = ln.Schema( name="test var", index=ln.Feature( name="var_index", dtype=bt.Gene.ensembl_gene_id, cat_filters={ "source": bt.Source.get( entity="bionty.Gene", currently_used=True, organism="human" ) }, ).save(), itype=ln.Feature, dtype="DataFrame", minimal_set=True, coerce=True, ).save() schema_2 = ln.Schema( name="AnnData schema", otype="AnnData", minimal_set=True, coerce=True, slots={"var": var_schema_2}, ).save() assert len(schema.slots["var"]._aux["af"].keys()) == 1 assert schema.slots["var"]._aux == schema_2.slots["var"]._aux assert schema.slots["var"].coerce == schema_2.slots["var"].coerce assert schema.slots["var"].flexible == schema_2.slots["var"].flexible schema_2.delete(permanent=True) schema.delete(permanent=True) def test_schema_not_saved_describe(): schema = ln.Schema(name="NotSavedSchema", is_type=True) with pytest.raises(ValueError) as e: schema.describe() assert "Schema must be saved before describing" in str(e.value) def test_schema_is_type(): Sample = ln.Schema(name="Sample", is_type=True).save() assert Sample.hash is None BioSample = ln.Schema(name="BioSample", is_type=True, type=Sample).save() assert BioSample.hash is None assert BioSample.type == Sample assert BioSample.is_type # create a schema without any features or slots or itype or is_type=True with pytest.raises(InvalidArgument) as e: ln.Schema(name="TechSample", type=Sample) assert "Please pass features or slots or itype or set is_type=True" in str(e.value) # clean up BioSample.delete(permanent=True) Sample.delete(permanent=True) # see test_component_composite in test_transform.py def test_composite_component(): composite = ln.Schema(name="composite", itype=ln.Feature).save() component1 = ln.Schema(name="component1", itype=bt.CellType).save() component2 = ln.Schema(name="component2", itype=bt.CellMarker).save() composite.components.add(component1, through_defaults={"slot": "slot1"}) composite.components.add(component2, through_defaults={"slot": "slot2"}) assert len(composite.components.all()) == 2 assert composite.links_component.count() == 2 assert set(composite.links_component.all().to_list("slot")) == {"slot1", "slot2"} assert composite.links_component.first().composite == composite assert composite.composites.count() == 0 assert composite.links_composite.count() == 0 ln.models.SchemaComponent.filter(composite=composite).delete(permanent=True) link = ln.models.SchemaComponent( composite=composite, component=component1, slot="var" ).save() assert link in composite.links_component.all() assert link in component1.links_composite.all() assert link.slot == "var" 
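    # cleanup below removes composite and components; the final count check asserts
    # that the SchemaComponent link rows disappear along with them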
composite.delete(permanent=True) component1.delete(permanent=True) component2.delete(permanent=True) assert ln.models.SchemaComponent.filter().count() == 0 def test_schema_describe_bracket_names(): """Feature names with brackets like 'characteristics[organism]' must appear verbatim in describe output. Regression test for Rich interpreting '[...]' as markup tags and swallowing bracket content. """ features = [ ln.Feature(name="source name", dtype="str").save(), ln.Feature(name="characteristics[organism]", dtype="str").save(), ln.Feature(name="characteristics[disease]", dtype="str").save(), ln.Feature(name="comment[instrument]", dtype="str").save(), ] schema = ln.Schema(features, name="test_brackets").save() result = schema.describe(return_str=True) assert "characteristics[organism]" in result assert "characteristics[disease]" in result assert "comment[instrument]" in result schema.delete(permanent=True) for feature in features: feature.delete(permanent=True) ================================================ FILE: tests/core/test_search.py ================================================ import bionty as bt import lamindb as ln import pytest @pytest.fixture(scope="module") def prepare_cell_type_registry(): bt.CellType.filter().delete(permanent=True) records = [ { "ontology_id": "CL:0000084", "name": "T cell", "synonyms": "T-cell|T-lymphocyte|T lymphocyte", "children": ["CL:0000798", "CL:0002420", "CL:0002419", "CL:0000789"], }, { "ontology_id": "CL:0000236", "name": "B cell", "synonyms": "B-lymphocyte|B lymphocyte|B-cell", "children": ["CL:0009114", "CL:0001201"], }, { "ontology_id": "CL:0000696", "name": "PP cell", "synonyms": "type F enteroendocrine cell", "children": ["CL:0002680"], }, { "ontology_id": "CL:0002072", "name": "nodal myocyte", "synonyms": "P cell|myocytus nodalis|cardiac pacemaker cell", "children": ["CL:1000409", "CL:1000410"], }, ] public_records = [] for ref_record in records: record = bt.CellType.from_source(ontology_id=ref_record["ontology_id"]) assert record.name == ref_record["name"] assert set(record.synonyms.split("|")) == set(ref_record["synonyms"].split("|")) public_records.append(record) ln.save(public_records) yield "prepared" bt.CellType.filter().delete(permanent=True) def test_search_synonyms(prepare_cell_type_registry): result = bt.CellType.search("P cell").to_dataframe() assert set(result.name.iloc[:2]) == {"nodal myocyte", "PP cell"} def test_search_limit(prepare_cell_type_registry): result = bt.CellType.search("P cell", limit=1).to_dataframe() assert len(result) == 1 def test_search_case_sensitive(prepare_cell_type_registry): result = bt.CellType.search("b cell", case_sensitive=False).to_dataframe() assert result.name.iloc[0] == "B cell" def test_search_None(): with pytest.raises( ValueError, match="Cannot search for None value! Please pass a valid string." 
): bt.CellType.search(None) ================================================ FILE: tests/core/test_settings.py ================================================ import lamindb as ln import pytest def test_settings_repr(): repr_str = repr(ln.settings) lines = repr_str.split("\n") assert "Settings" in lines[0] assert all(line.startswith(" ") for line in lines[1:]) content = "\n".join(lines[1:]) assert content.find("instance:") < content.find("storage:") assert content.find("storage:") < content.find("verbosity:") assert content.find("verbosity:") < content.find("track_run_inputs:") def test_storage_setter_raises_on_foreign_managed_storage(tmp_path): storage = ln.Storage(root=(tmp_path / "foreign-managed-storage").as_posix()).save() storage.instance_uid = "_not_exists_" storage.save() with pytest.raises(ValueError) as error: ln.settings.storage = storage.root assert ( error.exconly() == f"ValueError: Storage '{storage.root}' exists in another instance (_not_exists_), cannot write to it from here." ) storage.delete() def test_local_storage_setter_raises_on_foreign_managed_storage(tmp_path): storage = ln.Storage( root=(tmp_path / "foreign-managed-local-storage").as_posix() ).save() storage.instance_uid = "_not_exists_" storage.save() with pytest.raises(ValueError) as error: ln.settings.local_storage = storage.root assert ( error.exconly() == f"ValueError: Storage '{storage.root}' exists in another instance (_not_exists_), cannot write to it from here." ) storage.delete() ================================================ FILE: tests/core/test_sqlrecord.py ================================================ import re import shutil import textwrap from pathlib import Path import bionty as bt import lamindb as ln import pandas as pd import pytest from lamindb.errors import FieldValidationError from lamindb.models.sqlrecord import ( _get_record_kwargs, _search, get_name_field, suggest_records_with_similar_names, ) def test_feature_describe(): description = textwrap.dedent("""\ Feature Simple fields .uid: CharField .name: CharField .unit: CharField .description: TextField .array_rank: SmallIntegerField .array_size: IntegerField .array_shape: JSONField .synonyms: TextField .default_value: JSONField .nullable: BooleanField .coerce: BooleanField .is_type: BooleanField .is_locked: BooleanField .created_at: DateTimeField .updated_at: DateTimeField Relational fields .branch: Branch .created_on: Branch .space: Space .created_by: User .run: Run .type: Feature .schemas: Schema .features: Feature .values: JsonValue .projects: Project .ablocks: FeatureBlock """).strip() assert description == ln.Feature.describe(return_str=True) def test_artifact_describe(): description = textwrap.dedent("""\ Artifact Simple fields .uid: CharField .key: CharField .description: TextField .suffix: CharField .kind: CharField .otype: CharField .size: BigIntegerField .hash: CharField .n_files: BigIntegerField .n_observations: BigIntegerField .version_tag: CharField .is_latest: BooleanField .is_locked: BooleanField .created_at: DateTimeField .updated_at: DateTimeField Relational fields .branch: Branch .created_on: Branch .space: Space .storage: Storage .run: Run .schema: Schema .created_by: User .input_of_runs: Run .recreating_runs: Run .schemas: Schema .json_values: JsonValue .artifacts: Artifact .linked_in_records: Record .users: User .runs: Run .linked_by_runs: Run .ulabels: ULabel .linked_by_artifacts: Artifact .collections: Collection .records: Record .references: Reference .projects: Project .ablocks: ArtifactBlock Bionty fields 
.organisms: bionty.Organism .genes: bionty.Gene .proteins: bionty.Protein .cell_markers: bionty.CellMarker .tissues: bionty.Tissue .cell_types: bionty.CellType .diseases: bionty.Disease .cell_lines: bionty.CellLine .phenotypes: bionty.Phenotype .pathways: bionty.Pathway .experimental_factors: bionty.ExperimentalFactor .developmental_stages: bionty.DevelopmentalStage .ethnicities: bionty.Ethnicity """).strip() assert description == ln.Artifact.describe(return_str=True) def test_repr_describe(): user = ln.User.filter().first() assert user.__repr__().startswith("User") assert user.describe(return_str=True).startswith("User") def test_record_describe_includes_features(): record = ln.Record(name="describe record").save() feature = ln.Feature(name="describe_metric", dtype=float).save() record.features.add_values({"describe_metric": 1.23}) output = record.describe(return_str=True) assert "Features" in output assert "describe_metric" in output assert "1.23" in output record.delete(permanent=True) feature.delete(permanent=True) def test_validate_literal_fields(): # validate literal with pytest.raises(FieldValidationError): ln.Transform(key="new-name-not-existing-123", kind="invalid") def test_init_with_args(): with pytest.raises( FieldValidationError, match=re.escape( "Use keyword arguments instead of positional arguments, e.g.: User(name='...')" ) + r".*", ): # can't use Record here because it raises "Only one non-keyword arg allowed" ln.User("an arg") def test_validate_required_fields(): # ULabel has a required name with pytest.raises(FieldValidationError): ln.ULabel() # ULabel has a required name with pytest.raises(FieldValidationError): ln.ULabel(description="test") @pytest.fixture def get_search_test_filepaths(): Path("unregistered_storage/").mkdir(exist_ok=True) filepaths = [Path(f"./unregistered_storage/test-search{i}.txt") for i in range(6)] for filepath in filepaths: filepath.write_text(filepath.name) yield None shutil.rmtree("unregistered_storage/") def test_search_and_get(get_search_test_filepaths): artifact1 = ln.Artifact( "./unregistered_storage/test-search1.txt", description="nonsense" ) artifact1.save() artifact2 = ln.Artifact( "./unregistered_storage/test-search2.txt", description="nonsense" ) artifact2.save() # on purpose to be search3 to test duplicated search artifact0 = ln.Artifact( "./unregistered_storage/test-search0.txt", description="test-search3" ) artifact0.save() artifact3 = ln.Artifact( "./unregistered_storage/test-search3.txt", description="test-search3" ) artifact3.save() artifact4 = ln.Artifact( "./unregistered_storage/test-search4.txt", description="test-search4" ) artifact4.save() result = ln.Artifact.search("search3").to_dataframe() assert result.iloc[0].description == "test-search3" assert result.iloc[1].description == "test-search3" # no returning entries if all search results have __ratio__ 0 # need a better search string below # assert ln.Artifact.search("x").shape[0] == 0 artifact5 = ln.Artifact( "./unregistered_storage/test-search5.txt", key="test-search5.txt" ) artifact5.save() res = ln.Artifact.search("search5").to_dataframe() assert res.iloc[0].key == "test-search5.txt" res_q = ln.Artifact.search("search5") assert res_q[0].key == "test-search5.txt" # queryset returns the same order of results assert res.uid.tolist() == [i.uid for i in res_q] # multi-field search res = ln.Artifact.search( "txt", field=["key", "description", "suffix"] ).to_dataframe() assert res.iloc[0].suffix == ".txt" # get artifact = ln.Artifact.get(description="test-search4") assert 
artifact == artifact4 with pytest.raises(ln.errors.ObjectDoesNotExist): ln.Artifact.get(description="test-does-not-exist") artifact0.delete(permanent=True, storage=True) artifact1.delete(permanent=True, storage=True) artifact2.delete(permanent=True, storage=True) artifact3.delete(permanent=True, storage=True) artifact4.delete(permanent=True, storage=True) artifact5.delete(permanent=True, storage=True) def test_suggest_similar_names(): record1 = ln.Record(name="Test experiment 1").save() record2 = ln.Record(name="Test experiment 2").save() record3 = ln.Record(name="Special test experiment abc").save() record4 = ln.Record(name="A very special test experiment abc").save() assert ln.Record(name="Test experiment 1").uid == record1.uid assert suggest_records_with_similar_names( record1, "name", {"name": "Test experiment 1"} ) assert not suggest_records_with_similar_names( record2, "name", {"name": "Test experiment 123"} ) queryset = _search( ln.Record, "Test experiment 123", field="name", truncate_string=True, limit=3, ) assert queryset.count() == 3 queryset = _search( ln.Record, "Special test experiment abc", field="name", truncate_string=True, limit=3, ) assert queryset.count() == 2 assert queryset[0].name == "Special test experiment abc" record1.delete(permanent=True) record2.delete(permanent=True) record3.delete(permanent=True) record4.delete(permanent=True) def test_pass_version(): # creating a new transform on key retrieves the same transform # for as long as no source_code was saved transform = ln.Transform(key="mytransform", version="1").save() assert transform.version_tag == "1" assert transform.version == "1" assert ln.Transform(key="mytransform", version="1") == transform # in case source code is saved transform.source_code = "dummy" transform.save() with pytest.raises(ValueError) as e: ln.Transform(key="mytransform", version="1") assert ( e.exconly() == "ValueError: Please change the version tag or leave it `None`, '1' is already taken" ) def test_delete(): record = ln.Record(name="test-delete") # record not yet saved, delete has no effect result = record.delete() assert result is None assert record.branch_id == 1 record.save() result = record.delete() assert result is None assert record.branch_id == -1 result = record.delete(permanent=True) assert isinstance(result, tuple) assert len(result) == 2 deleted_count, deleted_dict = result assert deleted_count == 1 assert isinstance(deleted_dict, dict) assert ln.Record.filter(name="test-delete").exists() is False def test_get_name_field(): transform = ln.Transform(key="test").save() assert get_name_field(ln.Run(transform)) == "started_at" with pytest.raises(ValueError): get_name_field(ln.Artifact.records.through()) transform.delete(permanent=True) def test_using(): # the two below calls error if the records aren't found ln.Artifact.connect("laminlabs/lamin-site-assets").get(1) ln.Artifact.connect("laminlabs/lamin-site-assets").get(uid="MqEaGU7fXvxNy61R0000") # cross-database query hemangioblast = bt.CellType.from_source(name="hemangioblast").save() artifact = ( ln.Artifact.connect("laminlabs/lamin-dev") .filter(cell_types=hemangioblast) .first() ) assert artifact is not None hemangioblast_dev = artifact.cell_types.get(name="hemangioblast") assert hemangioblast_dev.uid == hemangioblast.uid assert hemangioblast_dev.id != hemangioblast.id # query via list artifact_ref = ( ln.Artifact.connect("laminlabs/lamin-dev") .filter(cell_types__in=[hemangioblast]) .first() ) assert artifact == artifact_ref # check that .using provided with the current 
instance does nothing
    assert ln.User.connect("lamindb-unit-tests-core").first()._state.db == "default"
    user = ln.setup.settings.user.handle
    assert (
        ln.User.connect(f"{user}/lamindb-unit-tests-core").first()._state.db
        == "default"
    )


def test_get_record_kwargs():
    assert _get_record_kwargs(ln.Feature) == [
        ("name", "str"),
        ("dtype", "DtypeStr | ULabel | Record | Registry | list[Registry] | FieldAttr"),
        ("type", "Feature | None"),
        ("is_type", "bool"),
        ("unit", "str | None"),
        ("description", "str | None"),
        ("synonyms", "str | None"),
        ("nullable", "bool | None"),
        (
            "default_value",
            "Any | None",
        ),
        ("coerce", "bool | None"),
        (
            "cat_filters",
            "dict[str",
        ),
    ]


def test_get_record_kwargs_empty():
    class EmptySQLRecord:
        pass

    assert _get_record_kwargs(EmptySQLRecord) == []

    class NoInitSQLRecord:
        def method(self):
            pass

    assert _get_record_kwargs(NoInitSQLRecord) == []


def test_soft_delete_error():
    with pytest.raises(ValueError):
        ln.Storage.filter().first().delete(permanent=False)
    with pytest.raises(ValueError):
        ln.Branch.filter().first().delete(permanent=False)


def test_delete_return_value_permanent():
    """Test that permanent delete returns Django's natural return value."""
    # Test with ULabel (simple SQLRecord)
    ulabel = ln.ULabel(name="test-delete-return").save()
    result = ulabel.delete(permanent=True)
    assert isinstance(result, tuple)
    assert len(result) == 2
    deleted_count, deleted_dict = result
    assert deleted_count == 1
    assert isinstance(deleted_dict, dict)
    assert len(deleted_dict) > 0
    # Check that the registry name is in the dict
    # Django returns app_label.ClassName format
    registry_name = f"{ulabel._meta.app_label}.{ulabel.__class__.__name__}"
    assert registry_name in deleted_dict
    assert deleted_dict[registry_name] == 1


def test_unsaved_relationship_modification_attempts():
    af = ln.Artifact.from_dataframe(
        pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]}), description="testme"
    )
    new_label = ln.Record(name="testlabel").save()
    with pytest.raises(ValueError) as excinfo:
        af.records.add(new_label)
    assert (
        str(excinfo.value)
        == "You are trying to access the many-to-many relationships of an unsaved Artifact object. Please save it first using '.save()'."
    )
    new_label.delete(permanent=True)
    af.delete(permanent=True)


def test_failed_connect():
    with pytest.raises(ln.setup.errors.InstanceNotFoundError) as error:
        ln.Artifact.connect("laminlabs/lamindata-not-existing")
    assert error.exconly().startswith(
        "lamindb_setup.errors.InstanceNotFoundError: 'laminlabs/lamindata-not-existing' not found: 'instance-not-found'"
    )


def test_unsaved_model_different_instance():
    af = ln.Artifact.connect("laminlabs/lamindata").get(
        key="scrna/micro-macfarland2020.h5ad"
    )
    new_label = ln.Record(name="testlabel").save()
    with pytest.raises(ValueError) as excinfo:
        af.records.add(new_label)
    assert (
        str(excinfo.value)
        == "Cannot label a record from instance 'laminlabs/lamindata'. "
        "Please save the record first to your instance using '.save()'."
) new_label.delete(permanent=True) def test_track_fields_with_deferred_columns(example_dataframe: pd.DataFrame): artifact = ln.Artifact.from_dataframe( example_dataframe, key="deferred-track-fields.parquet" ).save() # loading a tracked field as deferred should not crash in __init__ deferred_artifact = ln.Artifact.filter(id=artifact.id).only("id").one() assert deferred_artifact.id == artifact.id assert not deferred_artifact._field_changed("space_id") artifact.delete(permanent=True) def test_track_fields_must_exist_on_model(monkeypatch, example_dataframe: pd.DataFrame): artifact = ln.Artifact.from_dataframe( example_dataframe, key="invalid-track-field.parquet" ).save() monkeypatch.setattr(ln.Artifact, "_TRACK_FIELDS", ("space_id", "not_a_real_field")) with pytest.raises( FieldValidationError, match="_TRACK_FIELDS contains invalid field for Artifact: not_a_real_field", ): ln.Artifact.get(artifact.id) artifact.delete(permanent=True) ================================================ FILE: tests/core/test_storage.py ================================================ import concurrent.futures import lamindb as ln # we need this test both in the core and the storage/cloud tests # because the internal logic that retrieves information about other instances # depends on whether the current instance is managed on the hub def test_reference_storage_location(ccaplog): ln.Artifact("s3://lamindata/iris_studies/study0_raw_images") assert ln.Storage.get(root="s3://lamindata").instance_uid == "4XIuR0tvaiXM" assert ( "referenced read-only storage location at s3://lamindata, is managed by instance with uid 4XIuR0tvaiXM" in ccaplog.text ) def test_create_storage_locations_parallel(): root: str = "nonregistered_storage" def create_storage() -> str: ln.Storage(root=root).save() # type: ignore return root n_parallel = 3 with concurrent.futures.ThreadPoolExecutor(max_workers=n_parallel) as executor: futures = [executor.submit(create_storage) for i in range(n_parallel)] _ = [future.result() for future in concurrent.futures.as_completed(futures)] storage = ln.Storage.get(root__endswith=root) storage.delete() ================================================ FILE: tests/core/test_switch.py ================================================ """Tests for ln.setup.switch.""" import lamindb as ln import pytest def test_switch_create_existing_branch_raises(): """Switch with create=True and existing branch raises BranchAlreadyExists with hint.""" with pytest.raises(ln.errors.BranchAlreadyExists) as exc_info: ln.setup.switch("main", create=True) msg = str(exc_info.value) assert "already exists" in msg assert "-c/--create" in msg or "Omit" in msg ================================================ FILE: tests/core/test_track_flow.py ================================================ import time from pathlib import Path from typing import Iterable import lamindb as ln import pandas as pd import pytest from lamindb.errors import InvalidArgument @ln.flow(global_run="clear") def process_chunk( chunk_id: int, artifact_param: ln.Artifact, records_params: Iterable[ln.Record] ) -> str: # Create a simple DataFrame df = pd.DataFrame( {"id": range(chunk_id * 10, (chunk_id + 1) * 10), "value": range(10)} ) env_file = Path("file_with_same_hash.txt") env_file.write_text("1") ln.Artifact(env_file, description="file_with_same_hash").save() # Save it as an artifact key = f"chunk_{chunk_id}.parquet" artifact = ln.Artifact.from_dataframe(df, key=key).save() assert ln.context.run is not None return artifact.key def test_flow(): param_artifact = 
ln.Artifact(".gitignore", key="param_artifact").save() ln.Record(name="record1").save(), ln.Record(name="record2").save() records_params = ln.Record.filter(name__startswith="record") assert ln.context.run is None artifact_key = process_chunk(1, param_artifact, records_params) assert ln.context.run is None # Verify the artifacts and runs artifacts = [ln.Artifact.get(key=key) for key in [artifact_key]] same_hash_artifacts = ln.Artifact.filter(description="file_with_same_hash") runs = [artifact.run for artifact in artifacts] # Verify each run has the correct start and finish times for run in runs: print(f"Run details: {run}") assert run.started_at is not None assert run.finished_at is not None assert run.started_at < run.finished_at assert run.status == "completed" assert isinstance(run.params["chunk_id"], int) assert run.params["artifact_param"].startswith( f"Artifact[{param_artifact.uid}]" ) assert run.params["records_params"] == [ f"Record[{record.uid}]" for record in records_params ] # test error behavior with pytest.raises(RuntimeError) as error: ln.context._run = run process_chunk(1, param_artifact, records_params) ln.context._run = None assert str(error.exconly()).startswith( "RuntimeError: Please use @ln.step() or clear the global run context before using @ln.flow(): no `ln.track()` or `@ln.flow(global_run='clear')`" ) # Clean up test artifacts runs = [] for artifact in artifacts: runs.append(artifact.run) artifact.delete(permanent=True) param_artifact.delete(permanent=True) same_hash_artifacts[0].delete(permanent=True) Path("file_with_same_hash.txt").unlink() for run in runs: run.delete(permanent=True) ln.context._run = None def test_flow_track_arg_aliases_implicit(): unique = time.time_ns() missing_project = f"missing-flow-project-{unique}" @ln.flow(global_run="clear") def flow_with_implicit_project_alias(project: str) -> None: pass with pytest.raises(InvalidArgument) as error: flow_with_implicit_project_alias(project=missing_project) assert error.exconly().startswith( f"lamindb.errors.InvalidArgument: Project '{missing_project}' not found" ) def test_flow_track_arg_aliases_false(): unique = time.time_ns() missing_project = f"missing-flow-project-{unique}" @ln.flow(global_run="clear", track_arg_aliases=False) def flow_without_project_alias(project: str) -> str: assert ln.context.run is not None return ln.context.run.uid run = None try: run_uid = flow_without_project_alias(project=missing_project) run = ln.Run.get(uid=run_uid) assert run.params["project"] == missing_project finally: ln.context._run = None if run is not None: run.delete(permanent=True) run.transform.delete(permanent=True) ================================================ FILE: tests/core/test_track_script_or_notebook.py ================================================ import signal import subprocess import sys import time from pathlib import Path from unittest.mock import MagicMock, patch import lamindb as ln import lamindb_setup as ln_setup import pytest from lamindb._finish import clean_r_notebook_html, get_shortcut from lamindb._secret_redaction import redact_secrets_in_source_code from lamindb.core._context import ( REDACTED_SECRET_VALUE, LogStreamTracker, context, detect_and_process_source_code_file, serialize_params_to_json, ) from lamindb.errors import InvalidArgument, TrackNotCalled, ValidationError from lamindb_setup.core.upath import UPath SCRIPTS_DIR = Path(__file__).parent.resolve() / "scripts" NOTEBOOKS_DIR = Path(__file__).parent.resolve() / "notebooks" def test_serialize_params_to_json(): a_path = 
Path("/some/local/folder") a_upath = UPath("s3://bucket/key") params = { "path_key": a_path, "none_key": None, "empty_list_key": [], "list_str_key": ["string"], "upath_key": a_upath, "str_key": "plain", "api_key": "test-api-key-value", "openAIApiKey": "another-secret", "database_url": "postgresql://db_user:db_password@db.example.com:5432/mydb", } result = serialize_params_to_json(params) # None is omitted assert "none_key" not in result # Empty list is omitted (same as None) assert "empty_list_key" not in result # Path is serialized to posix string assert result["path_key"] == "/some/local/folder" # UPath is serialized to posix string assert result["upath_key"] == "s3://bucket/key" # List of strings is JSON-serialized as-is (list[cat ? str]) assert result["list_str_key"] == ["string"] # Other values unchanged assert result["str_key"] == "plain" assert result["api_key"] == REDACTED_SECRET_VALUE assert result["openAIApiKey"] == REDACTED_SECRET_VALUE assert result["database_url"] == REDACTED_SECRET_VALUE assert set(result.keys()) == { "path_key", "upath_key", "str_key", "list_str_key", "api_key", "openAIApiKey", "database_url", } def test_redact_secrets_in_source_code(): source_code = """ api_key = "test-api-key-value" openAIApiKey = "another-secret" uid = "a6yhtobqTjQM6q8t" db_url = "postgresql://db_user:db_password@db.example.com:5432/mydb" os.environ["API_KEY"] = "sdk-key" config = {"client_secret": "client-secret-value", "id": "abc123"} """ redacted, redaction_count = redact_secrets_in_source_code(source_code) assert redaction_count == 5 assert 'api_key = "***REDACTED***"' in redacted assert 'openAIApiKey = "***REDACTED***"' in redacted assert 'db_url = "***REDACTED***"' in redacted assert 'os.environ["API_KEY"] = "***REDACTED***"' in redacted assert '"client_secret": "***REDACTED***"' in redacted assert 'uid = "a6yhtobqTjQM6q8t"' in redacted def test_redact_secrets_in_source_code_keeps_env_references(): source_code = """ api_key = os.getenv("OPENAI_API_KEY") openAIApiKey = getenv("OPENAI_API_KEY") model_api_key = os.environ["MODEL_API_KEY"] provider_token = os.environ.get("PROVIDER_TOKEN") """ redacted, redaction_count = redact_secrets_in_source_code(source_code) # Env lookups are references, not embedded literals. Keep them for rerunnable source code. assert redaction_count == 0 assert 'api_key = os.getenv("OPENAI_API_KEY")' in redacted assert 'openAIApiKey = getenv("OPENAI_API_KEY")' in redacted assert 'model_api_key = os.environ["MODEL_API_KEY"]' in redacted assert 'provider_token = os.environ.get("PROVIDER_TOKEN")' in redacted def test_redact_secrets_in_source_code_ignores_annotations_and_forwarding(): source_code = """ def run(api_key: str) -> None: raise RuntimeError("fail") run_agent( api_key=api_key, ) """ redacted, redaction_count = redact_secrets_in_source_code(source_code) # Do not treat Python type annotations or argument forwarding as hardcoded secrets. 
assert redaction_count == 0 assert "def run(api_key: str) -> None:" in redacted assert "api_key=api_key," in redacted def test_serialize_params_to_json_redacts_provider_api_key_names(): params = { "LAMIN_API_KEY": "lamin-super-secret", "OPENAI_API_KEY": "openai-super-secret", "ANTHROPIC_API_KEY": "anthropic-super-secret", "GEMINI_API_KEY": "gemini-super-secret", "provider_name": "safe-value", } result = serialize_params_to_json(params) assert result["LAMIN_API_KEY"] == REDACTED_SECRET_VALUE assert result["OPENAI_API_KEY"] == REDACTED_SECRET_VALUE assert result["ANTHROPIC_API_KEY"] == REDACTED_SECRET_VALUE assert result["GEMINI_API_KEY"] == REDACTED_SECRET_VALUE assert result["provider_name"] == "safe-value" def test_redact_secrets_in_source_code_redacts_provider_api_key_names(): source_code = """ LAMIN_API_KEY = "lamin-super-secret" OPENAI_API_KEY = "openai-super-secret" ANTHROPIC_API_KEY = "anthropic-super-secret" GEMINI_API_KEY = "gemini-super-secret" provider = "openai" """ redacted, redaction_count = redact_secrets_in_source_code(source_code) assert redaction_count == 4 assert 'LAMIN_API_KEY = "***REDACTED***"' in redacted assert 'OPENAI_API_KEY = "***REDACTED***"' in redacted assert 'ANTHROPIC_API_KEY = "***REDACTED***"' in redacted assert 'GEMINI_API_KEY = "***REDACTED***"' in redacted assert 'provider = "openai"' in redacted def test_track_basic_invocation(): project = "non-existing project" with pytest.raises(ln.errors.InvalidArgument) as error: ln.track(project=project) assert ( error.exconly() == f"lamindb.errors.InvalidArgument: Project '{project}' not found, either create it with `ln.Project(name='...').save()` or fix typos." ) space = "non-existing space" with pytest.raises(ln.errors.InvalidArgument) as error: ln.track(space=space) assert ( error.exconly() == f"lamindb.errors.InvalidArgument: Space '{space}', please check on the hub UI whether you have the correct `uid` or `name`." 
) test_transform = ln.Transform(key="test_transform").save() # first invocation using features kwargs = {"param1": 1, "param2": "my-string", "param3": 3.14} with pytest.raises(ValidationError) as exc: ln.track(transform=test_transform, features=kwargs) assert exc.exconly().startswith( """lamindb.errors.ValidationError: These keys could not be validated: ['param1', 'param2', 'param3']""" ) feature1 = ln.Feature(name="param1", dtype=int).save() feature2 = ln.Feature(name="param2", dtype=str).save() feature3 = ln.Feature(name="param3", dtype=float).save() feature4 = ln.Feature(name="label_param", dtype=ln.Record).save() record = ln.Record(name="my_label").save() kwargs["label_param"] = "my_label" ln.track(transform=test_transform, features=kwargs) assert ln.context.run.features.get_values() == kwargs print(ln.context.run.features.describe(return_str=True)) assert ( ln.context.run.features.describe(return_str=True) == f"""\ Run: {ln.context.run.uid[:7]} ({ln.context.run.transform.key}) └── Features └── label_param Record my_label param1 int 1 param2 str my-string param3 float 3.14""" ) # also call describe() plainly without further checks ln.context.run.describe() # second invocation kwargs = {"param1": 1, "param2": "my-string", "param3": 3.14, "param4": [1, 2]} param4 = ln.Feature(name="param4", dtype="int").save() with pytest.raises(ValidationError) as exc: ln.track(transform=test_transform, features=kwargs) assert "Column 'param4' failed dtype check for 'int': got object" in exc.exconly() # fix param4 dtype param4.delete(permanent=True) param4 = ln.Feature(name="param4", dtype=list[int]).save() # re-run ln.track(transform=test_transform, features=kwargs) assert ln.context.run.features.get_values() == kwargs # now use the params arg ln.track(transform=test_transform, params=kwargs) assert ln.context.run.params == kwargs assert ln.Run.filter(params__param1=kwargs["param1"]).count() == 1 # test that run populates things like records record = ln.Record(name="my-label-in-track") assert record.run == ln.context.run # test that we can call ln.finish() also for pipeline-like transforms run = ln.context.run assert run.finished_at is None ln.finish() assert ( run.finished_at is not None ) # context is cleared after finish(); use captured run # clean up run.delete(permanent=True) ln.models.RunJsonValue.filter(run__transform=test_transform).delete(permanent=True) ln.models.RunRecord.filter(run__transform=test_transform).delete(permanent=True) feature1.delete(permanent=True) feature2.delete(permanent=True) feature3.delete(permanent=True) feature4.delete(permanent=True) param4.delete(permanent=True) test_transform.delete(permanent=True) def test_track_accepts_initiated_by_run_uid(): unique = time.time_ns() parent_transform = ln.Transform(key=f"parent-run-{unique}").save() child_transform = ln.Transform(key=f"child-run-{unique}").save() parent_run = ln.Run(transform=parent_transform).save() try: ln.track( transform=child_transform, initiated_by_run=parent_run.uid, new_run=True, ) assert ln.context.run is not None assert ln.context.run.initiated_by_run is not None assert ln.context.run.initiated_by_run.uid == parent_run.uid ln.finish() with pytest.raises(InvalidArgument) as error: ln.track( transform=child_transform, initiated_by_run="does-not-exist", new_run=True, ) assert error.exconly().startswith( "lamindb.errors.InvalidArgument: Run 'does-not-exist' not found" ) finally: ln.context._run = None ln.Run.filter(transform=child_transform).delete(permanent=True) parent_run.delete(permanent=True) 
child_transform.delete(permanent=True) parent_transform.delete(permanent=True) def test_track_uses_initiated_by_run_uid_from_env(monkeypatch: pytest.MonkeyPatch): unique = time.time_ns() parent_transform = ln.Transform(key=f"parent-run-env-{unique}").save() child_transform = ln.Transform(key=f"child-run-env-{unique}").save() parent_run = ln.Run(transform=parent_transform).save() try: monkeypatch.setenv("LAMIN_INITIATED_BY_RUN_UID", parent_run.uid) ln.track(transform=child_transform, new_run=True) assert ln.context.run is not None assert ln.context.run.initiated_by_run is not None assert ln.context.run.initiated_by_run.uid == parent_run.uid ln.finish() finally: ln.context._run = None ln.Run.filter(transform=child_transform).delete(permanent=True) parent_run.delete(permanent=True) child_transform.delete(permanent=True) parent_transform.delete(permanent=True) @pytest.mark.parametrize("pass_plan_as_key", [False, True], ids=["artifact", "key"]) def test_track_with_plan_links_run(tmp_path, pass_plan_as_key): unique = time.time_ns() plan_path = tmp_path / f"my-agent-plan-{unique}.md" plan_path.write_text("# Agent plan\n\n- Step 1\n") plan_artifact = ln.Artifact( plan_path, key=f".plans/my-agent-plan-{unique}.md", kind="plan", ).save() transform = ln.Transform(key=f"test-track-with-plan-{unique}").save() try: plan = plan_artifact.key if pass_plan_as_key else plan_artifact ln.track(transform=transform, plan=plan) run = ln.context.run assert run.plan is not None assert run.plan.uid == plan_artifact.uid run_from_db = ln.Run.get(uid=run.uid) assert run_from_db.plan is not None assert run_from_db.plan.uid == plan_artifact.uid ln.finish() finally: ln.context._run = None ln.Run.filter(transform=transform).delete(permanent=True) plan_artifact.delete(permanent=True) transform.delete(permanent=True) @pytest.fixture def create_record(): """Factory fixture that returns a function to create records.""" created_records = [] def create(kind: str) -> ln.models.SQLRecord: if kind == "artifact": record = ln.Artifact("README.md", key="README.md").save() elif kind == "collection": a1 = ln.Artifact("README.md", key="README.md").save() created_records.append(a1) a2 = ln.Artifact("pyproject.toml", key="pyproject.toml").save() created_records.append(a2) record = ln.Collection([a1, a2], key="test-collection").save() created_records.append(record) return record yield create for record in created_records[::-1]: record.delete(permanent=True) @pytest.mark.parametrize("kind", ["artifact", "collection"]) def test_track_input_record(create_record, kind): # First run ln.track() previous_run = ln.context.run record = create_record(kind) record.cache() assert ( record not in getattr(ln.context.run, f"input_{kind}s").all() ) # avoid cycle with created artifact # Second run ln.track(new_run=True) assert ln.context.run != previous_run record = create_record(kind) assert ln.context.run in record.recreating_runs.all() assert record._subsequent_run_id == ln.context.run.id record.cache() assert ( record not in getattr(ln.context.run, f"input_{kind}s").all() ) # avoid cycle with re-created artifact # Third run ln.track(new_run=True) assert ln.context.run != previous_run if kind == "artifact": record = ln.Artifact.get(key="README.md") else: record = ln.Collection.get(key="test-collection") record.cache() assert ln.context.run not in record.recreating_runs.all() assert not hasattr(record, "_subsequent_run_id") assert record in getattr(ln.context.run, f"input_{kind}s").all() # regular input def test_track_notebook_colab(): notebook_path = 
"/fileId=1KskciVXleoTeS_OGoJasXZJreDU9La_l" ln.context._track_notebook(path_str=notebook_path) def test_track_notebook_untitled(): notebook_path = "Untitled.ipynb" with pytest.raises(RuntimeError) as error: ln.context._track_notebook(path_str=notebook_path) assert ( "Your notebook file name is 'Untitled.ipynb', please rename it before tracking. You might have to re-start your notebook kernel." in error.exconly() ) def test_detect_and_process_source_code_file_returns_key_from_module_for_package(): """When path is inferred from stack and caller __name__ has '.', key_from_module is module path.""" script_path = str(SCRIPTS_DIR / "script-to-test-versioning.py") mock_frame = MagicMock() mock_frame.f_globals = {"__name__": "mypackage.mymodule"} with patch("inspect.stack") as mock_stack: mock_stack.return_value = [ MagicMock(), MagicMock(), ( mock_frame, script_path, MagicMock(), MagicMock(), MagicMock(), MagicMock(), ), ] path, kind, ref, ref_type, key_from_module = ( detect_and_process_source_code_file(path=None) ) assert key_from_module == "pypackages/mypackage/mymodule.py" assert path == Path(script_path) def test_detect_and_process_source_code_file_returns_none_key_for_script(): """When path is inferred from stack and caller __name__ has no '.', key_from_module is None.""" script_path = str(SCRIPTS_DIR / "script-to-test-versioning.py") mock_frame = MagicMock() mock_frame.f_globals = {"__name__": "__main__"} with patch("inspect.stack") as mock_stack: mock_stack.return_value = [ MagicMock(), MagicMock(), ( mock_frame, script_path, MagicMock(), MagicMock(), MagicMock(), MagicMock(), ), ] path, kind, ref, ref_type, key_from_module = ( detect_and_process_source_code_file(path=None) ) assert key_from_module is None def test_finish_before_track(): ln.context._run = None with pytest.raises(TrackNotCalled) as error: ln.finish() assert "Please run `ln.track()` before `ln.finish()" in error.exconly() def test_invalid_transform_kind(): transform = ln.Transform(key="test transform") ln.track(transform=transform) ln.context._path = None ln.context.run.transform.kind = "script" with pytest.raises(ValueError) as error: ln.finish() assert "Transform type is not allowed to be" in error.exconly() # unset to remove side effects ln.context._run = None def test_create_or_load_transform(): title = "title" version = "2.0" uid = "NJvdsWWbJlZS0000" context.uid = uid context.version = version context._path = Path("my-test-transform-create-or-load.py") context._path.touch(exist_ok=True) context._create_or_load_transform( description=title, transform_kind="notebook", ) assert context._transform.uid == uid assert context._transform.version_tag == version assert context._transform.description == title context._create_or_load_transform( description=title, ) assert context._transform.uid == uid assert context._transform.version_tag == version assert context._transform.description == title # now, test an updated transform name context._create_or_load_transform( description="updated title", ) assert context._transform.uid == uid assert context._transform.version_tag == version assert context._transform.description == "updated title" # unset to remove side effects ln.context._uid = None ln.context._run = None ln.context._transform = None ln.context._path.unlink() ln.context._path = None def test_create_or_load_transform_warns_when_outside_dev_dir( tmp_path, ccaplog: pytest.LogCaptureFixture ): previous_dev_dir = ln_setup.settings.dev_dir path_outside_dev_dir = tmp_path / f"outside-{time.time_ns()}.py" 
path_outside_dev_dir.write_text("print('track test')\n") expected_key = path_outside_dev_dir.name transform: ln.Transform | None = None try: ln_setup.settings.dev_dir = tmp_path / "configured-dev-dir" ln_setup.settings.dev_dir.mkdir(exist_ok=True) ccaplog.clear() context._path = path_outside_dev_dir context._create_or_load_transform(description="outside dev dir warning test") transform = context._transform assert "falling back to using filename as transform key" in ccaplog.text assert transform.key == expected_key finally: ln_setup.settings.dev_dir = previous_dev_dir ln.context._uid = None ln.context._run = None ln.context._transform = None ln.context._path = None if transform is not None: transform.delete(permanent=True) def test_run_scripts(): # regular execution result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'script-to-test-versioning.py --param 42'}", shell=True, capture_output=True, ) assert result.returncode == 0 assert "created Transform('Ro1gl7n8YrdH0000'" in result.stdout.decode() assert "started new Run(" in result.stdout.decode() transform = ln.Transform.get("Ro1gl7n8YrdH0000") assert transform.latest_run.cli_args == "--param 42" # updated key (filename change) result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'script-to-test-filename-change.py'}", shell=True, capture_output=True, ) assert result.returncode == 0 assert "renaming transform" in result.stdout.decode() transform = ln.Transform.get(key="script-to-test-filename-change.py") assert transform.latest_run.cli_args is None # version already taken result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'duplicate1/script-to-test-versioning.py'}", shell=True, capture_output=True, ) assert result.returncode == 1 assert ( "✗ version '1' is already taken by Transform('Ro1gl7n8YrdH0000'); please set another version, e.g., ln.context.version = '1.1'" in result.stderr.decode() ) # regular version bump result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'duplicate2/script-to-test-versioning.py'}", shell=True, capture_output=True, ) assert result.returncode == 0 assert "created Transform('Ro1gl7n8YrdH0002'" in result.stdout.decode() assert "started new Run(" in result.stdout.decode() assert not ln.Transform.get("Ro1gl7n8YrdH0001").is_latest assert ln.Transform.get("Ro1gl7n8YrdH0002").is_latest # inconsistent version result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'duplicate3/script-to-test-versioning.py'}", shell=True, capture_output=True, ) assert result.returncode == 1 assert ( "Transform is already tagged with version 2, but you passed 3" in result.stderr.decode() ) # multiple folders, do not match the key because of the folder structure ln.Transform.filter(key__endswith="script-to-test-versioning.py").update( key="teamA/script-to-test-versioning.py" ) # this test creates a transform with key script-to-test-versioning.py at the root level result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'duplicate4/script-to-test-versioning.py'}", shell=True, capture_output=True, ) assert result.returncode == 0 assert "ignoring transform" in result.stdout.decode() transform = ln.Transform.get(key="script-to-test-versioning.py") # multiple folders, match the key, also test is finished result = subprocess.run( # noqa: S602 f"python {SCRIPTS_DIR / 'duplicate5/script-to-test-versioning.py'}", shell=True, capture_output=True, ) assert result.returncode == 0 assert f"{transform.stem_uid}" in result.stdout.decode() assert "making new version" in result.stdout.decode() transform = 
ln.Transform.get(key="script-to-test-versioning.py") assert transform.latest_run.finished_at is not None def test_run_external_script(): script_path = "sub/lamin-cli/tests/scripts/run-track-and-finish-sync-git.py" result = subprocess.run( # noqa: S602 f"python {script_path}", shell=True, capture_output=True, ) print(result.stdout.decode()) print(result.stderr.decode()) assert result.returncode == 0 assert "created Transform" in result.stdout.decode() assert "started new Run" in result.stdout.decode() transform = ln.Transform.get(key="run-track-and-finish-sync-git.py") # the algorithm currently picks different commits depending on the state of the repo # any of these commits are valid assert transform.uid == "m5uCHTTpJnjQ0000" assert transform.reference.endswith( "/tests/scripts/run-track-and-finish-sync-git.py" ) assert transform.reference.startswith( "https://github.com/laminlabs/lamin-cli/blob/" ) assert transform.reference_type == "url" assert transform.description == "My good script" # ensure that the source code is not saved as an output artifact assert transform.latest_run.output_artifacts.count() == 0 assert transform.runs.count() == 1 assert transform.hash == "VC1oTPcaVSrzNrXUT9p4qw" @pytest.mark.parametrize("type", ["notebook", "script"]) def test_track_notebook_or_script_manually(type): transform = ln.Transform(key="My notebook", kind=type) with pytest.raises(ValueError) as error: ln.track(transform=transform) assert ( error.exconly() == "ValueError: Use `ln.track()` without passing transform in a notebook or script - metadata is automatically parsed" ) def test_clean_r_notebook_html(): orig_notebook_path = NOTEBOOKS_DIR / "basic-r-notebook.Rmd.html" content = orig_notebook_path.read_text() orig_notebook_path.write_text(content.replace("SHORTCUT", get_shortcut())) comparison_path = NOTEBOOKS_DIR / "basic-r-notebook.Rmd.cleaned.html" compare = comparison_path.read_text() comparison_path.unlink() title_text, cleaned_path = clean_r_notebook_html(orig_notebook_path) assert comparison_path == cleaned_path assert title_text == "My exemplary R analysis" assert compare == cleaned_path.read_text() # check that things have been stripped comparison_path.write_text(compare) orig_notebook_path.write_text(content.replace(get_shortcut(), "SHORTCUT")) def test_notebook_to_script_notebooknode_metadata(tmp_path): """Test that notebook_to_script handles NotebookNode metadata. 
https://github.com/laminlabs/lamindb/issues/3480 """ import nbformat from lamindb._finish import notebook_to_script nb = nbformat.v4.new_notebook() nb.metadata["kernelspec"] = nbformat.NotebookNode({"display_name": "python3"}) notebook_path = tmp_path / "test.ipynb" nbformat.write(nb, notebook_path) # This would raise RepresenterError without metadata.clear() result = notebook_to_script("Test", notebook_path) assert result is not None assert "NotebookNode" not in result class MockRun: def __init__(self, uid): self.uid = uid self.report = None self.saved = False def save(self): self.saved = True def test_logstream_tracker_multiple(): tracker1 = LogStreamTracker() tracker2 = LogStreamTracker() tracker3 = LogStreamTracker() try: # Start trackers one by one and print messages print("Initial stdout") tracker1.start(MockRun("run1")) print("After starting tracker1") tracker2.start(MockRun("run2")) print("After starting tracker2") tracker3.start(MockRun("run3")) print("After starting tracker3") print("Testing stderr", file=sys.stderr) time.sleep(0.1) # Clean up in reverse order tracker3.finish() tracker2.finish() tracker1.finish() # Verify log contents - each log should only contain messages after its start expected_contents = { 1: [ "After starting tracker1", "After starting tracker2", "After starting tracker3", "Testing stderr", ], 2: ["After starting tracker2", "After starting tracker3", "Testing stderr"], 3: ["After starting tracker3", "Testing stderr"], } for i in range(1, 4): log_path = Path(ln_setup.settings.cache_dir / f"run_logs_run{i}.txt") with open(log_path) as f: content = f.read() print(f"\nContents of run{i} log:") print(content) # Check each expected line is in the content for expected_line in expected_contents[i]: assert expected_line in content, ( f"Expected '{expected_line}' in log {i}" ) # Check earlier messages are NOT in the content if i > 1: assert "Initial stdout" not in content assert "After starting tracker" + str(i - 1) not in content finally: # Cleanup for i in range(1, 4): log_path = Path(ln_setup.settings.cache_dir / f"run_logs_run{i}.txt") if log_path.exists(): log_path.unlink() def test_logstream_tracker_exception_handling(): tracker = LogStreamTracker() original_excepthook = sys.excepthook run = MockRun("error") try: tracker.start(run) print("Before error") # Create and capture exception info exc_type = ValueError exc_value = ValueError("Test error") exc_traceback = None try: raise exc_value except ValueError: exc_traceback = sys.exc_info()[2] # Handle the exception - this will trigger cleanup tracker.handle_exception(exc_type, exc_value, exc_traceback) # Verify run status assert run.saved assert run.report is not None # Verify the content was written before cleanup content = run.report.cache().read_text() print("Log contents:", content) assert "Before error" in content assert "ValueError: Test error" in content assert "Traceback" in content finally: tracker.finish() sys.excepthook = original_excepthook log_path = Path(ln_setup.settings.cache_dir / f"run_logs_{run.uid}.txt") if log_path.exists(): log_path.unlink() def test_logstream_tracker_cleanup_sigint_chains_to_keyboard_interrupt(): tracker = LogStreamTracker() run = MockRun("sigint") original_excepthook = sys.excepthook def raising_sigint_handler(signum, frame): raise KeyboardInterrupt try: with ( patch( "signal.getsignal", side_effect=[signal.SIG_DFL, raising_sigint_handler], ), patch("signal.signal"), patch("lamindb._finish.save_run_logs"), ): tracker.start(run) with pytest.raises(KeyboardInterrupt): 
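                # cleanup on SIGINT is expected to chain to the previously registered
                # handler, which raises KeyboardInterrupt in this test setup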
tracker.cleanup(signo=signal.SIGINT, frame=None) finally: tracker.finish() sys.excepthook = original_excepthook log_path = Path(ln_setup.settings.cache_dir / f"run_logs_{run.uid}.txt") if log_path.exists(): log_path.unlink() ================================================ FILE: tests/core/test_track_step.py ================================================ import concurrent.futures from pathlib import Path from typing import Iterable import lamindb as ln import pandas as pd import pytest @ln.step() def process_chunk( chunk_id: int, artifact_param: ln.Artifact, records_params: Iterable[ln.Record] ) -> str: # Create a simple DataFrame df = pd.DataFrame( {"id": range(chunk_id * 10, (chunk_id + 1) * 10), "value": range(10)} ) env_file = Path("file_with_same_hash.txt") env_file.write_text("1") ln.Artifact(env_file, description="file_with_same_hash").save() # Save it as an artifact key = f"chunk_{chunk_id}.parquet" artifact = ln.Artifact.from_dataframe(df, key=key).save() return artifact.key def test_step_parallel(): # Ensure no global run from a previous test (e.g. test_flow) ln.context._run = None with pytest.raises(RuntimeError) as err: process_chunk(4) assert ( err.exconly() == "RuntimeError: Please track the global run context before using @ln.step(): ln.track() or @ln.flow()" ) # Ensure tracking is on ln.track() # Number of parallel executions n_parallel = 3 param_artifact = ln.Artifact(".gitignore", key="param_artifact").save() ln.Record(name="record1").save(), ln.Record(name="record2").save() records_params = ln.Record.filter(name__startswith="record") # Use ThreadPoolExecutor for parallel execution with concurrent.futures.ThreadPoolExecutor(max_workers=n_parallel) as executor: # Submit all tasks futures = [ executor.submit(process_chunk, i, param_artifact, records_params) for i in range(n_parallel) ] # Get results as they complete chunk_keys = [ future.result() for future in concurrent.futures.as_completed(futures) ] # Verify results # Each execution should have created its own artifact with unique run print(f"Created artifacts with keys: {chunk_keys}") artifacts = [ln.Artifact.get(key=key) for key in chunk_keys] same_hash_artifacts = ln.Artifact.filter(description="file_with_same_hash") # Check that we got the expected number of artifacts assert len(artifacts) == n_parallel assert ( len(same_hash_artifacts) == 1 ) # only one artifact with the same hash should exist # Verify each artifact has its own unique run runs = [artifact.run for artifact in artifacts] run_ids = [run.id for run in runs] print(f"Run IDs: {run_ids}") assert len(set(run_ids)) == n_parallel # all runs should be unique # Verify each run has the correct start and finish times for run in runs: print(f"Run details: {run}") assert run.started_at is not None assert run.finished_at is not None assert run.started_at < run.finished_at assert run.status == "completed" assert isinstance(run.params["chunk_id"], int) assert run.params["artifact_param"].startswith( f"Artifact[{param_artifact.uid}]" ) assert run.params["records_params"] == [ f"Record[{record.uid}]" for record in records_params ] # Clean up test artifacts runs = [] for artifact in artifacts: runs.append(artifact.run) artifact.delete(permanent=True) param_artifact.delete(permanent=True) same_hash_artifacts[0].delete(permanent=True) Path("file_with_same_hash.txt").unlink() for run in runs: run.delete(permanent=True) ln.context._uid = None ln.context._run = None ln.context._transform = None ln.context._path = None ================================================ FILE: 
tests/core/test_transform.py ================================================ from pathlib import Path from unittest.mock import patch import lamindb as ln import pytest def test_transform_recovery_based_on_hash(): transform1 = ln.Transform(key="my-transform", source_code="1").save() transform2 = ln.Transform(key="my-transform", source_code="1") assert transform1 == transform2 transform1.delete() transform2 = ln.Transform(key="my-transform", source_code="1") assert transform1 != transform2 transform1.delete(permanent=True) def test_transform_recovery_based_on_key(): transform1 = ln.Transform(key="my-transform").save() transform2 = ln.Transform(key="my-transform") assert transform1 == transform2 transform1.delete() transform2 = ln.Transform(key="my-transform") assert transform1 != transform2 transform1.delete(permanent=True) def test_revise_transforms(): # attempt to create a transform with an invalid version with pytest.raises(ValueError) as error: transform = ln.Transform(key="My transform", version=0) assert ( error.exconly() == "ValueError: `version` parameter must be `None` or `str`, e.g., '0.1', '1'," " '2', etc." ) # create a versioned transform transform = ln.Transform(key="My transform", version="1") assert transform.version_tag == "1" assert transform.version == "1" assert len(transform.uid) == ln.Transform._len_full_uid == 16 assert len(transform.stem_uid) == ln.Transform._len_stem_uid == 12 transform.save() # try to reload the same transform with the same uid transform_reload = ln.Transform(uid=transform.uid, key="My transform updated name") assert transform_reload.id == transform.id assert transform_reload.key == "My transform" # unchanged, prints logging transform_reload = ln.Transform( uid=transform.uid, description="My transform updated name" ) assert transform_reload.id == transform.id assert ( transform_reload.description == "My transform updated name" ) # unchanged, prints logging # create new transform from old transform transform_r2 = ln.Transform(description="My 2nd transform", revises=transform) assert transform_r2.uid != transform.uid assert transform_r2.uid.endswith("0001") transform_r2 = ln.Transform(description="My 2nd transform", revises=transform) assert transform_r2.uid != transform.uid assert transform_r2.uid.endswith("0001") assert transform_r2.stem_uid == transform.stem_uid assert transform_r2.version_tag is None assert ( transform_r2.version == transform_r2.uid[-4:] ) # version falls back to uid suffix assert transform_r2.is_latest assert transform.is_latest transform_r2.save() assert not transform.is_latest # create new transform from newly versioned transform transform_r3 = ln.Transform( description="My transform", revises=transform_r2, version="2" ) assert transform_r3.stem_uid == transform.stem_uid assert transform_r3.version_tag == "2" assert transform_r3.version == "2" # default description transform_r3 = ln.Transform(revises=transform_r2) assert transform_r3.description == transform_r2.description # revise by matching on `key` key = "my-notebook.ipynb" transform_r2.key = key transform_r2.save() assert transform_r2.is_latest transform_r3 = ln.Transform(description="My transform", key=key, version="2") assert transform_r3.uid[:-4] == transform_r2.uid[:-4] assert transform_r3.uid.endswith("0001") # this only fires if source code was actually saved transform_r2.source_code = "something" transform_r2.save() transform_r3 = ln.Transform(description="My transform", key=key, version="2") assert transform_r3.uid[:-4] == transform_r2.uid[:-4] assert 
transform_r3.uid.endswith("0002") assert transform_r3.stem_uid == transform_r2.stem_uid assert transform_r3.key == key assert transform_r3.version_tag == "2" assert transform_r3.version == "2" assert transform_r3.is_latest # because the new transform isn't yet saved, the old transform still has # is_latest = True assert transform_r2.is_latest assert transform_r3._revises is not None transform_r3.save() # now r2 is no longer the latest version, but need to re-fresh from db transform_r2 = ln.Transform.get(transform_r2.uid) assert not transform_r2.is_latest # wrong transform type with pytest.raises(TypeError) as error: ln.Transform(revises=ln.Record(name="x")) assert error.exconly().startswith( "TypeError: `revises` has to be of type `Transform`" ) # wrong kwargs with pytest.raises(ValueError) as error: ln.Transform(x=1) assert ( error.exconly() == "ValueError: Only key, description, version_tag, type, revises," " reference, reference_type can be passed, but you passed: {'x': 1}" ) # test that reference transform cannot be deleted transform_r2.delete() transform.delete() # unversioned transform transform = ln.Transform(key="My transform") assert transform.version_tag is None assert transform.version == transform.uid[-4:] # version falls back to uid suffix # what happens if we don't save the old transform? # add a test for it! transform.save() # create new transform from old transform new_transform = ln.Transform(description="My new transform", revises=transform) assert transform.version_tag is None assert transform.version == transform.uid[-4:] # version falls back to uid suffix assert new_transform.stem_uid == transform.stem_uid assert new_transform.uid.endswith("0001") assert new_transform.version_tag is None assert ( new_transform.version == new_transform.uid[-4:] ) # version falls back to uid suffix transform.delete(permanent=True) def test_delete(): # prepare the creation of a transform with its artifacts transform = ln.Transform(key="My transform").save() run = ln.Run(transform) report_path = Path("report.html") with open(report_path, "w") as f: f.write("a") environment_path = Path("environment.txt") with open(environment_path, "w") as f: f.write("c") report = ln.Artifact(report_path, description=f"Report of {run.uid}").save() report_path.unlink() report_path = report.path environment = ln.Artifact(environment_path, description="requirements.txt").save() environment_path.unlink() environment_path = environment.path transform.save() run.report = report run.environment = environment run.save() assert report_path.exists() assert environment_path.exists() # now delete everything (run artifacts are cleaned up in background subprocess) transform.delete(permanent=True) assert len(ln.Run.filter(id=run.id)) == 0 # Clean up orphan report/env artifacts if subprocess has not run yet for art in [report, environment]: a = ln.Artifact.filter(id=art.id).first() if a is not None: a.delete(permanent=True, storage=True) assert not report_path.exists() assert not environment_path.exists() assert len(ln.Artifact.filter(id__in=[report.id, environment.id])) == 0 # see test_composite_component in test_schema.py def test_successor_predecessor(): predecessor = ln.Transform(key="predecessor").save() successor1 = ln.Transform(key="successor1").save() successor2 = ln.Transform(key="successor2").save() predecessor.successors.add( successor1, successor2, through_defaults={"config": {"param": 42}} ) assert len(predecessor.successors.all()) == 2 assert predecessor.links_successor.count() == 2 assert 
predecessor.links_successor.first().config == {"param": 42} assert predecessor.links_successor.first().predecessor == predecessor assert predecessor.predecessors.count() == 0 assert predecessor.links_predecessor.count() == 0 ln.models.transform.TransformTransform.filter(predecessor=predecessor).delete( permanent=True ) link = ln.models.transform.TransformTransform( predecessor=predecessor, successor=successor1, config={"param": 42} ).save() assert link in predecessor.links_successor.all() assert link in successor1.links_predecessor.all() assert link.config == {"param": 42} predecessor.delete(permanent=True) successor1.delete(permanent=True) successor2.delete(permanent=True) assert ln.models.transform.TransformTransform.filter().count() == 0 def test_bulk_transform_permanent_delete(tmp_path): """Bulk Transform permanent delete deletes TransformProject, runs (and artifacts), then transforms.""" transform = ln.Transform(key="Bulk transform delete").save() runs = [ln.Run(transform).save() for _ in range(2)] report_files = [tmp_path / f"bulk_report_{i}.txt" for i in range(2)] for f in report_files: f.write_text("report content") report_artifacts = [ ln.Artifact(str(f), description=f"report {i}").save() for i, f in enumerate(report_files) ] for run, art in zip(runs, report_artifacts): run.report = art run.save() transform_id = transform.id run_ids = [r.id for r in runs] artifact_ids = [r.report_id for r in runs] with patch("lamindb.models.run.subprocess.Popen") as mock_popen: ln.Transform.filter(id=transform_id).delete(permanent=True) mock_popen.assert_called_once() args = mock_popen.call_args[0][0] ids_str = args[args.index("--ids") + 1] assert {int(x) for x in ids_str.split(",")} == set(artifact_ids) assert ln.Transform.filter(id=transform_id).count() == 0 for rid in run_ids: assert ln.Run.filter(id=rid).count() == 0 # With mock, cleanup subprocess did not run; clean up orphan report artifacts for aid in artifact_ids: art = ln.Artifact.filter(id=aid).first() if art is not None: art.delete(permanent=True, storage=False) def test_single_transform_permanent_delete_delegates_to_queryset(tmp_path): """Single Transform permanent delete delegates to QuerySet and removes runs and artifacts.""" transform = ln.Transform(key="Single transform delete").save() run = ln.Run(transform).save() report_file = tmp_path / "single_report.txt" report_file.write_text("report") report = ln.Artifact(str(report_file), description="report").save() run.report = report run.save() transform_id = transform.id run_id = run.id artifact_id = report.id with patch("lamindb.models.run.subprocess.Popen") as mock_popen: transform.delete(permanent=True) mock_popen.assert_called_once() args = mock_popen.call_args[0][0] ids_str = args[args.index("--ids") + 1] assert artifact_id in {int(x) for x in ids_str.split(",")} assert ln.Transform.filter(id=transform_id).count() == 0 assert ln.Run.filter(id=run_id).count() == 0 # With mock, cleanup subprocess did not run; clean up orphan report artifact art = ln.Artifact.filter(id=artifact_id).first() if art is not None: art.delete(permanent=True, storage=False) def test_bulk_transform_soft_delete(): """Bulk Transform soft delete sets branch_id=-1.""" transform = ln.Transform(key="Bulk transform soft delete").save() ln.Run(transform).save() transform_id = transform.id ln.Transform.filter(id=transform_id).delete(permanent=False) t = ln.Transform.filter(id=transform_id).one() assert t.branch_id == -1 ln.Transform.filter(id=transform_id).delete(permanent=True) def 
test_bulk_transform_permanent_delete_promotes_previous_version(): """Bulk permanent delete of latest in a version family promotes the previous version.""" v1 = ln.Transform(key="Bulk permanent delete version family").save() v2 = ln.Transform(revises=v1, key="Bulk permanent delete version family").save() assert v2.is_latest stem_uid = v1.stem_uid ln.Transform.filter(id=v2.id).delete(permanent=True) assert ln.Transform.filter(id=v2.id).count() == 0 v1_after = ln.Transform.filter(uid__startswith=stem_uid).one() assert v1_after.pk == v1.pk assert v1_after.is_latest v1.delete(permanent=True) def test_bulk_transform_soft_delete_promotes_previous_version(): """Bulk soft delete of latest in a version family promotes the previous version.""" v1 = ln.Transform(key="Bulk soft delete version family").save() v2 = ln.Transform(revises=v1, key="Bulk soft delete version family").save() assert v2.is_latest v2_id = v2.id stem_uid = v1.stem_uid ln.Transform.filter(id=v2_id).delete(permanent=False) v2_after = ln.Transform.filter(id=v2_id).one() assert v2_after.branch_id == -1 assert not v2_after.is_latest v1.refresh_from_db() assert v1.is_latest assert ln.Transform.filter(uid__startswith=stem_uid).get(is_latest=True) == v1 # Clean up v2_after.delete(permanent=True) v1.delete(permanent=True) ================================================ FILE: tests/core/test_transform_from_git.py ================================================ import lamindb as ln import pytest TEST_URL = "https://github.com/openproblems-bio/task_batch_integration" def test_transform_from_git(): # test auto-inferred latest commit hash transform1 = ln.Transform.from_git(url=TEST_URL, path="main.nf") assert transform1.source_code.startswith(f"""\ repo: {TEST_URL} path: main.nf commit:""") assert transform1.key == "openproblems-bio/task_batch_integration/main.nf" assert transform1.version_tag is None assert transform1.description is None assert transform1.reference.startswith(f"{TEST_URL}/blob/") assert transform1.reference_type == "url" # test checking out specific version transform2 = ln.Transform.from_git(url=TEST_URL, path="main.nf", version="v2.0.0") assert transform2.source_code.startswith(f"""\ repo: {TEST_URL} path: main.nf commit:""") assert transform2.version_tag == "v2.0.0" assert transform2.description is None assert transform1.source_code != transform2.source_code assert transform1.reference != transform2.reference # test with description transform2_with_desc = ln.Transform.from_git( url=TEST_URL, path="main.nf", version="v2.0.0", description="Test description" ) assert transform2_with_desc.description == "Test description" assert transform2_with_desc.version_tag == "v2.0.0" # test sliding transform from branch transform3 = ln.Transform.from_git( url=TEST_URL, path="main.nf", version="main", branch="main" ) assert transform3.source_code.startswith(f"""\ repo: {TEST_URL} path: main.nf branch:""") assert transform3.description is None assert transform3.reference == f"{TEST_URL}/tree/main/main.nf" assert transform3.reference_type == "url" def test_transform_from_git_with_entrypoint(): # test auto-inferred latest commit hash transform1 = ln.Transform.from_git( url=TEST_URL, path="main.nf", entrypoint="myentrypoint" ) assert transform1.source_code.startswith(f"""\ repo: {TEST_URL} path: main.nf entrypoint: myentrypoint commit:""") assert transform1.description is None # test with entrypoint and description transform2 = ln.Transform.from_git( url=TEST_URL, path="main.nf", entrypoint="myentrypoint", description="Entrypoint description", 
) assert transform2.description == "Entrypoint description" def test_transform_custom_key_and_hash_lookup(): # test auto-inferred latest commit hash transform1 = ln.Transform.from_git( url=TEST_URL, path="main.nf", key="mypipeline" ).save() assert transform1.key == "mypipeline" # trigger hash look up transform2 = ln.Transform.from_git(url=TEST_URL, path="main.nf", key="mypipeline2") assert transform1 == transform2 assert transform2.key == "mypipeline" # trigger hash look up transform2 = ln.Transform.from_git( url=TEST_URL, path="main.nf", key="mypipeline2", skip_hash_lookup=True ) assert transform1 != transform2 assert transform2.key == "mypipeline2" transform1.delete(permanent=True) def test_transform_from_git_failure_modes(): # invalid tag with pytest.raises(ValueError) as error: ln.Transform.from_git( url=TEST_URL, path="main.nf", version="invalid", ) assert error.exconly().startswith("ValueError: Failed to checkout version invalid") # invalid branch with pytest.raises(ValueError) as error: ln.Transform.from_git( url=TEST_URL, path="main.nf", branch="invalid", ) assert error.exconly().startswith("ValueError: Failed to checkout branch invalid") ================================================ FILE: tests/core/test_view.py ================================================ import lamindb as ln def test_view(): ln.view(modules="core") ln.view() ================================================ FILE: tests/curators/conftest.py ================================================ import shutil from time import perf_counter import lamindb_setup as ln_setup import pytest def pytest_sessionstart(): t_execute_start = perf_counter() ln_setup.init(storage="./test-curators-db", modules="bionty") total_time_elapsed = perf_counter() - t_execute_start print(f"time to setup the instance: {total_time_elapsed:.1f}s") def pytest_sessionfinish(session: pytest.Session): shutil.rmtree("./test-curators-db") ln_setup.delete("test-curators-db", force=True) @pytest.fixture def ccaplog(caplog): """Add caplog handler to our custom logger at session start.""" from lamin_utils._logger import logger logger.addHandler(caplog.handler) yield caplog logger.removeHandler(caplog.handler) ================================================ FILE: tests/curators/test_cellxgene_curation.py ================================================ from typing import Generator import bionty as bt import lamindb as ln import pytest @pytest.fixture def cellxgene_defaults() -> Generator: ln.examples.cellxgene.save_cellxgene_defaults() yield ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) ln.ULabel.filter(type__isnull=False).delete(permanent=True) for entity in [ bt.Disease, bt.Ethnicity, bt.DevelopmentalStage, bt.Phenotype, bt.CellType, ln.ULabel, ]: entity.filter().delete(permanent=True) def test_cellxgene_curation(cellxgene_defaults) -> None: """Tests validating a recent CELLxGENE dataset.""" ln.examples.cellxgene.save_cellxgene_defaults() cxg_schema = ln.examples.cellxgene.create_cellxgene_schema( field_types="ontology_id", organism="mouse", spatial_library_id="Thymus_Visium_Exp3A_V2S1_3wk_B6-WT", ) adata = ln.examples.datasets.anndata_visium_mouse_cellxgene() curator = ln.curators.AnnDataCurator(adata, cxg_schema) curator.validate() cxg_schema.delete(permanent=True) ================================================ FILE: tests/curators/test_curate_from_croissant.py ================================================ import shutil import lamindb as ln import pytest @pytest.mark.parametrize("filepath_prefix", [None, 
"test-curators-db/"]) def test_curate_artifact_from_croissant(filepath_prefix: str | None): croissant_path, dataset1_path = ln.examples.croissant.mini_immuno( n_files=1, filepath_prefix=filepath_prefix ) artifact1 = ln.integrations.curate_from_croissant(croissant_path) assert ( artifact1.description == "Mini immuno dataset - A few samples from the immunology dataset" ) assert artifact1.key == "mini_immuno.anndata.zarr" assert artifact1.version_tag == "1.0" assert ( artifact1._key_is_virtual if filepath_prefix is None else not artifact1._key_is_virtual ) license_label = artifact1.ulabels.get( name="https://creativecommons.org/licenses/by/4.0/" ) project_label = artifact1.projects.get(name="Mini Immuno Project") # now mutate the dataset and create a new version croissant_path, dataset1_path = ln.examples.croissant.mini_immuno( n_files=1, filepath_prefix=filepath_prefix, strip_version=True ) dummy_file_path = dataset1_path / "dummy_file.txt" dummy_file_path.write_text("dummy file") artifact2 = ln.integrations.curate_from_croissant(croissant_path) assert artifact2.description == artifact1.description assert artifact2.key == artifact1.key assert artifact2.version_tag is None assert artifact2.stem_uid == artifact1.stem_uid assert artifact2.uid != artifact1.uid assert ( artifact2._key_is_virtual if filepath_prefix is None else not artifact1._key_is_virtual ) license_label = artifact2.ulabels.get( name="https://creativecommons.org/licenses/by/4.0/" ) project_label = artifact2.projects.get(name="Mini Immuno Project") shutil.rmtree(dataset1_path) croissant_path.unlink() artifact1.delete(permanent=True, storage=True) # because of real storage key project_label.delete(permanent=True) license_label.delete(permanent=True) def test_curate_collection_from_croissant(): croissant_path, dataset1_path, dataset2_path = ln.examples.croissant.mini_immuno( n_files=2 ) collection = ln.integrations.curate_from_croissant(croissant_path) croissant_path.unlink() shutil.rmtree(dataset1_path) dataset2_path.unlink() artifact1 = collection.artifacts.get(key="mini_immuno.anndata.zarr") artifact2 = collection.artifacts.get(key="mini.csv") license_label = collection.ulabels.get( name="https://creativecommons.org/licenses/by/4.0/" ) project_label = collection.projects.get(name="Mini Immuno Project") collection.delete(permanent=True) artifact1.delete(permanent=True) artifact2.delete(permanent=True) project_label.delete(permanent=True) license_label.delete(permanent=True) ================================================ FILE: tests/curators/test_curators_examples.py ================================================ import sys from pathlib import Path docs_path = Path.cwd() / "docs" / "scripts" sys.path.append(str(docs_path)) import anndata as ad import bionty as bt import lamindb as ln import pandas as pd import pytest from lamindb.core import datasets from lamindb.errors import InvalidArgument, ValidationError @pytest.fixture(scope="module") def mini_immuno_schema(): # define labels perturbation = ln.ULabel(name="Perturbation", is_type=True).save() ln.ULabel(name="DMSO", type=perturbation).save() ln.ULabel(name="IFNG", type=perturbation).save() ln.ULabel(name="ulabel_but_not_perturbation").save() ln.ULabel.from_values(["sample1", "sample2", "sample3"], create=True).save() bt.CellType.from_source(name="B cell").save() bt.CellType.from_source(name="T cell").save() # in next iteration for attrs ln.Feature(name="temperature", dtype=float).save() # ln.Feature(name="experiment", dtype="cat[ULabel]").save() # 
ln.Feature(name="date_of_study", dtype="date").save() # ln.Feature(name="study_note", dtype="str").save() # define schema schema = ln.Schema( name="mini_immuno_obs_level_metadata_curator_tests", features=[ ln.Feature(name="perturbation", dtype=perturbation).save(), ln.Feature(name="sample_note", dtype=str).save(), ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(), ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(), ], index=ln.Feature(name="sample_label", dtype=ln.ULabel).save(), ).save() yield schema for af in ln.Artifact.filter(): af.delete(permanent=True) from lamindb.models import SchemaComponent SchemaComponent.filter().delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) bt.Gene.filter().delete(permanent=True) ln.ULabel.filter(type__isnull=False).delete(permanent=True) ln.ULabel.filter().delete(permanent=True) bt.CellType.filter().delete(permanent=True) @pytest.fixture(scope="module") def curator_params(): """Common curator parameters.""" return { "categoricals": { "perturbation": ln.ULabel.name, "cell_type_by_expert": bt.CellType.name, "cell_type_by_model": bt.CellType.name, }, "organism": "human", } @pytest.fixture(scope="module") def mudata_papalexi21_subset_schema(): # define labels perturbation = ln.ULabel(name="Perturbation", is_type=True).save() ln.ULabel(name="Perturbed", type=perturbation).save() ln.ULabel(name="NT", type=perturbation).save() replicate = ln.ULabel(name="Replicate", is_type=True).save() ln.ULabel(name="rep1", type=replicate).save() ln.ULabel(name="rep2", type=replicate).save() ln.ULabel(name="rep3", type=replicate).save() # define obs schema obs_schema = ln.Schema( name="mudata_papalexi21_subset_obs_schema", features=[ ln.Feature(name="perturbation", dtype=perturbation).save(), ln.Feature(name="replicate", dtype=replicate).save(), ], ).save() obs_schema_rna = ln.Schema( name="mudata_papalexi21_subset_rna_obs_schema", features=[ ln.Feature(name="nCount_RNA", dtype=int).save(), ln.Feature(name="nFeature_RNA", dtype=int).save(), ln.Feature(name="percent.mito", dtype=float).save(), ], coerce=True, ).save() obs_schema_hto = ln.Schema( name="mudata_papalexi21_subset_hto_obs_schema", features=[ ln.Feature(name="nCount_HTO", dtype=int).save(), ln.Feature(name="nFeature_HTO", dtype=int).save(), ln.Feature(name="technique", dtype=bt.ExperimentalFactor).save(), ], coerce=True, ).save() var_schema_rna = ln.Schema( name="mudata_papalexi21_subset_rna_var_schema", itype=bt.Gene.symbol, dtype=float, ).save() # define composite schema mudata_schema = ln.Schema( name="mudata_papalexi21_subset_mudata_schema", otype="MuData", slots={ "obs": obs_schema, "rna:obs": obs_schema_rna, "hto:obs": obs_schema_hto, "rna:var": var_schema_rna, }, ).save() yield mudata_schema for af in ln.Artifact.filter(): af.delete(permanent=True) ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) bt.models.SchemaGene.filter().delete() bt.Gene.filter().delete(permanent=True) ln.ULabel.filter(type__isnull=False).delete(permanent=True) ln.ULabel.filter().delete(permanent=True) bt.ExperimentalFactor.filter().delete(permanent=True) @pytest.fixture(scope="module") def study_metadata_schema(): from define_schema_df_metadata import study_metadata_schema yield study_metadata_schema study_metadata_schema.delete(permanent=True) ln.Feature.filter().delete(permanent=True) @pytest.fixture(scope="module") def anndata_uns_schema(): from define_schema_anndata_uns import anndata_uns_schema yield 
anndata_uns_schema ln.Schema.filter().delete(permanent=True) ln.Feature.filter().delete(permanent=True) @pytest.fixture(scope="module") def spatialdata_blobs_schema(): from define_schema_spatialdata import sdata_schema yield sdata_schema for af in ln.Artifact.filter(): af.delete(permanent=True) from lamindb.models import SchemaComponent SchemaComponent.filter().delete(permanent=True) ln.Schema.filter().delete(permanent=True) bt.models.SchemaGene.filter().delete() bt.Gene.filter().delete(permanent=True) ln.ULabel.filter(type__isnull=False).delete(permanent=True) ln.ULabel.filter().delete(permanent=True) bt.ExperimentalFactor.filter().delete(permanent=True) bt.DevelopmentalStage.filter().delete(permanent=True) bt.Disease.filter().delete(permanent=True) def test_dataframe_curator(mini_immuno_schema: ln.Schema): """Test DataFrame curator implementation.""" # Get the perturbation ULabel (created in mini_immuno_schema fixture) perturbation = ln.ULabel.get(name="Perturbation", is_type=True) # invalid simple dtype (float) feature_to_fail = ln.Feature(name="treatment_time_h", dtype=float).save() schema = ln.Schema( name="mini_immuno_obs_level_metadata_v2", features=[ ln.Feature(name="perturbation", dtype=perturbation).save(), ln.Feature(name="sample_note", dtype=str).save(), ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(), ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(), feature_to_fail, ], ).save() df = datasets.mini_immuno.get_dataset1(otype="DataFrame") curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert ( "Column 'treatment_time_h' failed series or dataframe validator 0: " in error.exconly() ) schema.delete(permanent=True) feature_to_fail.delete(permanent=True) # Wrong subtype df = datasets.mini_immuno.get_dataset1(otype="DataFrame", with_wrong_subtype=True) curator = ln.curators.DataFrameCurator(df, mini_immuno_schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert ( error.exconly() == """lamindb.errors.ValidationError: 1 term not validated in feature 'perturbation': 'ulabel_but_not_perturbation' → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('perturbation') → a valid label for subtype 'Perturbation' has to be one of ['DMSO', 'IFNG']""" ) # Typo df = datasets.mini_immuno.get_dataset1(otype="DataFrame", with_typo=True) curator = ln.curators.DataFrameCurator(df, mini_immuno_schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert ( error.exconly() == """lamindb.errors.ValidationError: 1 term not validated in feature 'perturbation': 'IFNJ' → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('perturbation') → a valid label for subtype 'Perturbation' has to be one of ['DMSO', 'IFNG']""" ) df = datasets.mini_immuno.get_dataset1(otype="DataFrame") curator = ln.curators.DataFrameCurator(df, mini_immuno_schema) artifact = curator.save_artifact(key="examples/dataset1.parquet") assert artifact.schema == mini_immuno_schema assert artifact.features.slots["columns"].n_members == 5 assert ( artifact.features.describe(return_str=True) == """\ Artifact: examples/dataset1.parquet (0000) └── Dataset features └── columns (5) cell_type_by_expe… bionty.CellType B cell, CD8-positive, alph… cell_type_by_model bionty.CellType B cell, T cell perturbation ULabel[Perturbation] DMSO, IFNG sample_label ULabel sample1, sample2, sample3 sample_note str""" ) assert 
set(artifact.features.get_values()["sample_label"]) == { "sample1", "sample2", "sample3", } assert set(artifact.features.get_values()["cell_type_by_expert"]) == { "CD8-positive, alpha-beta T cell", "B cell", } assert set(artifact.features.get_values()["cell_type_by_model"]) == { "T cell", "B cell", } # a second dataset with missing values ln.ULabel.from_values(["sample4", "sample5", "sample6"], create=True).save() df = ln.examples.datasets.mini_immuno.get_dataset2( otype="DataFrame", gene_symbols_in_index=True ) curator = ln.curators.DataFrameCurator(df, mini_immuno_schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert "column 'sample_note' not in dataframe" in error.exconly() assert "column 'cell_type_by_expert' not in dataframe" in error.exconly() curator.standardize() curator.validate() artifact.delete(permanent=True) def test_dataframe_curator_index(): """Test validating a DataFrame index.""" df = datasets.mini_immuno.get_dataset1( otype="DataFrame", with_index_type_mismatch=True ) feature = ln.Feature(name="test", dtype="str").save() schema = ln.Schema(index=feature).save() curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert "expected series 'None' to have type str" in error.exconly() schema.delete(permanent=True) feature.delete(permanent=True) def test_dataframe_curator_validate_all_annotate_cat(mini_immuno_schema): """Do not pass any features.""" schema = ln.Schema(itype=ln.Feature).save() assert schema.flexible df = datasets.mini_immuno.get_dataset1(otype="DataFrame") artifact = ln.Artifact.from_dataframe( df, key="examples/dataset1.parquet", schema=schema ).save() assert set(artifact.features.get_values()["perturbation"]) == { "DMSO", "IFNG", } assert set(artifact.features.get_values()["cell_type_by_expert"]) == { "CD8-positive, alpha-beta T cell", "B cell", } assert set(artifact.features.get_values()["cell_type_by_model"]) == { "T cell", "B cell", } artifact.delete(permanent=True) schema.delete(permanent=True) def test_same_name_different_type(): """The same feature names are allowed as long as they have different feature types.""" type_a = ln.Feature( name="TypeA", is_type=True, description="Type A features" ).save() type_b = ln.Feature( name="TypeB", is_type=True, description="Type B features" ).save() assay_a = ln.Feature(name="assay name", type=type_a, dtype=str).save() assay_b = ln.Feature(name="assay name", type=type_b, dtype=str).save() schema = ln.Schema( name="schema_a", features=[ln.Feature.get(name="assay name", type=type_a)], flexible=True, otype="DataFrame", ).save() df = pd.DataFrame({"assay name": ["exp1", "exp2"]}) artifact = ln.Artifact.from_dataframe(df, description="testdata").save() curator = ln.curators.DataFrameCurator(artifact, schema) curator.save_artifact() artifact.delete(permanent=True) ln.Schema.filter(features__name="assay name").delete(permanent=True) schema.delete(permanent=True) for feat in [assay_a, assay_b, type_a, type_b]: feat.delete(permanent=True) def test_dataframe_curator_validate_all_annotate_cat2(mini_immuno_schema): """Combine half-specifying features, half not.""" schema = ln.Schema( itype=ln.Feature, features=[ln.Feature.get(name="perturbation")], flexible=True, ).save() assert schema.flexible df = datasets.mini_immuno.get_dataset1(otype="DataFrame") curator = ln.curators.DataFrameCurator(df, schema) artifact = curator.save_artifact(key="examples/dataset1.parquet") assert 
set(artifact.features.get_values()["perturbation"]) == { "DMSO", "IFNG", } assert set(artifact.features.get_values()["cell_type_by_expert"]) == { "CD8-positive, alpha-beta T cell", "B cell", } assert set(artifact.features.get_values()["cell_type_by_model"]) == { "T cell", "B cell", } artifact.delete(permanent=True) schema.delete(permanent=True) @pytest.mark.parametrize("include_attrs_slot", [True, False]) def test_dataframe_attrs_validation(study_metadata_schema, include_attrs_slot): df = datasets.mini_immuno.get_dataset1(otype="DataFrame") perturbation = ln.ULabel(name="Perturbation", is_type=True).save() perturbation_feature = ln.Feature(name="perturbation", dtype=perturbation).save() ln.ULabel(name="DMSO", type=perturbation).save() ln.ULabel(name="IFNG", type=perturbation).save() if include_attrs_slot: schema = ln.Schema( features=[perturbation_feature], slots={"attrs": study_metadata_schema}, otype="DataFrame", ).save() else: schema = ln.Schema( features=[perturbation_feature], otype="DataFrame", ).save() bad_schema = ln.Schema( features=[perturbation_feature], slots={"doesnotexist": schema}, otype="DataFrame", ).save() with pytest.raises(ValueError) as e: curator = ln.curators.DataFrameCurator(df, schema=bad_schema) assert ( "Slot 'doesnotexist' is not supported for DataFrameCurator. Must be 'attrs'." in str(e.value) ) curator = ln.curators.DataFrameCurator(df, schema=schema) if include_attrs_slot: assert curator.slots["attrs"].__class__.__name__ == "ComponentCurator" else: assert not curator.slots curator.validate() artifact = curator.save_artifact(key="examples/df_with_attrs.parquet") assert artifact.schema == schema if include_attrs_slot: assert "attrs" in artifact.features.slots assert artifact.features.slots["attrs"].features.first() == ln.Feature.get( name="temperature" ) assert artifact.features.slots["attrs"].features.last() == ln.Feature.get( name="experiment" ) else: assert ( not hasattr(artifact.features, "slots") or "attrs" not in artifact.features.slots ) from lamindb.models import SchemaComponent SchemaComponent.filter().delete(permanent=True) artifact.delete(permanent=True) bad_schema.delete(permanent=True) schema.delete(permanent=True) def test_schema_new_genes(ccaplog): df = pd.DataFrame( index=pd.Index( [ "ENSG00000139618", # BRCA2 "ENSG00000141510", # TP53 "ENSG00999000001", # Invalid ID "ENSG00999000002", # Invalid ID ], name="ensembl", ) ) feature = ln.Feature(name="ensembl", dtype=bt.Gene.ensembl_gene_id).save() schema = ln.Schema(index=feature).save() curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert error.exconly().startswith( "lamindb.errors.ValidationError: 2 terms not validated in feature 'index': 'ENSG00999000001', 'ENSG00999000002'" ) assert ( "2 terms not validated in feature 'index': 'ENSG00999000001', 'ENSG00999000002'" in ccaplog.text ) schema.delete(permanent=True) feature.delete(permanent=True) def test_schema_no_match_ensembl(): df = pd.DataFrame( index=pd.Index( [ "ENSG99999999998", # Invalid ID "ENSG99999999999", # Invalid ID ], name="ensembl", ) ) schema = ln.Schema( index=ln.Feature(name="ensembl", dtype=bt.Gene.ensembl_gene_id).save() ).save() curator = ln.curators.DataFrameCurator(df, schema) with pytest.raises(ln.errors.ValidationError) as error: curator.validate() assert ( error.exconly() == """lamindb.errors.ValidationError: 2 terms not validated in feature 'index': 'ENSG99999999998', 'ENSG99999999999' → fix typos, remove non-existent values, or save terms 
via: curator.cat.add_new_from('index')"""
    )
    schema.delete(permanent=True)


def test_schema_mixed_ensembl_symbols(ccaplog):
    """Quite some datasets have mixed Ensembl gene IDs and gene symbols.

    Because LaminDB currently does not support validating values against a
    union of fields, the expected behavior is that an error is raised when
    such a dataset is encountered: the symbol entries fail validation against
    the Ensembl gene ID field, as asserted below.
    """
    df = pd.DataFrame(
        index=pd.Index(
            [
                "ENSG00000139618",
                "ENSG00000141510",
                "BRCA2",  # symbol
                "TP53",  # symbol
            ],
            name="ensembl",
        )
    )
    schema = ln.Schema(
        index=ln.Feature(name="ensembl", dtype=bt.Gene.ensembl_gene_id).save()
    ).save()
    curator = ln.curators.DataFrameCurator(df, schema)
    with pytest.raises(ln.errors.ValidationError) as error:
        curator.validate()
    assert error.exconly().startswith(
        "lamindb.errors.ValidationError: 2 terms not validated in feature 'index': 'BRCA2', 'TP53'"
    )
    assert "2 terms not validated in feature 'index': 'BRCA2', 'TP53'" in ccaplog.text
    schema.delete(permanent=True)


def test_schema_mixed_features(ccaplog):
    """Test that union dtype features validate against multiple registries."""
    mixed_feature = ln.Feature(
        name="mixed_feature",
        dtype="cat[bionty.Tissue.ontology_id|bionty.CellType.ontology_id]",
    ).save()
    df_mixed = pd.DataFrame({"mixed_feature": ["UBERON:0000178", "CL:0000540"]})
    mixed_schema = ln.Schema(features=[mixed_feature], coerce=True).save()
    mixed_curator = ln.curators.DataFrameCurator(df_mixed, mixed_schema)
    mixed_curator.validate()
    assert mixed_curator._is_validated
    assert bt.CellType.filter(ontology_id="CL:0000540").exists()
    assert bt.Tissue.filter(ontology_id="UBERON:0000178").exists()
    df_invalid = pd.DataFrame({"mixed_feature": ["INVALID:0000000"]})
    invalid_curator = ln.curators.DataFrameCurator(df_invalid, mixed_schema)
    with pytest.raises(ln.errors.ValidationError):
        invalid_curator.validate()
    mixed_schema.delete(permanent=True)
    mixed_feature.delete(permanent=True)


def test_anndata_curator_different_components(mini_immuno_schema: ln.Schema):
    obs_schema = mini_immuno_schema
    for add_comp in ["var.T", "obs", "uns"]:
        var_schema = ln.Schema(
            name="scRNA_seq_var_schema",
            itype=bt.Gene.ensembl_gene_id,
            dtype="num",
        ).save()
        # always assume var
        components = {"var.T": var_schema}
        if add_comp == "obs":
            components["obs"] = obs_schema
        if add_comp == "uns":
            uns_schema = ln.Schema(
                name="flexible_uns_schema",
                itype=ln.Feature,
            ).save()
            components["uns"] = uns_schema
        anndata_schema = ln.Schema(
            name="mini_immuno_anndata_schema",
            otype="AnnData",
            slots=components,
        ).save()
        assert mini_immuno_schema.id is not None, mini_immuno_schema
        assert anndata_schema.slots["var.T"] == var_schema
        if add_comp == "obs":
            assert anndata_schema.slots["obs"] == obs_schema
        if add_comp == "uns":
            assert anndata_schema.slots["uns"] == uns_schema
        describe_output = anndata_schema.describe(return_str=True)
        assert "mini_immuno_anndata_schema" in describe_output
        assert "scRNA_seq_var_schema" in describe_output
        if add_comp == "obs":
            assert "mini_immuno_anndata_schema" in describe_output
        if add_comp == "uns":
            assert "flexible_uns_schema" in describe_output
        adata = datasets.mini_immuno.get_dataset1(otype="AnnData")
        curator = ln.curators.AnnDataCurator(adata, anndata_schema)
        assert curator.slots["var.T"].__class__.__name__ == "ComponentCurator"
        if add_comp == "obs":
            assert curator.slots["obs"].__class__.__name__ == "ComponentCurator"
        if add_comp == "uns":
            assert curator.slots["uns"].__class__.__name__ == "ComponentCurator"
        artifact = ln.Artifact.from_anndata(
            adata,
key="examples/dataset1.h5ad", schema=anndata_schema ) assert artifact._curator._is_validated # important test, do not remove artifact.save() assert not hasattr(artifact, "_curator") # test that curator is deleted assert artifact.schema == anndata_schema assert artifact.features.slots["var.T"].n_members == 3 # 3 genes get linked if add_comp == "obs": assert artifact.features.slots["obs"] == obs_schema assert set(artifact.features.get_values()["cell_type_by_expert"]) == { "CD8-positive, alpha-beta T cell", "B cell", } assert set(artifact.features.get_values()["cell_type_by_model"]) == { "T cell", "B cell", } if add_comp == "uns": assert artifact.features.slots["uns"].features.first() == ln.Feature.get( name="temperature" ) artifact.delete(permanent=True) anndata_schema.delete(permanent=True) var_schema.delete(permanent=True) def test_anndata_curator_varT_curation(): ln.Schema.filter(itype="bionty.Gene.ensembl_gene_id").delete() varT_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id, maximal_set=True).save() slot = "var.T" components = {slot: varT_schema} anndata_schema = ln.Schema( otype="AnnData", slots=components, ).save() for with_gene_typo in [True, False]: adata = datasets.mini_immuno.get_dataset1( otype="AnnData", with_gene_typo=with_gene_typo ) if with_gene_typo: with pytest.raises(ValidationError) as error: artifact = ln.Artifact.from_anndata( adata, key="examples/dataset1.h5ad", schema=anndata_schema ).save() assert error.exconly() == ( f"lamindb.errors.ValidationError: 1 term not validated in feature 'columns' in slot '{slot}': 'GeneTypo'\n" f" → fix typos, remove non-existent values, or save terms via: curator.slots['{slot}'].cat.add_new_from('columns')" ) else: for n_max_records in [2, 4]: ln.settings.annotation.n_max_records = n_max_records artifact = ln.Artifact.from_anndata( adata, key="examples/dataset1.h5ad", schema=anndata_schema ).save() assert ( artifact.features.slots[slot].n_members == 3 ) # 3 genes get linked assert ( artifact.features.slots[slot].itype == "bionty.Gene.ensembl_gene_id" ) if n_max_records == 2: assert not artifact.features.slots[slot].members.exists() else: assert set( artifact.features.slots[slot] .members.to_dataframe()["ensembl_gene_id"] .tolist() ) == { "ENSG00000153563", "ENSG00000010610", "ENSG00000170458", } artifact.delete(permanent=True) anndata_schema.delete(permanent=True) varT_schema.delete(permanent=True) def test_anndata_curator_varT_curation_legacy(ccaplog): varT_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id, maximal_set=True).save() slot = "var" components = {slot: varT_schema} anndata_schema = ln.Schema( otype="AnnData", slots=components, ).save() for with_gene_typo in [True, False]: adata = datasets.mini_immuno.get_dataset1( otype="AnnData", with_gene_typo=with_gene_typo ) if with_gene_typo: with pytest.raises(ValidationError) as error: artifact = ln.Artifact.from_anndata( adata, key="examples/dataset1.h5ad", schema=anndata_schema ).save() assert error.exconly() == ( f"lamindb.errors.ValidationError: 1 term not validated in feature 'var_index' in slot '{slot}': 'GeneTypo'\n" f" → fix typos, remove non-existent values, or save terms via: curator.slots['{slot}'].cat.add_new_from('var_index')" ) else: artifact = ln.Artifact.from_anndata( adata, key="examples/dataset1.h5ad", schema=anndata_schema ).save() assert ( "auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}" in ccaplog.text ) assert 
artifact.features.slots[slot].n_members == 3 # 3 genes get linked assert set( artifact.features.slots[slot].members.to_dataframe()["ensembl_gene_id"] ) == { "ENSG00000153563", "ENSG00000010610", "ENSG00000170458", } artifact.delete(permanent=True) anndata_schema.delete(permanent=True) varT_schema.delete(permanent=True) def test_anndata_curator_nested_uns(study_metadata_schema, anndata_uns_schema): """Test AnnDataCurator with nested uns slot validation.""" adata = datasets.mini_immuno.get_dataset1(otype="AnnData") adata.uns["study_metadata"] = adata.uns.copy() curator = ln.curators.AnnDataCurator(adata, anndata_uns_schema) assert curator.slots["uns:study_metadata"].__class__.__name__ == "ComponentCurator" curator.validate() artifact = curator.save_artifact(key="examples/anndata_with_uns.h5ad") assert artifact.schema == anndata_uns_schema assert "uns:study_metadata" in artifact.features.slots assert artifact.features.slots[ "uns:study_metadata" ].features.first() == ln.Feature.get(name="temperature") adata = datasets.mini_immuno.get_dataset1(otype="AnnData") bad_schema1 = ln.Schema( otype="AnnData", slots={"uns:nonexistent": study_metadata_schema}, ).save() with pytest.raises(InvalidArgument) as e: ln.curators.AnnDataCurator(adata, bad_schema1) assert ( "Schema slot 'uns:nonexistent' requires keys uns['nonexistent'] but key 'nonexistent' not found." in str(e.value) ) with pytest.raises(InvalidArgument) as e: bad_schema2 = ln.Schema( otype="AnnData", slots={"uns:temperature:nonexistent_nested": study_metadata_schema}, ).save() ln.curators.AnnDataCurator(adata, bad_schema2) assert ( "Schema slot 'uns:temperature:nonexistent_nested' requires keys uns['temperature']['nonexistent_nested'] but key 'nonexistent_nested' not found. Available keys at this level: none (not a dict)." 
in str(e.value) ) inferred_sets = artifact.schemas.all() for inferred_set in inferred_sets: artifact.schemas.remove(inferred_set) artifact.delete(permanent=True) bad_schema1.delete(permanent=True) bad_schema2.delete(permanent=True) anndata_uns_schema.delete(permanent=True) def test_anndata_curator_no_var(mini_immuno_schema: ln.Schema): assert mini_immuno_schema.id is not None, mini_immuno_schema # test no var schema anndata_schema_no_var = ln.Schema( name="mini_immuno_anndata_schema_no_var", otype="AnnData", slots={"obs": mini_immuno_schema}, ).save() assert mini_immuno_schema.id is not None, mini_immuno_schema adata = datasets.mini_immuno.get_dataset1(otype="AnnData") curator = ln.curators.AnnDataCurator(adata, anndata_schema_no_var) artifact = curator.save_artifact(key="examples/dataset1_no_var.h5ad") artifact.delete(permanent=True) anndata_schema_no_var.delete(permanent=True) def test_mudata_curator( mudata_papalexi21_subset_schema: ln.Schema, mini_immuno_schema: ln.Schema ): mudata_schema = mudata_papalexi21_subset_schema mdata = ln.examples.datasets.mudata_papalexi21_subset() # wrong dataset with pytest.raises(InvalidArgument): ln.curators.MuDataCurator(pd.DataFrame(), mudata_schema) # wrong schema with pytest.raises(InvalidArgument): ln.curators.MuDataCurator(mdata, mini_immuno_schema) try: # TODO: allow set cat_filters for a Schema with itype bt.settings.organism = "human" curator = ln.curators.MuDataCurator(mdata, mudata_schema) assert curator.slots.keys() == { "obs", "rna:obs", "hto:obs", "rna:var", } curator.validate() curator.slots["rna:var"].cat.standardize("columns") curator.slots["rna:var"].cat.add_new_from("columns") artifact = curator.save_artifact(key="mudata_papalexi21_subset.h5mu") assert artifact.schema == mudata_schema assert set(artifact.features.slots.keys()) == { "obs", "rna:var", "rna:obs", "hto:obs", } artifact.delete(permanent=True) mudata_schema.delete(permanent=True) mini_immuno_schema.delete(permanent=True) Path("papalexi21_subset.h5mu").unlink(missing_ok=True) finally: bt.settings.organism = None def test_mudata_curator_nested_uns(study_metadata_schema): """Test MuData with nested uns slot validation. This test verifies the behavior of both the MuData `.uns` slots and a `.uns` slot of an AnnData object inside the MuData object that gets specified using the key `:` syntax. 
""" mdata = ln.examples.datasets.mudata_papalexi21_subset(with_uns=True) site_uns_schema = ln.Schema( features=[ ln.Feature(name="pos", dtype=float).save(), ln.Feature(name="site_id", dtype=str).save(), ] ).save() mdata_schema = ln.Schema( otype="MuData", slots={ "uns:study_metadata": study_metadata_schema, "rna:uns:site_metadata": site_uns_schema, }, ).save() curator = ln.curators.MuDataCurator(mdata, mdata_schema) assert curator.slots["uns:study_metadata"].__class__.__name__ == "ComponentCurator" assert ( curator.slots["rna:uns:site_metadata"].__class__.__name__ == "ComponentCurator" ) curator.validate() artifact = curator.save_artifact(key="examples/mdata_with_uns.h5mu") assert artifact.schema == mdata_schema assert "uns:study_metadata" in artifact.features.slots assert "rna:uns:site_metadata" in artifact.features.slots assert artifact.features.slots[ "uns:study_metadata" ].features.first() == ln.Feature.get(name="temperature") assert artifact.features.slots[ "rna:uns:site_metadata" ].features.first() == ln.Feature.get(name="pos") # Clean up artifact.delete(permanent=True) Path("papalexi21_subset.h5mu").unlink(missing_ok=True) def test_spatialdata_curator( spatialdata_blobs_schema: ln.Schema, ): spatialdata = ln.examples.datasets.spatialdata_blobs() # wrong dataset with pytest.raises(InvalidArgument): ln.curators.SpatialDataCurator(pd.DataFrame(), spatialdata_blobs_schema) # wrong schema - use an actual slot that exists with pytest.raises(InvalidArgument): ln.curators.SpatialDataCurator( spatialdata, spatialdata_blobs_schema.slots["attrs:bio"] ) curator = ln.curators.SpatialDataCurator(spatialdata, spatialdata_blobs_schema) with pytest.raises(ln.errors.ValidationError): curator.validate() spatialdata.tables["table"].var.drop(index="ENSG00000999999", inplace=True) artifact = ln.Artifact.from_spatialdata( spatialdata, key="examples/spatialdata1.zarr", schema=spatialdata_blobs_schema, ).save() assert artifact.schema == spatialdata_blobs_schema assert artifact.features.slots.keys() == { "attrs:bio", "attrs:tech", "attrs", "tables:table:obs", "tables:table:var.T", } assert artifact.features.get_values()["disease"] == "Alzheimer disease" assert ( artifact.features.describe(return_str=True) == """Artifact: examples/spatialdata1.zarr (0000) └── Dataset features ├── attrs:bio (2) │ developmental_sta… bionty.DevelopmentalSt… adult stage │ disease bionty.Disease Alzheimer disease ├── attrs:tech (1) │ assay bionty.ExperimentalFac… Visium Spatial Gene Expres… ├── attrs (2) │ bio dict │ tech dict ├── tables:table:obs … │ sample_region str └── tables:table:var.… BRAF num BRCA2 num""" ) artifact.delete(permanent=True) def test_specific_source(): """Test validation of ontology terms using cat_filters to specify organism-specific source.""" obs_schema = ln.Schema( features=[ ln.Feature( name="developmental_stage_ontology_id", dtype=bt.DevelopmentalStage.ontology_id, cat_filters={ "source": bt.Source.filter( entity="bionty.DevelopmentalStage", organism="mouse" ).one() }, ).save() ], coerce=True, minimal_set=False, ).save() schema = ln.Schema( slots={"obs": obs_schema}, otype="AnnData", minimal_set=True, coerce=True ).save() adata = ad.AnnData( obs=pd.DataFrame( { "developmental_stage_ontology_id": [ "MmusDv:0000142", "MmusDv:0000022", ] } ), var=pd.DataFrame(index=["ENSMUSG00000022391", "ENSMUSG00000018569"]), ) curator = ln.curators.AnnDataCurator(adata, schema) curator.validate() schema.delete(permanent=True) ================================================ FILE: tests/curators/test_curators_remote.py 
================================================
import lamindb as ln


def test_curator_remote():
    lamindata_artifacts = ln.Artifact.connect("laminlabs/lamindata")
    curator = ln.curators.DataFrameCurator(
        lamindata_artifacts.get("Ywz5JiVNHOWSJDiK"),
        schema=ln.examples.schemas.valid_features(),
    )
    curator.validate()


================================================
FILE: tests/curators/test_dataframe_curation.py
================================================
"""Test suite for accounting on bank transactions."""

import datetime

import lamindb as ln
import pandas as pd
import pytest


@pytest.fixture(scope="module")
def transactions_schema():
    # Labels
    currency_type = ln.ULabel(name="Currency", is_type=True).save()
    usd = ln.ULabel(name="USD", type=currency_type).save()
    eur = ln.ULabel(name="EUR", type=currency_type).save()
    assert usd.type == currency_type
    assert eur.type == currency_type
    # Features
    currency = ln.Feature(name="currency_name", dtype="cat[ULabel[Currency]]").save()
    date = ln.Feature(name="date", dtype="date").save()
    receipt_url = ln.Feature(name="receipt_url", dtype="url").save()
    transaction_type = ln.Feature(name="Transaction", is_type=True).save()
    amount_usd = ln.Feature(
        name="transaction_amount_usd_cent", dtype=int, type=transaction_type
    ).save()
    amount_eur = ln.Feature(
        name="transaction_amount_eur_cent", dtype=int, type=transaction_type
    ).save()
    # Schema
    schema = ln.Schema(
        name="transaction_dataframe",
        otype="DataFrame",
        features=[
            date,
            amount_usd,
            amount_eur,
            currency,
            receipt_url,
        ],
        coerce=True,
    ).save()

    yield schema

    ln.Schema.filter(
        features__name__in=[
            "transaction_amount_eur_cent",
            "transaction_amount_usd_cent",
        ]
    ).delete(permanent=True)
    schema.delete(permanent=True)
    amount_eur.delete(permanent=True)
    amount_usd.delete(permanent=True)
    transaction_type.delete(permanent=True)
    date.delete(permanent=True)
    receipt_url.delete(permanent=True)
    currency.delete(permanent=True)
    eur.delete(permanent=True)
    usd.delete(permanent=True)
    currency_type.delete(permanent=True)


@pytest.fixture
def transactions_dataframe():
    # Create sample data
    data = {
        "date": [
            datetime.date(2024, 1, 1),
            datetime.date(2024, 1, 2),
            datetime.date(2024, 1, 3),
            datetime.date(2024, 1, 4),
            datetime.date(2024, 1, 5),
        ],
        "transaction_amount_usd_cent": [1000, 2000, 3000, 4000, 5000],
        "transaction_amount_eur_cent": [850, 1700, 2550, 3400, 4250],
        "currency_name": ["USD", "EUR", "USD", "EUR", "USD"],
        "receipt_url": [
            "https://bank.example/tx/1",
            "https://bank.example/tx/2",
            "https://bank.example/tx/3",
            "https://bank.example/tx/4",
            "https://bank.example/tx/5",
        ],
    }
    return pd.DataFrame(data)


def test_schema_creation(transactions_schema):
    """Test if schema was created properly"""
    schema = ln.Schema.get(name="transaction_dataframe")
    assert schema is not None
    assert schema.otype == "DataFrame"
    # check the order of the features
    assert schema.members.to_list("name") == [
        "date",
        "transaction_amount_usd_cent",
        "transaction_amount_eur_cent",
        "currency_name",
        "receipt_url",
    ]


def test_data_curation(
    transactions_schema: ln.Schema, transactions_dataframe: pd.DataFrame
):
    """Test if data curation works properly"""
    curator = ln.curators.DataFrameCurator(transactions_dataframe, transactions_schema)
    assert curator.validate() is None
    # URLs are currently validated as string values.
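    # A hedged illustration (commented out, not executed by this test) of what the
    # comment above implies: since the "url" dtype is currently validated as plain
    # strings, presumably any str value in `receipt_url` would pass, while non-str
    # values fail. `sketch_df` is a hypothetical variable, not part of the original test:
    #
    #   sketch_df = transactions_dataframe.copy()
    #   sketch_df["receipt_url"] = "not-a-real-url"  # a str, so it would still validate
    #   ln.curators.DataFrameCurator(sketch_df, transactions_schema).validate()
    #
    # The non-str case is exercised in test_invalid_url_dtype below.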
assert transactions_dataframe["receipt_url"].iloc[0] == "https://bank.example/tx/1" artifact = curator.save_artifact(key="test_transaction_dataset.csv") assert artifact.suffix == ".csv" artifact.delete(permanent=True) def test_missing_required_feature(transactions_schema: ln.Schema): """Test if validation fails for invalid data""" data_missing_required_feature = { "date": [datetime.date(2024, 1, 1)], "transaction_amount_usd_cent": [1000], "currency_name": ["USD"], "receipt_url": ["https://bank.example/tx/1"], } invalid_df = pd.DataFrame(data_missing_required_feature) schema = ln.Schema.get(name="transaction_dataframe") curator = ln.curators.DataFrameCurator(invalid_df, schema) with pytest.raises(ln.errors.ValidationError) as err: curator.validate() message = "column 'transaction_amount_eur_cent' not in dataframe. Columns in dataframe: ['date', 'transaction_amount_usd_cent', 'currency_name']" assert message in str(err) def test_invalid_label(transactions_schema: ln.Schema): """Test if validation fails for invalid currency""" # Create dataframe with invalid currency invalid_data = { "date": [datetime.date(2024, 1, 1)], "transaction_amount_usd_cent": [1000], "transaction_amount_eur_cent": [850], "currency_name": ["GBP"], # Invalid currency not in our labels "receipt_url": ["https://bank.example/tx/1"], } invalid_df = pd.DataFrame(invalid_data) schema = ln.Schema.get(name="transaction_dataframe") curator = ln.curators.DataFrameCurator(invalid_df, schema) with pytest.raises(ln.errors.ValidationError): curator.validate() # exconly = """lamindb.errors.ValidationError: 1 term is not validated: 'GBP' # → fix typos, remove non-existent values, or save terms via .add_new_from("currency_name")""" # assert err.exconly() == exconly def test_invalid_url_dtype(transactions_schema: ln.Schema): """Test if validation fails for non-string URL values.""" invalid_data = { "date": [datetime.date(2024, 1, 1)], "transaction_amount_usd_cent": [1000], "transaction_amount_eur_cent": [850], "currency_name": ["USD"], "receipt_url": [123], # URL is currently validated as string dtype } invalid_df = pd.DataFrame(invalid_data) schema = ln.Schema.get(name="transaction_dataframe") curator = ln.curators.DataFrameCurator(invalid_df, schema) with pytest.raises(ln.errors.ValidationError) as err: curator.validate() assert "receipt_url" in str(err.value) ================================================ FILE: tests/integrations/conftest.py ================================================ import shutil from time import perf_counter import lamindb_setup as ln_setup import pytest def pytest_sessionstart(): t_execute_start = perf_counter() ln_setup.init(storage="./testdb-integrations") total_time_elapsed = perf_counter() - t_execute_start print(f"time to setup the instance: {total_time_elapsed:.1f}s") def pytest_sessionfinish(session: pytest.Session): shutil.rmtree("./testdb-integrations") ln_setup.delete("testdb-integrations", force=True) @pytest.fixture def ccaplog(caplog): """Add caplog handler to our custom logger at session start.""" from lamin_utils._logger import logger logger.addHandler(caplog.handler) yield caplog logger.removeHandler(caplog.handler) ================================================ FILE: tests/integrations/test_lightning.py ================================================ import json import shutil from pathlib import Path from typing import Any, Generator, cast from unittest.mock import MagicMock import lamindb as ln import lightning as pl import pytest import torch from django.db import connection from 
django.test.utils import CaptureQueriesContext from lamindb.integrations import lightning as ll from lamindb.models._feature_manager import FeatureManager from torch import nn from torch.utils.data import DataLoader, TensorDataset @pytest.fixture(autouse=True) def cleanup_checkpoints() -> Generator[None, None, None]: """Clean up checkpoint files and directories after each test.""" yield checkpoints_dir = Path("checkpoints") if checkpoints_dir.exists(): shutil.rmtree(checkpoints_dir) @pytest.fixture(autouse=True, scope="session") def cleanup_test_dir() -> Generator[None, None, None]: """Clean up test directory after all tests.""" yield for dirname in ("lightning_checkpoints", "test_lightning", "lightning_logs"): dirpath = Path(dirname) if dirpath.exists(): shutil.rmtree(dirpath) @pytest.fixture def simple_model() -> pl.LightningModule: class SimpleModel(pl.LightningModule): def __init__(self): super().__init__() self.layer = nn.Linear(10, 1) def forward(self, x): return self.layer(x) def training_step(self, batch, batch_idx): x, y = batch loss = nn.functional.mse_loss(self(x), y) self.log("train_loss", loss) return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters()) return SimpleModel() @pytest.fixture def dataloader() -> DataLoader: return DataLoader( TensorDataset(torch.randn(100, 10), torch.randn(100, 1)), batch_size=10 ) @pytest.fixture def dirpath(request: pytest.FixtureRequest) -> Generator[str, None, None]: prefix = f"lightning_checkpoints/{request.node.name}/" yield prefix for af in ln.Artifact.filter(key__startswith=prefix): af.delete(permanent=True, storage=True) dirpath_path = Path(prefix) if dirpath_path.exists(): shutil.rmtree(dirpath_path) @pytest.fixture(scope="session") def lightning_features() -> Generator[None, None, None]: """Create lightning features.""" ll.save_lightning_features() yield if lightning_type := ln.Feature.filter(name="lamindb.lightning").one_or_none(): for feat in ln.Feature.filter(type=lightning_type): for af in ln.Artifact.filter(schemas__features=feat): af.delete(permanent=True, storage=True) # JSONValues are lingering and also need to be deleted ln.models.RunJsonValue.filter(jsonvalue__feature=feat).delete( permanent=True ) ln.models.JsonValue.filter(feature=feat).delete(permanent=True) feat.delete(permanent=True) def test_checkpoint_basic( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, ): """Checkpoint should create artifacts with semantic paths.""" callback = ll.Checkpoint(dirpath=dirpath, monitor="train_loss") trainer = pl.Trainer( max_epochs=2, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) assert len(artifacts) >= 1 for af in artifacts: assert af.kind == "model" assert af.key.startswith(prefix) def test_checkpoint_with_features( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, ): """Checkpoint should annotate artifacts with feature values.""" ln.Feature(name="train_loss", dtype=float).save() ln.Feature(name="custom_param", dtype=str).save() ln.track() callback = ll.Checkpoint( dirpath=dirpath, features={ "artifact": {"train_loss": None}, "run": {"custom_param": "test_value"}, }, monitor="train_loss", ) trainer = pl.Trainer( max_epochs=2, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) assert len(artifacts) >= 1 for af 
in artifacts: values = af.features.get_values() assert "train_loss" in values assert ln.context.run.features.get_values()["custom_param"] == "test_value" ln.finish() def test_checkpoint_missing_features( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, ): """Checkpoint should raise an error when specified features do not exist.""" callback = ll.Checkpoint( dirpath=dirpath, features={"artifact": {"nonexistent_feature": None}}, monitor="train_loss", ) trainer = pl.Trainer( max_epochs=1, callbacks=[callback], logger=False, ) with pytest.raises(ValueError, match="Feature nonexistent_feature missing"): trainer.fit(simple_model, dataloader) def test_checkpoint_auto_features( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Checkpoint should auto-track lightning features if they exist.""" callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=2, ) trainer = pl.Trainer( max_epochs=3, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) assert len(artifacts) >= 1 for af in artifacts: values = af.features.get_values() assert "is_best_model" in values assert "is_last_model" in values assert "score" in values assert "model_rank" in values def test_checkpoint_auto_features_with_duplicate_score_name( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Auto-tracking should work if a generic 'score' feature also exists.""" ln.Feature(name="score", dtype=float).save() callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=2, ) trainer = pl.Trainer( max_epochs=1, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) assert len(artifacts) >= 1 def test_checkpoint_best_model_with_duplicate_feature_names( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Clearing best-model flags should work when duplicate feature names exist. Regression test: when a Feature named 'is_best_model' exists both under the lamindb.lightning type and without a type (or under a different type), remove_values used to call Feature.get(name=...) which raised MultipleObjectsReturned. The fix uses type-scoped Feature lookups. 
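
    A minimal sketch of the idea (not necessarily the exact implementation),
    using registry calls that appear elsewhere in this suite::

        lightning_type = ln.Feature.filter(name="lamindb.lightning").one()
        is_best = ln.Feature.filter(name="is_best_model", type=lightning_type).one()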
""" # Create a duplicate 'is_best_model' feature without the lightning type ln.Feature(name="is_best_model", dtype=bool).save() callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=2, mode="min", ) trainer = pl.Trainer( max_epochs=3, callbacks=[callback], logger=False, ) # This would raise MultipleObjectsReturned before the fix trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) assert len(artifacts) >= 1 best_count = sum( 1 for af in artifacts if af.features.get_values().get("is_best_model") is True ) assert best_count == 1 last_count = sum( 1 for af in artifacts if af.features.get_values().get("is_last_model") is True ) assert last_count == 1 def test_checkpoint_query_budget_scales_sublinearly_with_hparams( dataloader: DataLoader, dirpath: str, lightning_features: None ): """DB queries should not scale linearly with hparam count.""" class ModelWithManyHparams(pl.LightningModule): def __init__(self, n_hparams: int): super().__init__() self.layer = nn.Linear(10, 1) self.save_hyperparameters({f"hp_{i}": i for i in range(n_hparams)}) def forward(self, x): return self.layer(x) def training_step(self, batch, batch_idx): x, y = batch loss = nn.functional.mse_loss(self(x), y) self.log("train_loss", loss) return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters()) def count_fit_queries(n_hparams: int) -> int: model = ModelWithManyHparams(n_hparams) callback = ll.Checkpoint( dirpath=f"{dirpath.rstrip('/')}/{n_hparams}/", monitor="train_loss" ) trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=False) with CaptureQueriesContext(connection) as ctx: trainer.fit(model, dataloader) return len(ctx.captured_queries) low_hparams_queries = count_fit_queries(2) high_hparams_queries = count_fit_queries(40) assert high_hparams_queries <= low_hparams_queries + 10 def test_model_rank_update_query_budget( dirpath: str, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, lightning_features: None, ): """Ranking should use batched feature reads.""" callback = ll.Checkpoint(dirpath=dirpath, monitor="train_loss", mode="min") # Provide a stub trainer so checkpoint_key_prefix can compute on-the-fly. # Only _original_dirpath matters for key derivation here. 
stub_trainer = MagicMock(spec=pl.Trainer) stub_trainer.loggers = [] callback._trainer = stub_trainer key_prefix = callback.checkpoint_key_prefix created_artifacts = [] for i in range(8): model_file = tmp_path / f"model_{i}.ckpt" model_file.write_bytes(f"checkpoint-{i}".encode()) artifact = ln.Artifact( model_file, key=f"{key_prefix}/model_{i}.ckpt", kind="model" ) artifact.save() artifact.features.add_values({"score": float(i), "model_rank": i}) created_artifacts.append(artifact) monkeypatch.setattr(FeatureManager, "remove_values", lambda *args, **kwargs: None) monkeypatch.setattr(FeatureManager, "add_values", lambda *args, **kwargs: None) with CaptureQueriesContext(connection) as ctx: callback._feature_annotator.update_model_ranks(key_prefix, mode="min") assert len(ctx.captured_queries) <= 6 for artifact in created_artifacts: artifact.delete(permanent=True, storage=True) def test_checkpoint_best_model_tracking( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Only one checkpoint should be marked as best model.""" callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=3, mode="min", ) trainer = pl.Trainer( max_epochs=3, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) best_count = sum( 1 for af in artifacts if af.features.get_values().get("is_best_model") is True ) assert best_count == 1 def test_checkpoint_model_rank( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Checkpoints should have correct model_rank (0 = best).""" callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=3, mode="min", ) trainer = pl.Trainer( max_epochs=3, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = ln.Artifact.filter(key__startswith=prefix) ranks = [af.features.get_values().get("model_rank") for af in artifacts] assert 0 in ranks # best model has rank 0 last_count = sum( 1 for af in artifacts if af.features.get_values().get("is_last_model") is True ) assert last_count == 1 def test_checkpoint_last_model_points_to_last_saved_artifact( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """The artifact flagged as last model should be the last saved checkpoint artifact.""" checkpoint = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=3, mode="min", ) trainer = pl.Trainer( max_epochs=3, callbacks=[checkpoint], logger=False, ) trainer.fit(simple_model, dataloader) prefix = checkpoint.checkpoint_key_prefix + "/" artifacts = list(ln.Artifact.filter(key__startswith=prefix)) last_artifacts = [ artifact for artifact in artifacts if artifact.features.get_values().get("is_last_model") is True ] assert len(last_artifacts) == 1 assert checkpoint.last_checkpoint_artifact is not None assert last_artifacts[0].id == checkpoint.last_checkpoint_artifact.id def test_checkpoint_semantic_paths( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Checkpoints should have semantic keys derived from dirpath.""" callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_top_k=3, ) trainer = pl.Trainer( max_epochs=3, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix + "/" artifacts = 
ln.Artifact.filter(key__startswith=prefix) assert len(artifacts) >= 1 for af in artifacts: assert af.key.startswith(prefix) values = af.features.get_values() assert "is_best_model" in values assert "score" in values def test_callback_deprecated( simple_model: pl.LightningModule, dataloader: DataLoader, tmp_path: Path, ): """Deprecated Callback should still work.""" key = f"test/legacy/{tmp_path.name}/model.ckpt" path = tmp_path / "model.ckpt" with pytest.warns(DeprecationWarning, match="use ll.Checkpoint instead"): callback = ll.Callback(path=path, key=key) trainer = pl.Trainer( max_epochs=1, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) artifacts = ln.Artifact.filter(key=key) assert len(artifacts) >= 1 assert artifacts[0].kind == "model" # cleanup for af in artifacts: af.delete(permanent=True, storage=True) def test_checkpoint_overwrites_existing_artifact( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, ): """Checkpoint with same key should transparently replace the existing artifact.""" dummy = tmp_path / "dummy.ckpt" dummy.write_bytes(b"dummy") fixed_key = f"{dirpath.rstrip('/')}/fixed.ckpt" ln.Artifact(dummy, key=fixed_key).save() old_uid = ln.Artifact.filter(key=fixed_key).one().uid callback = ll.Checkpoint(dirpath=dirpath) monkeypatch.setattr(callback, "resolve_artifact_key", lambda **kwargs: fixed_key) trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=False) trainer.fit(simple_model, dataloader) new_artifact = ln.Artifact.filter(key=fixed_key).one() assert new_artifact.uid != old_uid for af in ln.Artifact.filter(key=fixed_key): af.delete(permanent=True, storage=True) def test_checkpoint_invalid_feature_keys(dirpath: str): """Checkpoint should raise on invalid feature keys.""" with pytest.raises(ValueError, match="Invalid feature keys"): ll.Checkpoint( dirpath=dirpath, features={"invalid_key": {"foo": "bar"}}, # type: ignore ) def test_checkpoint_hparams(dataloader: DataLoader, dirpath: str, lightning_features): """Checkpoint should auto-capture model hparams if features exist.""" class ModelWithHparams(pl.LightningModule): def __init__(self, hidden_size: int = 32, learning_rate: float = 0.001): super().__init__() self.save_hyperparameters() self.layer = nn.Linear(10, hidden_size) self.out = nn.Linear(hidden_size, 1) def forward(self, x): return self.out(torch.relu(self.layer(x))) def training_step(self, batch, batch_idx): x, y = batch loss = nn.functional.mse_loss(self(x), y) self.log("train_loss", loss) return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) ln.Feature(name="hidden_size", dtype=int).save() ln.Feature(name="learning_rate", dtype=float).save() ln.track() model = ModelWithHparams(hidden_size=64, learning_rate=0.01) callback = ll.Checkpoint(dirpath=dirpath, monitor="train_loss") trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=False) trainer.fit(model, dataloader) run = ln.context.run run_features = run.features.get_values() assert run_features["hidden_size"] == 64 assert run_features["learning_rate"] == 0.01 ln.finish() def test_checkpoint_datamodule_hparams( simple_model: pl.LightningModule, dirpath: str, lightning_features ): """Checkpoint should auto-capture datamodule hparams if features exist.""" class DataModuleWithHparams(pl.LightningDataModule): def __init__(self, batch_size: int = 32, num_workers: int = 4): super().__init__() self.save_hyperparameters() def 
train_dataloader(self): return DataLoader( TensorDataset(torch.randn(100, 10), torch.randn(100, 1)), batch_size=self.hparams.batch_size, ) ln.Feature(name="batch_size", dtype=int).save() ln.Feature(name="num_workers", dtype=int).save() ln.track() datamodule = DataModuleWithHparams(batch_size=16, num_workers=2) callback = ll.Checkpoint(dirpath=dirpath, monitor="train_loss") trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=False) trainer.fit(simple_model, datamodule=datamodule) run = ln.context.run run_features = run.features.get_values() assert run_features["batch_size"] == 16 assert run_features["num_workers"] == 2 ln.finish() def test_checkpoint_trainer_config( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, lightning_features: None, ): """Checkpoint should auto-capture trainer config if features exist.""" ln.track() callback = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", save_weights_only=True, mode="min", ) trainer = pl.Trainer( max_epochs=5, max_steps=100, precision="32", accumulate_grad_batches=2, gradient_clip_val=0.5, callbacks=[callback], logger=False, ) trainer.fit(simple_model, dataloader) run_features = ln.context.run.features.get_values() artifacts = ln.Artifact.filter(key__startswith=callback.checkpoint_key_prefix + "/") assert run_features["max_epochs"] == 5 assert run_features["max_steps"] == 100 assert run_features["precision"] == "32-true" assert run_features["accumulate_grad_batches"] == 2 assert run_features["gradient_clip_val"] == 0.5 assert run_features["monitor"] == "train_loss" assert run_features["mode"] == "min" assert "save_weights_only" not in run_features assert len(artifacts) >= 1 for artifact in artifacts: artifact_features = artifact.features.get_values() assert artifact_features["save_weights_only"] is True assert artifact_features["monitor"] == "train_loss" assert artifact_features["mode"] == "min" ln.finish() def test_checkpoint_hparams_yaml_with_hparams( dataloader: DataLoader, dirpath: str, tmp_path: Path, ): """Checkpoint should save hparams.yaml when model has hyperparameters.""" from lightning.pytorch.loggers import CSVLogger class ModelWithHparams(pl.LightningModule): def __init__(self, hidden_size: int = 32): super().__init__() self.save_hyperparameters() self.layer = nn.Linear(10, hidden_size) self.out = nn.Linear(hidden_size, 1) def forward(self, x): return self.out(torch.relu(self.layer(x))) def training_step(self, batch, batch_idx): x, y = batch loss = nn.functional.mse_loss(self(x), y) self.log("train_loss", loss) return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters()) logger = CSVLogger(save_dir=tmp_path, name="test_logs") model = ModelWithHparams(hidden_size=64) callback = ll.Checkpoint(dirpath=dirpath, monitor="train_loss") trainer = pl.Trainer( max_epochs=1, callbacks=[callback], logger=logger, ) trainer.fit(model, dataloader) resolved_dirpath = callback.checkpoint_key_prefix hparams_key = f"{resolved_dirpath}/hparams.yaml" hparams_artifact = ln.Artifact.filter(key=hparams_key).one_or_none() assert hparams_artifact is not None assert hparams_artifact.description == "Lightning run hyperparameters" # cleanup hparams_artifact.delete(permanent=True) shutil.rmtree(tmp_path / "test_logs", ignore_errors=True) @pytest.mark.parametrize( ("use_dirpath", "use_logger"), [ (True, True), (False, True), (True, False), (False, False), ], ids=[ "dirpath-logger", "no-dirpath-logger", "dirpath-no-logger", "no-dirpath-no-logger", ], ) def test_key_layout_matrix( simple_model: 
pl.LightningModule, dataloader: DataLoader, tmp_path: Path, use_dirpath: bool, use_logger: bool, ): """Artifact keys must follow the base-prefix layout across all 4 configurations. With ``run_uid_is_version=True`` and an active Lamin run, the expected key layout is:: {base}/checkpoints/{ckpt_filename} {base}/config.yaml (when SaveConfigCallback is used) {base}/checkpoints/hparams.yaml (when model has hyperparameters) Where ``base`` is determined by: ============================== ================================== Scenario Base prefix ============================== ================================== dirpath set (± logger) ``{dirpath}/{run_uid}`` no dirpath + logger ``{save_dir_name}/{name}/{run_uid}`` no dirpath + no logger ``{run_uid}`` ============================== ================================== """ from lightning.pytorch.loggers import CSVLogger class ParserStub: def save(self, config, path, skip_none, overwrite, multifile): del skip_none, overwrite, multifile Path(path).write_text(json.dumps(config, indent=2)) dirpath = str(tmp_path / "layout_test") ln.track() run_uid = ln.context.run.uid logger: CSVLogger | bool logger_name = "layout_exp" if use_logger: logger = CSVLogger(save_dir=tmp_path, name=logger_name) else: logger = False checkpoint = ll.Checkpoint( dirpath=dirpath if use_dirpath else None, monitor="train_loss", run_uid_is_version=True, ) config = {"trainer": {"max_epochs": 1}} save_config = ll.SaveConfigCallback( parser=cast(Any, ParserStub()), config=config, config_filename="config.yaml", ) trainer = pl.Trainer( max_epochs=1, callbacks=[checkpoint, save_config], logger=logger, default_root_dir=tmp_path, ) trainer.fit(simple_model, dataloader) # Determine expected base prefix if use_dirpath: expected_base = f"{dirpath.rstrip('/')}/{run_uid}" elif use_logger: expected_base = f"{tmp_path.name}/{logger_name}/{run_uid}" else: expected_base = run_uid # Verify base_prefix assert checkpoint.base_prefix == expected_base # Verify checkpoint key prefix expected_ckpt_prefix = f"{expected_base}/checkpoints" assert checkpoint.checkpoint_key_prefix == expected_ckpt_prefix # Verify checkpoint artifacts exist under the correct prefix ckpt_artifacts = ln.Artifact.filter(key__startswith=expected_ckpt_prefix + "/") assert len(ckpt_artifacts) >= 1 for af in ckpt_artifacts: assert af.key.startswith(expected_ckpt_prefix + "/") # Verify config artifact sits directly under the base prefix expected_config_key = f"{expected_base}/config.yaml" config_artifact = ln.Artifact.filter(key=expected_config_key).one_or_none() assert config_artifact is not None, f"Expected config at {expected_config_key}" # Cleanup json_values = ln.models.JsonValue.filter(links_artifact__artifact=config_artifact) ln.models.ArtifactJsonValue.filter(artifact=config_artifact).delete() config_artifact.delete(permanent=True, storage=True) json_values.delete(permanent=True) for af in ckpt_artifacts: af.delete(permanent=True, storage=True) ln.finish() if use_logger: shutil.rmtree(tmp_path / logger_name, ignore_errors=True) def test_run_uid_not_in_key_when_disabled( simple_model: pl.LightningModule, dataloader: DataLoader, tmp_path: Path, ): """With run_uid_is_version=False, the key should use the logger version as before.""" from lightning.pytorch.loggers import CSVLogger ln.track() logger = CSVLogger(save_dir=tmp_path, name="no_uid_test") callback = ll.Checkpoint(monitor="train_loss", run_uid_is_version=False) trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=logger) trainer.fit(simple_model, dataloader) prefix = 
callback.checkpoint_key_prefix assert "version_0" in prefix assert prefix == f"{tmp_path.name}/no_uid_test/version_0/checkpoints" artifacts = ln.Artifact.filter(key__startswith=prefix + "/") assert len(artifacts) >= 1 for af in artifacts: af.delete(permanent=True, storage=True) ln.finish() shutil.rmtree(tmp_path / "no_uid_test", ignore_errors=True) def test_two_runs_same_logger_produce_different_keys( simple_model: pl.LightningModule, dataloader: DataLoader, tmp_path: Path, ): """Two tracked runs with the same logger config should not collide on keys.""" from lightning.pytorch.loggers import CSVLogger prefixes = [] for _ in range(2): ln.track() logger = CSVLogger(save_dir=tmp_path, name="collision_test") callback = ll.Checkpoint(monitor="train_loss", run_uid_is_version=True) trainer = pl.Trainer(max_epochs=1, callbacks=[callback], logger=logger) trainer.fit(simple_model, dataloader) prefixes.append(callback.checkpoint_key_prefix) ln.finish() assert prefixes[0] != prefixes[1], "Two runs should produce different key prefixes" for prefix in prefixes: for af in ln.Artifact.filter(key__startswith=prefix + "/"): af.delete(permanent=True, storage=True) shutil.rmtree(tmp_path / "collision_test", ignore_errors=True) @pytest.mark.parametrize( ("use_dirpath", "logger_name", "key_source"), [ (False, "my_experiment", "logger"), (False, None, "checkpoints"), (True, "should_not_appear", "dirpath"), (True, None, "dirpath"), ], ids=[ "without-dirpath-with-logger", "without-dirpath-without-logger", "with-dirpath-with-logger", "with-dirpath-without-logger", ], ) def test_checkpoint_artifact_key_prefix_matrix( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, tmp_path: Path, use_dirpath: bool, logger_name: str | None, key_source: str, ): """Checkpoint artifact keys should match the dirpath/logger configuration matrix.""" from lightning.pytorch.loggers import CSVLogger logger: CSVLogger | bool if logger_name is None: logger = False else: logger = CSVLogger(save_dir=tmp_path, name=logger_name) callback = ll.Checkpoint( dirpath=dirpath if use_dirpath else None, monitor="train_loss", ) trainer = pl.Trainer( max_epochs=2, callbacks=[callback], logger=logger, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix if key_source == "logger": assert prefix == f"{tmp_path.name}/{logger_name}/version_0/checkpoints" elif key_source == "checkpoints": assert prefix == "checkpoints" else: assert prefix == f"{dirpath.rstrip('/')}/checkpoints" if logger_name is not None: assert logger_name not in prefix artifacts = ln.Artifact.filter(key__startswith=prefix + "/") assert len(artifacts) >= 1 for af in artifacts: assert af.kind == "model" assert af.key.startswith(prefix + "/") if not use_dirpath: for af in artifacts: af.delete(permanent=True, storage=True) if logger_name is not None: shutil.rmtree(tmp_path / logger_name, ignore_errors=True) def test_checkpoint_auto_features_without_dirpath( simple_model: pl.LightningModule, dataloader: DataLoader, tmp_path: Path, lightning_features: None, ): """Auto-features (best model, score, rank) should work without dirpath.""" from lightning.pytorch.loggers import CSVLogger logger = CSVLogger(save_dir=tmp_path, name="auto_feat") callback = ll.Checkpoint( monitor="train_loss", save_top_k=2, mode="min", ) trainer = pl.Trainer( max_epochs=3, callbacks=[callback], logger=logger, ) trainer.fit(simple_model, dataloader) prefix = callback.checkpoint_key_prefix artifacts = ln.Artifact.filter(key__startswith=prefix + "/") assert len(artifacts) >= 1 for 
af in artifacts: values = af.features.get_values() assert "is_best_model" in values assert "score" in values assert "model_rank" in values best_count = sum( 1 for af in artifacts if af.features.get_values().get("is_best_model") is True ) assert best_count == 1 ranks = [af.features.get_values().get("model_rank") for af in artifacts] assert 0 in ranks # cleanup for af in artifacts: af.delete(permanent=True, storage=True) shutil.rmtree(tmp_path / "auto_feat", ignore_errors=True) @pytest.mark.parametrize( ("use_dirpath", "logger_name", "key_source"), [ (False, "cli_logs", "logger"), (False, None, "filename"), (True, "cli_logs", "dirpath"), (True, None, "dirpath"), ], ids=[ "without-dirpath-with-logger", "without-dirpath-without-logger", "with-dirpath-with-logger", "with-dirpath-without-logger", ], ) def test_save_config_artifact_key_matrix( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, tmp_path: Path, use_dirpath: bool, logger_name: str | None, key_source: str, ): """Config artifacts should be stored under the base prefix (dirpath > logger > empty).""" from lightning.pytorch.loggers import CSVLogger class ParserStub: def save( self, config, path, skip_none: bool, overwrite: bool, multifile: bool, ) -> None: del skip_none, overwrite, multifile Path(path).write_text(json.dumps(config, indent=2)) logger: CSVLogger | bool if logger_name is None: logger = False else: logger = CSVLogger(save_dir=tmp_path, name=logger_name) checkpoint = ll.Checkpoint( dirpath=dirpath if use_dirpath else None, monitor="train_loss", ) config = {"trainer": {"max_epochs": 1}, "model": {"hidden_size": 1}} save_config = ll.SaveConfigCallback( parser=cast(Any, ParserStub()), config=config, config_filename="config.yaml", ) trainer = pl.Trainer( max_epochs=1, callbacks=[checkpoint, save_config], logger=logger, default_root_dir=tmp_path, ) trainer.fit(simple_model, dataloader) assert trainer.log_dir is not None local_config_path = Path(trainer.log_dir) / "config.yaml" assert local_config_path.exists() assert "max_epochs" in local_config_path.read_text() if use_dirpath: assert dirpath.rstrip("/") not in str(local_config_path) if key_source == "logger": assert logger_name is not None config_key = f"{tmp_path.name}/{logger_name}/version_0/config.yaml" elif key_source == "dirpath": config_key = f"{dirpath.rstrip('/')}/config.yaml" else: config_key = "config.yaml" config_artifact = ln.Artifact.filter(key=config_key).one_or_none() assert config_artifact is not None assert config_artifact.description == "Lightning CLI config" checkpoint_artifacts = ln.Artifact.filter( key__startswith=checkpoint.checkpoint_key_prefix + "/" ) assert len(checkpoint_artifacts) >= 1 json_values = ln.models.JsonValue.filter(links_artifact__artifact=config_artifact) ln.models.ArtifactJsonValue.filter(artifact=config_artifact).delete() config_artifact.delete(permanent=True, storage=True) json_values.delete(permanent=True) for artifact in checkpoint_artifacts: artifact.delete(permanent=True, storage=True) shutil.rmtree(tmp_path / "cli_logs", ignore_errors=True) def test_save_config_artifact_tracked_as_run_input( simple_model: pl.LightningModule, dataloader: DataLoader, dirpath: str, tmp_path: Path, ): """Config artifacts should be tracked as run inputs while checkpoints stay outputs.""" class ParserStub: def save( self, config, path, skip_none: bool, overwrite: bool, multifile: bool, ) -> None: del skip_none, overwrite, multifile Path(path).write_text(json.dumps(config, indent=2)) ln.track() checkpoint = 
ll.Checkpoint(dirpath=dirpath, monitor="train_loss") save_config = ll.SaveConfigCallback( parser=cast(Any, ParserStub()), config={"trainer": {"max_epochs": 1}}, config_filename="config.yaml", ) trainer = pl.Trainer( max_epochs=1, callbacks=[checkpoint, save_config], logger=False, default_root_dir=tmp_path, ) trainer.fit(simple_model, dataloader) run = ln.context.run assert run is not None assert checkpoint.last_config_artifact is not None assert checkpoint.last_checkpoint_artifact is not None config_artifact = checkpoint.last_config_artifact checkpoint_artifact = checkpoint.last_checkpoint_artifact assert config_artifact.run is None assert run in config_artifact.input_of_runs.all() assert checkpoint_artifact.run == run assert checkpoint_artifact.input_of_runs.count() == 0 config_artifact.delete(permanent=True, storage=True) checkpoint_artifact.delete(permanent=True, storage=True) ln.finish() def test_checkpoint_subclass_receives_artifact_events( dataloader: DataLoader, dirpath: str, tmp_path: Path, ): """Subclass hooks should receive checkpoint, config, and hparams artifacts.""" from lightning.pytorch.loggers import CSVLogger class ModelWithHparams(pl.LightningModule): def __init__(self, hidden_size: int = 32): super().__init__() self.save_hyperparameters() self.layer = nn.Linear(10, hidden_size) self.out = nn.Linear(hidden_size, 1) def forward(self, x): return self.out(torch.relu(self.layer(x))) def training_step(self, batch, batch_idx): x, y = batch loss = nn.functional.mse_loss(self(x), y) self.log("train_loss", loss) return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters()) class ParserStub: def save( self, config, path, skip_none: bool, overwrite: bool, multifile: bool, ) -> None: del skip_none, overwrite, multifile Path(path).write_text(json.dumps(config, indent=2)) class RecordingCheckpoint(ll.Checkpoint): def __init__(self, **kwargs): super().__init__(**kwargs) self.saved_events: list[ll.ArtifactSavedEvent] = [] def on_artifact_saved(self, event: ll.ArtifactSavedEvent) -> None: self.saved_events.append(event) logger = CSVLogger(save_dir=tmp_path, name="recording_logs") checkpoint = RecordingCheckpoint(dirpath=dirpath, monitor="train_loss") save_config = ll.SaveConfigCallback( parser=cast(Any, ParserStub()), config={"trainer": {"max_epochs": 1}}, config_filename="config.yaml", ) trainer = pl.Trainer( max_epochs=1, callbacks=[checkpoint, save_config], logger=logger, default_root_dir=tmp_path, ) trainer.fit(ModelWithHparams(), dataloader) assert {event.kind for event in checkpoint.saved_events} >= { "checkpoint", "config", "hparams", } assert checkpoint.last_checkpoint_artifact is not None assert checkpoint.last_config_artifact is not None assert checkpoint.last_hparams_artifact is not None assert checkpoint.last_checkpoint_artifact.key.startswith( checkpoint.checkpoint_key_prefix + "/" ) assert checkpoint.last_config_artifact.key.endswith("/config.yaml") assert checkpoint.last_hparams_artifact.key == ( f"{checkpoint.checkpoint_key_prefix}/hparams.yaml" ) checkpoint_event = next( event for event in checkpoint.saved_events if event.kind == "checkpoint" ) assert checkpoint_event.key.startswith(checkpoint.checkpoint_key_prefix + "/") assert checkpoint_event.storage_uri == checkpoint.resolve_artifact_storage_uri( checkpoint_event.artifact ) assert checkpoint_event.storage_uri.endswith(".ckpt") artifacts_by_key = {event.key: event.artifact for event in checkpoint.saved_events} for artifact in artifacts_by_key.values(): 
ln.models.ArtifactJsonValue.filter(artifact=artifact).delete() ln.models.JsonValue.filter(links_artifact__artifact=artifact).delete( permanent=True ) artifact.delete(permanent=True, storage=True) shutil.rmtree(tmp_path / "recording_logs", ignore_errors=True) def test_checkpoint_artifact_observers_receive_shared_events( dataloader: DataLoader, dirpath: str, tmp_path: Path, ): """Observers should see the same checkpoint/config/hparams events as subclasses.""" from lightning.pytorch.loggers import CSVLogger class ModelWithHparams(pl.LightningModule): def __init__(self, hidden_size: int = 32): super().__init__() self.save_hyperparameters() self.layer = nn.Linear(10, hidden_size) self.out = nn.Linear(hidden_size, 1) def forward(self, x): return self.out(torch.relu(self.layer(x))) def training_step(self, batch, batch_idx): x, y = batch loss = nn.functional.mse_loss(self(x), y) self.log("train_loss", loss) return loss def configure_optimizers(self): return torch.optim.Adam(self.parameters()) class ParserStub: def save( self, config, path, skip_none: bool, overwrite: bool, multifile: bool, ) -> None: del skip_none, overwrite, multifile Path(path).write_text(json.dumps(config, indent=2)) class RecordingObserver: def __init__(self): self.saved_events: list[ll.ArtifactSavedEvent] = [] def on_artifact_saved(self, event: ll.ArtifactSavedEvent) -> None: self.saved_events.append(event) def on_artifact_removed(self, event: ll.ArtifactRemovedEvent) -> None: del event observer = RecordingObserver() logger = CSVLogger(save_dir=tmp_path, name="observer_logs") checkpoint = ll.Checkpoint( dirpath=dirpath, monitor="train_loss", artifact_observers=[observer], ) save_config = ll.SaveConfigCallback( parser=cast(Any, ParserStub()), config={"trainer": {"max_epochs": 1}}, config_filename="config.yaml", ) trainer = pl.Trainer( max_epochs=1, callbacks=[checkpoint, save_config], logger=logger, default_root_dir=tmp_path, ) trainer.fit(ModelWithHparams(), dataloader) assert {event.kind for event in observer.saved_events} >= { "checkpoint", "config", "hparams", } checkpoint_event = next( event for event in observer.saved_events if event.kind == "checkpoint" ) assert checkpoint_event.key.startswith(checkpoint.checkpoint_key_prefix + "/") assert checkpoint_event.local_path.name.endswith(".ckpt") assert checkpoint_event.storage_uri == checkpoint.resolve_artifact_storage_uri( checkpoint_event.artifact ) assert checkpoint.last_artifact_event is not None assert checkpoint.get_last_artifact("config") == checkpoint.last_config_artifact artifacts_by_key = {event.key: event.artifact for event in observer.saved_events} for artifact in artifacts_by_key.values(): ln.models.ArtifactJsonValue.filter(artifact=artifact).delete() ln.models.JsonValue.filter(links_artifact__artifact=artifact).delete( permanent=True ) artifact.delete(permanent=True, storage=True) shutil.rmtree(tmp_path / "observer_logs", ignore_errors=True) ================================================ FILE: tests/no_instance/conftest.py ================================================ import pytest @pytest.fixture def ccaplog(caplog) -> pytest.LogCaptureFixture: """Add caplog handler to our custom logger at session start.""" from lamin_utils._logger import logger logger.addHandler(caplog.handler) yield caplog logger.removeHandler(caplog.handler) ================================================ FILE: tests/no_instance/test_connect_dynamic_import.py ================================================ def test_connect_dynamic_import(ccaplog): import lamindb as ln # this only 
currently works if not instance was configured in the environment # in all other cases, we still trigger a reset_django() and hence django variables # become stale in case of a dynamic import assert ln.setup.settings.instance.slug == "none/none" ln.connect("laminlabs/lamin-site-assets") assert "connected in read-only mode" in ccaplog.text assert ln.Artifact.filter(key__startswith="blog").count() > 0 ln.setup.disconnect() ================================================ FILE: tests/no_instance/test_import_side_effects.py ================================================ import importlib.util import json import os import subprocess import sys from pathlib import Path import pytest REPO_ROOT = Path(__file__).resolve().parents[2] MODULE_NAMES = ("anndata", "h5py", "pyarrow") LIGHT_IMPORTS = {name: False for name in MODULE_NAMES} PROBE_CASES = [ ( "storage package constants stay light", "import lamindb.core.storage as storage\n_ = storage.VALID_SUFFIXES\n_ = storage.delete_storage\n_ = storage.infer_filesystem", LIGHT_IMPORTS, (), ), ( "storage object helpers stay light", "import lamindb.core.storage as storage\n_ = storage.infer_suffix\n_ = storage.write_to_disk", LIGHT_IMPORTS, (), ), ( "loaders basic helpers stay light", "import lamindb.core.loaders as loaders\n_ = loaders.load_json\n_ = loaders.load_txt\n_ = loaders.load_html", LIGHT_IMPORTS, (), ), ( "loaders tabular helpers stay light", "import lamindb.core.loaders as loaders\n_ = loaders.load_csv\n_ = loaders.load_parquet\n_ = loaders.load_tsv", LIGHT_IMPORTS, (), ), ( "loaders optional-format helpers stay light", "import lamindb.core.loaders as loaders\n_ = loaders.load_h5ad\n_ = loaders.load_h5mu\n_ = loaders.load_zarr", LIGHT_IMPORTS, (), ), ( "backed_access symbols stay light", "from lamindb.core.storage._backed_access import BackedAccessor, backed_access, _open_dataframe\n_ = BackedAccessor\n_ = backed_access\n_ = _open_dataframe", LIGHT_IMPORTS, (), ), ( "objects module import stays light", "from lamindb.core.storage.objects import infer_suffix, write_to_disk\n_ = infer_suffix\n_ = write_to_disk", LIGHT_IMPORTS, (), ), ( "backed_access pyarrow dataframe path stays anndata-free", "from upath import UPath\nimport pyarrow as pa\nimport pyarrow.parquet as pq\nfrom lamindb.core.storage._backed_access import backed_access\npath = UPath('test_import_side_effects.parquet')\npq.write_table(pa.table({'col': [1]}), path.as_posix())\ntry:\n _ = backed_access(path, engine='pyarrow')\nfinally:\n if path.exists():\n path.unlink()", {"anndata": False, "h5py": False, "pyarrow": True}, ("pyarrow",), ), ( "backed_access polars dataframe path stays light", "from upath import UPath\nfrom lamindb.core.storage._backed_access import backed_access\npath = UPath('test_import_side_effects.csv')\nwith path.open('w') as f:\n _ = f.write('col\\n1\\n')\ntry:\n _ = backed_access(path, engine='polars')\nfinally:\n if path.exists():\n path.unlink()", LIGHT_IMPORTS, ("polars",), ), ] def _probe_modules_loaded(code: str) -> dict[str, bool]: env = os.environ.copy() pythonpath = env.get("PYTHONPATH") env["PYTHONPATH"] = ( str(REPO_ROOT) if not pythonpath else os.pathsep.join([str(REPO_ROOT), pythonpath]) ) probe_lines = [ "import json", "import sys", "", f"module_names = {MODULE_NAMES!r}", "result = {name: (name in sys.modules) for name in module_names}", code, 'result.update({f"{name}_after": (name in sys.modules) for name in module_names})', "print(json.dumps(result))", ] probe = "\n".join(probe_lines) completed = subprocess.run( [sys.executable, "-c", probe], 
check=True, capture_output=True, cwd=REPO_ROOT, env=env, text=True, ) stdout_lines = [line for line in completed.stdout.splitlines() if line.strip()] return json.loads(stdout_lines[-1]) def _assert_modules( result: dict[str, bool], expected_after: dict[str, bool], label: str ): for module_name in MODULE_NAMES: assert result[module_name] is False, ( f"{label}: {module_name} loaded before probe" ) assert result[f"{module_name}_after"] is expected_after[module_name], ( f"{label}: unexpected {module_name} import state" ) @pytest.mark.parametrize( ("label", "code", "expected_after", "required_modules"), PROBE_CASES, ) def test_storage_import_side_effects( label: str, code: str, expected_after: dict[str, bool], required_modules: tuple[str, ...], ): missing_modules = [ module_name for module_name in required_modules if importlib.util.find_spec(module_name) is None ] if missing_modules: pytest.skip(f"missing optional dependency: {', '.join(missing_modules)}") result = _probe_modules_loaded(code) _assert_modules(result, expected_after, label) ================================================ FILE: tests/no_instance/test_no_default_instance.py ================================================ import lamindb as ln import pandas as pd import pytest from lamindb_setup.errors import CurrentInstanceNotConfigured def test_no_read_only_warning(ccaplog): ln.Artifact.connect("laminlabs/lamindata") ln.DB("laminlabs/lamindata") assert "connected in read-only mode" not in ccaplog.text def test_instance_not_connected(): assert ln.setup.settings.instance.slug == "none/none" with pytest.raises(CurrentInstanceNotConfigured): ln.Artifact.filter().count() def test_query_artifacts_lamindata(): artifacts = ln.Artifact.connect("laminlabs/lamindata") n_artifacts = artifacts.count() assert n_artifacts > 0 assert n_artifacts > artifacts.filter().count() def test_get_artifact_lamindata(): artifact = ln.Artifact.connect("laminlabs/lamindata").get( key="example_datasets/small_dataset1.parquet" ) assert isinstance(artifact.load(), pd.DataFrame) ================================================ FILE: tests/permissions/conftest.py ================================================ import shutil from subprocess import DEVNULL, run from time import perf_counter import lamindb_setup as ln_setup import pytest from lamin_utils import logger def pytest_sessionstart(): t_execute_start = perf_counter() # these are called in separate scripts because can't change connection # within the same python process due to django # init instance and setup RLS run( # noqa: S602 "python ./tests/permissions/scripts/setup_instance.py", shell=True, capture_output=False, ) # populate permissions and models via the admin connection run( # noqa: S602 "python ./tests/permissions/scripts/setup_access.py", shell=True, capture_output=False, ) total_time_elapsed = perf_counter() - t_execute_start print(f"time to setup the instance: {total_time_elapsed:.1f}s") def pytest_sessionfinish(session: pytest.Session): logger.set_verbosity(1) shutil.rmtree("./default_storage_permissions") ln_setup.delete("lamindb-test-permissions", force=True) run("docker stop pgtest && docker rm pgtest", shell=True, stdout=DEVNULL) # noqa: S602 ================================================ FILE: tests/permissions/jwt_utils.py ================================================ import json import psycopg2 def sign_jwt(db_url, payload: dict) -> str: with psycopg2.connect(db_url) as conn, conn.cursor() as cur: cur.execute( """ SELECT sign( %s::json, (SELECT security.get_secret('jwt_secret')), 
%s ) """, (json.dumps(payload), "HS256"), ) token = cur.fetchone()[0] if not token: msg = "Failed to generate JWT" raise ValueError(msg) return token ================================================ FILE: tests/permissions/scripts/check_lamin_dev.py ================================================ import subprocess from unittest.mock import patch import lamindb as ln import pytest from lamindb_setup.core._hub_core import select_space, select_storage def cleanup(records): for record in records: try: if isinstance(record, ln.Storage): record.artifacts.all().delete(permanent=True) record.delete(permanent=True) except Exception as e: print(f"Failed deleting {record}: {e}") assert ln.setup.settings.user.handle == "testuser1" ln.connect("laminlabs/lamin-dev") assert ln.setup.settings.instance.slug == "laminlabs/lamin-dev" # check that the rename resolves correctly (it was renamed) assert ln.Artifact.connect("laminlabs/lamin-dev1072025").db == "default" space_name = "Our test space for CI" space = ln.Space.get(name=space_name) # check that we throw an error if no storage location is managed by the space storage_loc = ln.Storage.filter(space=space).one_or_none() if storage_loc is not None: ln.Run.filter(report__storage=storage_loc).delete(permanent=True) storage_loc.artifacts.all().delete(permanent=True) storage_loc.delete(permanent=True) with pytest.raises(ln.errors.NoStorageLocationForSpace) as error: ln.track(space=space_name) # this fails to save the env artifact ln.context._transform = None ln.context._run = None # now create the storage location in the space storage_loc = ln.Storage("create-s3", space=space).save() ln.track(space=space_name) try: assert ln.context.space.name == space_name ulabel = ln.ULabel(name="My test ulabel in test space").save() # cleanup if the artifact already exists artifact = ln.Artifact(".gitignore", key="mytest") if ( artifact_cleanup := ln.Artifact.filter(hash=artifact.hash).one_or_none() ) is not None: artifact_cleanup.delete(permanent=True) # cleanup if the directory artifact already exists artifact_dir = ln.Artifact("./scripts", key="mytest-dir") if ( artifact_cleanup := ln.Artifact.filter(hash=artifact_dir.hash).one_or_none() ) is not None: artifact_cleanup.delete(permanent=True) artifact = ln.Artifact(".gitignore", key="mytest").save() artifact_dir = ln.Artifact("./scripts", key="mytest-dir").save() # check that exist ln.ULabel.get(name="My test ulabel in test space") ln.Artifact.get(key="mytest") ln.Artifact.get(key="mytest-dir") assert ulabel.space == space # ulabel should end up in the restricted space assert artifact.space == space # the below check doesn't work: another worker might have associated another storage location with the space, and then the artifact ends up in that # assert artifact.storage == storage_loc # hence this check assert artifact.storage in ln.Storage.filter(space=space) assert ln.context.transform.space == space assert ln.context.run.space == space # move the artifact to another storage location space_test_move = ln.Space.get(name="test-move") original_path = artifact.path artifact.space = space_test_move # cancel save with patch("builtins.input", return_value="x"): artifact.save() # save to the new storage location with patch("builtins.input", return_value="1"): artifact.save() assert artifact.space == space_test_move assert artifact.storage in ln.Storage.filter(space=space_test_move) assert not original_path.exists() assert artifact.path.as_posix().startswith(artifact.storage.root) assert artifact.path.exists() # move the 
directory artifact to another storage location assert artifact_dir.space == space assert artifact_dir.path.is_dir() assert artifact_dir.storage in ln.Storage.filter(space=space) original_path_dir = artifact_dir.path artifact_dir.space = space_test_move # save to the new storage location with patch("builtins.input", return_value="0"): artifact_dir.save() assert artifact_dir.space == space_test_move assert artifact_dir.storage in ln.Storage.filter(space=space_test_move) original_path_dir.fs.invalidate_cache() assert not original_path_dir.exists() assert artifact_dir.path.as_posix().startswith(artifact_dir.storage.root) assert artifact_dir.path.is_dir() # update the space of the storage location space2 = ln.Space.get(name="Our test space for CI 2") storage_loc.space = space2 storage_loc.save() response_storage = select_storage(lnid=storage_loc.uid) response_space = select_space(lnid=space2.uid) assert response_storage["space_id"] == response_space["id"] # connect to the instance before saving subprocess.run( # noqa: S602 "lamin connect laminlabs/lamin-dev", shell=True, check=True, ) result = subprocess.run( # noqa: S602 "lamin save .gitignore --key mytest --space 'Our test space for CI 2'", shell=True, capture_output=True, ) assert "key='mytest'" in result.stdout.decode() assert "storage path:" in result.stdout.decode() assert result.returncode == 0 finally: try: storage_loc.run = None storage_loc.save() except: # noqa pass cleanup( ( ulabel, artifact, artifact_dir, ln.context.transform.latest_run, ln.context.transform, storage_loc, ) ) ================================================ FILE: tests/permissions/scripts/setup_access.py ================================================ import lamindb as ln # noqa import hubmodule import hubmodule.models as hm from uuid import uuid4 from hubmodule.dev.migrate.deploy import _apply_migrations_with_tracking from hubmodule.dev.setup.install import ( _setup_extensions, _setup_secret, _setup_utils_db_modules, ) from hubmodule.sql_generators._rls import RLSGenerator from hubmodule.sql_generators._dbwrite import install_dbwrite from laminhub_instancedb.postgres import DbRoleHandler from pathlib import Path # create a db connection url that works with RLS instance_id = ln.setup.settings.instance._id def create_jwt_user(dsn_admin: str, jwt_role_name: str): db_role_handler = DbRoleHandler(dsn_admin) jwt_db_url = db_role_handler.create( jwt_role_name, expires_in=None, alter_if_exists=True ) db_role_handler.permission.grant_write_jwt(jwt_role_name) return jwt_db_url pgurl = "postgresql://postgres:pwd@0.0.0.0:5432/pgtest" # admin db connection url jwt_role_name = f"{instance_id.hex}_jwt" jwt_db_url = create_jwt_user(pgurl, jwt_role_name=jwt_role_name) _setup_extensions(pgurl) _setup_secret(pgurl) _setup_utils_db_modules(pgurl) migrations_sql_dir = Path(hubmodule.__file__).parent / "sql/0004_migrations" _apply_migrations_with_tracking(pgurl, migrations_sql_dir) rls_generator = RLSGenerator(pgurl, jwt_role_name=jwt_role_name, public_role_name=None) for i, table in enumerate(rls_generator._list_tables()): print(i, table.table_name, table.foreign_keys, table.has_space_id) rls_generator.setup() print("Created jwt db connection") install_dbwrite(pgurl) print("Installed dbwrite") # create models full_access = ln.Space(name="full access").save() # type: ignore select_access = ln.Space(name="select access").save() # type: ignore no_access = ln.Space(name="no access").save() # type: ignore # set read role for the default space usettings = ln.setup.settings.user account = 
hm.Account(id=usettings._uuid.hex, uid=usettings.uid, role="read").save() # create a test user object ln.User(uid="testuid1", handle="testuser", name="Test User").save() # no access space ulabel = ln.ULabel(name="no_access_ulabel") ulabel.space = no_access ulabel.save() # set up access to this individual record with a dummy role, # will work only after the role is changed to read, write or admin hm.AccessRecord( account=account, record_type="lamindb_ulabel", record_id=ulabel.id, role="dummy" ).save() project = ln.Project(name="No_access_project") # type: ignore project.space = no_access project.save() hm.AccessRecord( account=account, record_type="lamindb_project", record_id=project.id, role="dummy" ).save() # setup write access space hm.AccessSpace(account=account, space=full_access, role="write").save() ulabel = ln.ULabel(name="full_access_ulabel") ulabel.space = full_access ulabel.save() # setup read access space hm.AccessSpace(account=account, space=select_access, role="read").save() ulabel = ln.ULabel(name="select_ulabel") ulabel.space = select_access ulabel.save() # artificial but better to test # create a link table referencing rows in different spaces ulabel.projects.add(project) # default space, only select access by default ulabel = ln.ULabel(name="default_space_ulabel").save() ulabel.projects.add(project) project = ln.Project(name="default_space_project").save() ulabel.projects.add(project) # create a link table referencing ulabel from the default space and project from select space project = ln.Project(name="select_project") project.space = select_access project.save() ulabel.projects.add(project) # setup team and relevent models team_access = ln.Space(name="team access").save() # type: ignore team = hm.Team(id=uuid4().hex, uid="teamuiduid11", name="test_team", role="read").save() hm.AccountTeam(account=account, team=team).save() hm.AccessSpace(team=team, space=team_access, role="read").save() feature = ln.Feature(name="team_access_feature", dtype=float) feature.space = team_access feature.save() # artifact for testing tracking error and artifactblock artifact = ln.Artifact("README.md", description="test tracking error") artifact.space = select_access artifact.save() # artifact for testing tracking error and locking artifact = ln.Artifact(".gitignore", description="test locking") artifact.space = full_access artifact.is_locked = True artifact.save() # create a single record in the default space record = ln.Record(name="test-record", is_type=False).save() assert record.space_id == 1 print("Created models") # save jwt db connection ln.setup.settings.instance._db = jwt_db_url ln.setup.settings.instance._persist() ================================================ FILE: tests/permissions/scripts/setup_instance.py ================================================ import lamindb_setup as ln_setup from laminci.db import setup_local_test_postgres pgurl = setup_local_test_postgres() ln_setup.init( storage="./default_storage_permissions", name="lamindb-test-permissions", db=pgurl, ) # can't add this app in the init because don't want t trigger the initial migration # that conflicts with _install_db_module ln_setup.settings.instance._schema_str = "hubmodule" ln_setup.settings.instance._persist() ================================================ FILE: tests/permissions/test_rls_dbwritelog.py ================================================ import subprocess import time from pathlib import Path from uuid import uuid4 import hubmodule.models as hm import lamindb as ln import psycopg2 import 
pytest from django.db import connection, transaction from django.db.utils import IntegrityError, InternalError, ProgrammingError from hubmodule.sql_generators._dbwrite import uninstall_dbwrite from jwt_utils import sign_jwt from lamindb.models.artifact import track_run_input from lamindb_setup.core.django import DBToken, db_token_manager from psycopg2.extensions import adapt pgurl = "postgresql://postgres:pwd@0.0.0.0:5432/pgtest" # admin db connection url user_uuid = ln.setup.settings.user._uuid.hex expiration = time.time() + 2000 # full collaborator token token = sign_jwt( pgurl, {"account_id": user_uuid, "exp": expiration, "type": "collaborator"} ) # read-only token token_read = sign_jwt( pgurl, {"account_id": user_uuid, "exp": expiration, "type": "read-only"} ) # init an instance of DBToken manually db_token = DBToken({}) db_token._token = token db_token._token_query = f"SELECT set_token({adapt(token).getquoted().decode()}, true);" db_token._expiration = expiration db_token_manager.set(db_token) def test_token_expiration(): # init connection.connection with connection.cursor() as cur: pass expired_token = sign_jwt( pgurl, {"account_id": user_uuid, "exp": time.time() - 1000, "type": "collaborator"}, ) # check that an expired token is invalid with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT set_token(%s);", (expired_token,)) def test_authentication(): # just check that the token was setup with connection.cursor() as cur: cur.execute( "SELECT 1 in (SELECT id FROM public.check_access() WHERE role = 'read');" ) result = cur.fetchall()[0][0] assert result # check querying without setting jwt with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_ulabel;") # test that auth can't be hijacked # false table created before with ( pytest.raises(psycopg2.errors.DuplicateTable), connection.connection.cursor() as cur, ): cur.execute( """ CREATE TEMP TABLE access( id int, role varchar(20), type text ) ON COMMIT DROP; SELECT set_token(%s); """, (token,), ) # check that jwt user can't set arbitrary account_id manually with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute( """ CREATE TEMP TABLE access( id int, role varchar(20), type text ) ON COMMIT DROP; INSERT INTO access (id, role, type) VALUES (1, 'admin', 'space'); SELECT * FROM check_access(); """ ) # check manual insert with ( pytest.raises(psycopg2.errors.InsufficientPrivilege), connection.connection.cursor() as cur, ): cur.execute( """ SELECT set_token(%s); INSERT INTO access (id, role, type) VALUES (1, 'admin', 'space'); """, (token,), ) # test access to the security schema with ( pytest.raises(psycopg2.errors.InsufficientPrivilege), connection.connection.cursor() as cur, ): cur.execute("SELECT security.get_secret('jwt_secret');") # test read-only token with connection.connection.cursor() as cur: cur.execute("SELECT set_token(%s); SELECT * FROM check_access()", (token_read,)) result = cur.fetchall() assert len(result) == 1 assert result[0] == (1, "read", "space") assert ln.base.users._user_has_write_access() def test_select_without_db_token(): # with db token can be read in the default space with connection.cursor() as cur: cur.execute("SELECT * FROM lamindb_record;") results = cur.fetchall() assert len(results) == 1 # the same assert ln.Record.filter().count() == 1 # errors if can't select ln.Record.get(1) # no db token, everything in the default 
space with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_record;") with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_record WHERE id = 1;") # no db token, in different spaces with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_artifact;") with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_ulabel;") # no db token, utility tables with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_user;") with ( pytest.raises(psycopg2.errors.RaiseException), connection.connection.cursor() as cur, ): cur.execute("SELECT * FROM lamindb_space;") def test_fine_grained_permissions_account_and_dbwrite(): # check select assert ln.ULabel.filter().count() == 3 assert ln.Project.filter().count() == 2 ulabel = ln.ULabel.get(name="default_space_ulabel") assert ulabel.projects.all().count() == 2 # check delete # should delete ulabel_del = ln.ULabel.get(name="full_access_ulabel") ulabel_del_id = ulabel_del.id ulabel_del.delete(permanent=True) assert ln.ULabel.filter().count() == 2 # check the logs for delete log_rec = ( hm.DbWrite.filter(sqlrecord_id=ulabel_del_id, table_name="lamindb_ulabel") .order_by("-id") .first() ) assert log_rec.event_type == "DELETE" assert log_rec.data is not None assert log_rec.created_by_id == 1 # check the logs for insert log_rec = ( hm.DbWrite.filter(sqlrecord_id=ulabel_del_id, table_name="lamindb_ulabel") .order_by("id") .first() ) assert log_rec.event_type == "INSERT" assert log_rec.data is None assert log_rec.created_by_id is None # this was inserted without setting a db token # should not delete, does not error for some reason ln.ULabel.get(name="select_ulabel").delete(permanent=True) assert ln.ULabel.filter().count() == 2 # default space ulabel.delete(permanent=True) assert ln.ULabel.filter().count() == 2 # check insert # should succeed space = ln.Space.get(name="full access") ulabel = ln.ULabel(name="new label") ulabel.space = space ulabel.save() # should fail with pytest.raises(ln.errors.NoWriteAccess): ln.ULabel(name="new label fail").save() for space_name in ["select access", "no access"]: space = ln.Space.get(name=space_name) ulabel = ln.ULabel(name="new label fail") ulabel.space = space with pytest.raises(ln.errors.NoWriteAccess): ulabel.save() # check update # should succeed ulabel = ln.ULabel.get(name="new label") ulabel.name = "new label update" ulabel.save() ulabel = ln.ULabel.get(name="new label update") # check that it is saved # check the logs for update log_rec = ( hm.DbWrite.filter(sqlrecord_id=ulabel.id, table_name="lamindb_ulabel") .order_by("-id") .first() ) assert log_rec.event_type == "UPDATE" assert log_rec.data["name"] == "new label" # changed assert "id" not in log_rec.data # didn't change assert log_rec.created_by_id == 1 # should fail ulabel = ln.ULabel.get(name="select_ulabel") ulabel.name = "select_ulabel update" with pytest.raises(ln.errors.NoWriteAccess): ulabel.save() # default space ulabel = ln.ULabel.get(name="default_space_ulabel") ulabel.name = "default_space_ulabel update" with pytest.raises(ln.errors.NoWriteAccess): ulabel.save() # check link tables # check insert project = ln.Project(name="Myproject") project.space = ln.Space.get(name="full access") 
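# --- illustrative sketch (added for documentation, not part of the original test suite) ---
# Writes are routed through a record's `.space`: saving into a space the current JWT
# account cannot write to raises ln.errors.NoWriteAccess, as the assertions in this test
# exercise. A minimal helper capturing that pattern, assuming the "no access" space
# created in scripts/setup_access.py; it is defined only as a sketch and never called.
def _sketch_expect_no_write_access(space_name: str = "no access") -> None:
    """Try to save a throw-away ULabel into `space_name` and expect NoWriteAccess."""
    sketch_label = ln.ULabel(name="sketch label")  # hypothetical record name
    sketch_label.space = ln.Space.get(name=space_name)
    with pytest.raises(ln.errors.NoWriteAccess):
        sketch_label.save()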
project.save() ulabel = ln.ULabel.get(name="new label update") ulabel.projects.add(project) assert ulabel.projects.all().count() == 1 # check select of a link table referencing unavailable rows assert ln.ULabel.get(name="select_ulabel").projects.all().count() == 0 # test SpaceBlock space = ln.Space.get(name="select access") with pytest.raises(ln.errors.NoWriteAccess): ln.models.SpaceBlock(space=space, content="test", kind="readme").save() # test ArtifactBlock, artifact is read-only artifact = ln.Artifact.get(description="test tracking error") with pytest.raises(ProgrammingError): ln.models.ArtifactBlock(artifact=artifact, content="test", kind="readme").save() # test BranchBlock, the account is read-only branch = ln.Branch.get(1) # main branch in all space with pytest.raises(ProgrammingError): ln.models.BranchBlock(branch=branch, content="test", kind="readme").save() def test_fine_grained_permissions_team(): assert ln.Feature.filter().count() == 1 ln.Feature.get(name="team_access_feature") def test_fine_grained_permissions_single_records(): assert not ln.ULabel.filter(name="no_access_ulabel").exists() assert not ln.Project.filter(name="No_access_project").exists() # check that the logs are not available for the ulabel with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute("SELECT id FROM lamindb_ulabel WHERE name = 'no_access_ulabel'") ulabel_id = cur.fetchone()[0] assert not hm.DbWrite.filter( sqlrecord_id=ulabel_id, table_name="lamindb_ulabel" ).exists() # switch access to this ulabel to read with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( """ UPDATE hubmodule_accessrecord SET role = 'read' WHERE account_id = %s AND record_type = 'lamindb_ulabel' """, (user_uuid,), ) ulabel = ln.ULabel.get(name="no_access_ulabel") # check that the logs are available now assert hm.DbWrite.filter( sqlrecord_id=ulabel.id, table_name="lamindb_ulabel" ).exists() new_name = "new_name_single_rls_access_ulabel" ulabel.name = new_name with pytest.raises(ln.errors.NoWriteAccess): ulabel.save() # switch access for the project to read with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( """ UPDATE hubmodule_accessrecord SET role = 'read' WHERE account_id = %s AND record_type = 'lamindb_project' """, (user_uuid,), ) # now the project is readable project = ln.Project.get(name="No_access_project") # can't insert into lamindb_ulabelproject because the ulabel is read-only with pytest.raises(ProgrammingError): ulabel.projects.add(project) # switch access for the ulabel to write with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( """ UPDATE hubmodule_accessrecord SET role = 'write' WHERE account_id = %s AND record_type = 'lamindb_ulabel' """, (user_uuid,), ) ulabel.save() # can insert into lamindb_ulabelproject because the ulabel is now write-able # and the project is read-only, but this doesn't matter as the principal key is ulabel ulabel.projects.add(project) assert ulabel.projects.count() == 1 ulabel.delete(permanent=True) assert not ln.ULabel.filter(name="no_access_ulabel").exists() # tests that token is set properly in atomic blocks def test_atomic(): with transaction.atomic(): assert ln.Feature.filter().count() == 1 # test with nested with transaction.atomic(): assert ln.Feature.filter().count() == 1 feature = ln.Feature(name="atomic_feature", dtype=float) feature.space = ln.Space.get(name="full access") feature.save() assert ln.Feature.filter().count() == 2 def test_utility_tables(): # can select in these tables assert 
ln.Space.filter().count() == 5 # can't select assert hm.Account.filter().count() == 0 assert hm.Team.filter().count() == 0 assert hm.AccountTeam.filter().count() == 0 assert hm.AccessSpace.filter().count() == 0 assert hm.AccessRecord.filter().count() == 0 # can't update a space space = ln.Space.get(id=1) # default space space.name = "new name" with pytest.raises(ProgrammingError): space.save() with pytest.raises(ProgrammingError): ln.Space(name="new space").save() # can't insert with pytest.raises(ProgrammingError): hm.Account(id=uuid4().hex, uid="accntid2", role="admin").save() def test_user_rls(): assert ln.User.filter().count() == 2 # should fail because can modify only the current user user = ln.User.get(handle="testuser") user.name = "New Name" with pytest.raises(ProgrammingError): user.save() # can't insert a user with a different uid with pytest.raises(ProgrammingError): ln.User(handle="insert_new_user", uid="someuidd").save() # also triggers RLS with pytest.raises(ProgrammingError): ln.User(handle="insert_new_user", uid=user.uid).save() # try to insert a user with the same uid # should not trigger RLS because the uid is the same, it should throw an IntegrityError with pytest.raises(IntegrityError): ln.User(handle="insert_new_user", uid=ln.setup.settings.user.uid).save() # can modify the current user user = ln.User.get(1) user.name = "New Name" user.save() def test_write_role(): # switch user role to write with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( "UPDATE hubmodule_account SET role = 'write' WHERE id = %s", (user_uuid,) ) ln.ULabel(name="new label account default space").save() # switch user role back to read and team role to write with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( "UPDATE hubmodule_account SET role = 'read' WHERE id = %s", (user_uuid,) ) cur.execute( "UPDATE hubmodule_team SET role = 'write' WHERE uid = 'teamuiduid11'", ) ln.ULabel(name="new label team default space").save() with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( "UPDATE hubmodule_team SET role = 'read' WHERE uid = 'teamuiduid11'", ) def test_locking(): artifact = ln.Artifact.get(description="test locking") artifact.description = "new description" with pytest.raises(ln.errors.NoWriteAccess) as e: artifact.save() assert "It is not allowed to modify or create locked" in str(e) def test_tracking_error(): # switch user role to write to create the transform and run with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( "UPDATE hubmodule_account SET role = 'write' WHERE id = %s", (user_uuid,) ) artifact = ln.Artifact.get(description="test tracking error") transform = ln.Transform(key="My transform").save() run = ln.Run(transform).save() # this error because ln.setup.settings.instance._db_permissions is not jwt # it is None with pytest.raises(ln.errors.NoWriteAccess) as e: track_run_input(artifact, run) assert "You’re not allowed to write to the instance " in str(e) # the instance is local so we set this manually ln.setup.settings.instance._db_permissions = "jwt" # artifact.space is not available for writes with pytest.raises(ln.errors.NoWriteAccess) as e: track_run_input(artifact, run) assert "You’re not allowed to write to the space " in str(e) # this artifact is locked artifact = ln.Artifact.get(description="test locking") with pytest.raises(ln.errors.NoWriteAccess) as e: track_run_input(artifact, run) assert "It is not allowed to modify locked records" in str(e) # switch user role back to read with 
psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute( "UPDATE hubmodule_account SET role = 'read' WHERE id = %s", (user_uuid,) ) # as the user is read-only now, 2 spaces are unavailable for writes (artifact.space, run.space) artifact = ln.Artifact.get(description="test tracking error") with pytest.raises(ln.errors.NoWriteAccess) as e: track_run_input(artifact, run) assert "You’re not allowed to write to the spaces " in str(e) ln.setup.settings.instance._db_permissions = None def test_token_reset(): db_token_manager.reset() # account_id is not set with pytest.raises(InternalError) as error: ln.ULabel.filter().count() assert "JWT is not set" in error.exconly() with pytest.raises(InternalError) as error, transaction.atomic(): ln.ULabel.filter().count() assert "JWT is not set" in error.exconly() def test_dbwrite_uninstall(): triggers_exist_query = ( "SELECT EXISTS (SELECT 1 FROM pg_trigger WHERE tgname LIKE 'dbwrite_%')" ) table_exists_query = "SELECT to_regclass('public.hubmodule_dbwrite') IS NOT NULL" with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute(triggers_exist_query) triggers_exist = cur.fetchone()[0] assert triggers_exist uninstall_dbwrite(pgurl, drop_table=False) with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute(triggers_exist_query) triggers_exist = cur.fetchone()[0] assert not triggers_exist cur.execute(table_exists_query) table_exists = cur.fetchone()[0] assert table_exists uninstall_dbwrite(pgurl, drop_table=True) with psycopg2.connect(pgurl) as conn, conn.cursor() as cur: cur.execute(table_exists_query) table_exists = cur.fetchone()[0] assert not table_exists def test_lamin_dev(): script_path = Path(__file__).parent.resolve() / "scripts/check_lamin_dev.py" subprocess.run( # noqa: S602 f"python {script_path}", shell=True, check=True, ) ================================================ FILE: tests/profiling/import_lamindb.py ================================================ import lamindb as ln # noqa: F401 ================================================ FILE: tests/profiling/import_lamindb_and_connect.py ================================================ import lamindb as ln # should connect to another instance than laminlabs/lamindata # because the former is used to log the test run ln.connect("laminlabs/lamin-site-assets") ================================================ FILE: tests/profiling/import_lamindb_core_storage.py ================================================ import lamindb.core.storage # noqa: F401 ================================================ FILE: tests/profiling/import_records_from_dataframe.py ================================================ import argparse from datetime import datetime from random import Random from time import perf_counter import lamindb as ln import pandas as pd def generate_values(dtype: str, n_rows: int, rng: Random): cell_types = [ "T cell", "B cell", "natural killer cell", "monocyte", "epithelial cell", ] if dtype in {"float", "num"}: return [round(rng.uniform(0.0, 100.0), 3) for _ in range(n_rows)] if dtype.startswith("cat["): return [rng.choice(cell_types) for _ in range(n_rows)] raise ValueError(f"Unsupported dtype: {dtype}") @ln.flow("JuJZZEsit1KV") def main(n_rows: int): feature_names = [ "age_or_mean_of_age_range", "array_col", "cell_type_by_model", ] rng = Random(0) features = ln.Feature.filter(name__in=feature_names) dtypes_by_feature = {feature.name: feature.dtype_as_str for feature in features} data: dict[str, list] = {} print("Generating random dataframe values...") for 
feature in features: data[feature.name] = generate_values( dtypes_by_feature[feature.name], n_rows, rng ) df = pd.DataFrame(data) print(df.head(5)) print("Running Record.from_dataframe()...") from_dataframe_start = perf_counter() records = ln.Record.from_dataframe( df, type=f"test-import-records-from-dataframe-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}", ) from_dataframe_duration_sec = perf_counter() - from_dataframe_start print(f"... completed in {from_dataframe_duration_sec:.6f}s") print("Saving records...") save_start = perf_counter() records.save() save_duration_sec = perf_counter() - save_start print(f"... completed in {save_duration_sec:.6f}s") run = ln.context.run params = run.params or {} params.update( { "from_dataframe_duration_sec": round(from_dataframe_duration_sec, 6), "save_duration_sec": round(save_duration_sec, 6), } ) run.params = params run.save() if __name__ == "__main__": parser = argparse.ArgumentParser( description="Prepare and optionally save test Records rows via Record.from_dataframe()." ) parser.add_argument("--rows", type=int, default=100) args = parser.parse_args() ln.connect("laminlabs/lamindata") main(n_rows=args.rows) ================================================ FILE: tests/storage/conftest.py ================================================ import shutil from pathlib import Path from subprocess import DEVNULL, run from time import perf_counter import lamindb as ln import lamindb_setup as ln_setup import pytest from lamin_utils import logger from laminci.db import setup_local_test_postgres def create_test_instance(pgurl: str): ln_setup.init( storage="./default_storage_unit_storage", modules="bionty", name="lamindb-unit-tests-storage", db=pgurl, ) ln_setup.register() # temporarily ln.settings.creation.artifact_silence_missing_run_warning = True ln.settings.track_run_inputs = False ln.Storage("s3://lamindb-ci/test-data").save() ln.Storage("s3://lamindb-test/core").save() ln.Storage("s3://lamindb-test/storage").save() def pytest_sessionstart(): t_execute_start = perf_counter() ln_setup._TESTING = True try: pgurl = setup_local_test_postgres() except RuntimeError: run("docker stop pgtest && docker rm pgtest", shell=True, stdout=DEVNULL) # noqa: S602 pgurl = setup_local_test_postgres() try: create_test_instance(pgurl) except Exception as e: print("failed to create test instance:", e) print("deleting the instance") delete_test_instance() # below currently fails because cannot create two instances in the same session # create_test_instance(pgurl) print("now rerun") quit() total_time_elapsed = perf_counter() - t_execute_start print(f"time to setup the instance: {total_time_elapsed:.1f}s") assert ln.Storage.filter(root="s3://lamindb-ci/test-data").one_or_none() is not None def delete_test_instance(): logger.set_verbosity(1) if Path("./default_storage_unit_storage").exists(): shutil.rmtree("./default_storage_unit_storage") # handle below better in the future for path in ( "s3://lamindb-test/storage/.lamindb", "s3://lamindb-test/core/.lamindb", "s3://lamindb-ci/lamindb-unit-tests-cloud/.lamindb", "s3://lamindb-ci/test-settings-switch-storage/.lamindb", ): upath = ln_setup.core.upath.UPath(path) if upath.exists(): upath.rmdir() ln_setup.delete("lamindb-unit-tests-storage", force=True) def pytest_sessionfinish(session: pytest.Session): delete_test_instance() run("docker stop pgtest && docker rm pgtest", shell=True, stdout=DEVNULL) # noqa: S602 @pytest.fixture def ccaplog(caplog): """Add caplog handler to our custom logger at session start.""" from 
lamin_utils._logger import logger # Add caplog's handler to our custom logger logger.addHandler(caplog.handler) yield caplog # Clean up at the end of the session logger.removeHandler(caplog.handler) ================================================ FILE: tests/storage/test_artifact_storage.py ================================================ import shutil import anndata as ad import lamindb as ln import pytest from lamindb.errors import ( IntegrityError, ) def test_create_from_anndata_in_existing_cloud_storage(): filepath = "s3://lamindb-test/core/scrnaseq_pbmc68k_tiny.h5ad" artifact = ln.Artifact.from_anndata( filepath, description="test_create_from_anndata_cloudpath" ) assert artifact.n_observations == 70 artifact.save() assert ln.Artifact.get(path=artifact.path) == artifact # check that the local filepath has been cleared assert not hasattr(artifact, "_local_filepath") assert artifact.path.as_posix().startswith("s3://lamindb-test/core") @pytest.mark.parametrize( "filepath_str", ["s3://lamindb-ci/test-data/test.parquet", "s3://lamindb-ci/test-data/test.csv"], ) @pytest.mark.parametrize("skip_check_exists", [False, True]) @pytest.mark.parametrize("skip_size_and_hash", [False, True]) def test_create_small_file_from_remote_path( filepath_str, skip_check_exists, skip_size_and_hash ): ln.settings.creation.artifact_skip_size_hash = skip_size_and_hash artifact = ln.Artifact( filepath_str, skip_check_exists=skip_check_exists, ) artifact.save() # test cache() file_from_local = ln.Artifact(artifact.cache(), description="test") # test hash equivalency when computed on local machine if not skip_size_and_hash: assert file_from_local.hash == artifact.hash assert file_from_local._hash_type == "md5" assert artifact._hash_type == "md5" assert artifact.path.as_posix() == filepath_str assert artifact.load().iloc[0].tolist() == [ 0, "Abingdon island giant tortoise", "Chelonoidis abingdonii", 106734, "ASM359739v1", "GCA_003597395.1", "Full genebuild", "-", "-", ] artifact.delete(permanent=True, storage=False) ln.settings.creation.artifact_skip_size_hash = False def test_versioning_arifact_from_existing_path(ccaplog): artifact1 = ln.Artifact("s3://lamindb-ci/test-data/test.parquet").save() artifact2 = ln.Artifact( "s3://lamindb-ci/test-data/test.csv", revises=artifact1 ).save() assert "you are saving to a non-latest version of the artifact" not in ccaplog.text assert artifact1.stem_uid == artifact2.stem_uid assert artifact1.uid != artifact2.uid artifact1.delete(permanent=True, storage=False) artifact2.delete(permanent=True, storage=False) def test_create_big_file_from_remote_path(): # the point of this test is check the multi-upload hash filepath_str = "s3://lamindb-test/core/human_immune.h5ad" # we don't use from_anndata() here because we test this with a small file for shorter run time artifact = ln.Artifact(filepath_str) assert not artifact._key_is_virtual assert artifact._real_key is None assert artifact.key == "human_immune.h5ad" assert artifact._hash_type == "md5-3" assert artifact.size == 21960324 assert artifact.path.as_posix() == filepath_str # check _real_key artifact = ln.Artifact(filepath_str, key="adata_test_key.h5ad") assert artifact._key_is_virtual assert artifact.key == "adata_test_key.h5ad" assert artifact._real_key.endswith("human_immune.h5ad") assert artifact.path.as_posix() == filepath_str def test_delete_artifact_from_non_managed_storage(): artifact = ln.Artifact( "s3://lamindb-dev-datasets/file-to-test-for-delete.csv", description="My test file to delete from non-default storage", ).save() 
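# --- illustrative sketch (added for documentation, not part of the original test) ---
# For artifacts registered from a storage location that the current instance does not
# manage, `artifact.delete()` alone raises IntegrityError because lamindb refuses to
# delete files in storage it does not own; the registry record can still be removed with
# `delete(storage=False, permanent=True)`, which is what the assertions below exercise.
# A minimal helper mirroring that pattern; defined only as a sketch and never called.
def _sketch_delete_record_only(some_artifact: ln.Artifact) -> None:
    """Remove the registry record while leaving the file in non-managed storage intact."""
    with pytest.raises(IntegrityError):
        some_artifact.delete()  # refuses to touch storage it does not manage
    some_artifact.delete(storage=False, permanent=True)  # record-only deletion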
assert artifact.storage.instance_uid != ln.setup.settings.instance.uid assert artifact.key is not None filepath = artifact.path with pytest.raises(IntegrityError) as e: artifact.delete() assert e.exconly().startswith( "lamindb.errors.IntegrityError: Cannot simply delete artifacts" ) artifact.delete(storage=False, permanent=True) assert ( ln.Artifact.filter( description="My test file to delete from non-default storage", branch_id=None, ).first() is None ) assert filepath.exists() def test_huggingface_paths(): artifact_adata = ln.Artifact( "hf://datasets/Koncopd/lamindb-test@main/anndata/pbmc68k_test.h5ad", description="hf adata", ) artifact_adata.save() assert artifact_adata.key == "anndata/pbmc68k_test.h5ad" assert artifact_adata.hash is not None assert isinstance(artifact_adata.load(), ad.AnnData) assert artifact_adata._cache_path.exists() artifact_adata._cache_path.unlink() artifact_pq = ln.Artifact( "hf://datasets/Koncopd/lamindb-test/sharded_parquet", description="hf parquet" ) artifact_pq.save() assert artifact_pq.hash is not None assert len(artifact_pq.open().files) == 11 assert artifact_pq.cache().is_dir() shutil.rmtree(artifact_pq._cache_path) artifact_adata.delete(permanent=True, storage=False) artifact_pq.delete(permanent=True, storage=False) def test_gcp_paths(): artifact_folder = ln.Artifact( "gs://rxrx1-europe-west4/images/test/HEPG2-08", description="Test GCP folder" ).save() assert artifact_folder.hash == "6r5Hkce0UTy7X6gLeaqzBA" assert artifact_folder.n_files == 14772 artifact_file = ln.Artifact( "gs://rxrx1-europe-west4/images/test/HEPG2-08/Plate1/B02_s1_w1.png", description="Test GCP file", ).save() assert artifact_file.hash == "foEgLjmuUHO62CazxN97rA" cache_path = artifact_file.cache() assert cache_path.is_file() cache_path.unlink() artifact_folder.delete(permanent=True, storage=False) artifact_file.delete(permanent=True, storage=False) def test_http_paths(): http_path = ln.UPath( "https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/README.md" ) artifact_readme = ln.Artifact(http_path, description="register http readme").save() # might change assert artifact_readme.hash is not None cache_path = artifact_readme.cache() assert cache_path.exists() assert cache_path.stat().st_size == http_path.stat().st_size cache_path.unlink() # just check saving for the second time (when Strage record is in the db) artifact_license = ln.Artifact( "https://raw.githubusercontent.com/laminlabs/lamindb/refs/heads/main/LICENSE", description="register http license", ).save() assert artifact_license.hash == "IQxRSNjvb7w2OLFeWqYlsg" artifact_readme.delete(permanent=True, storage=False) artifact_license.delete(permanent=True, storage=False) # also see test in lamindb-setup/tests/storage/test_storage_stats.py # there is also a test for GCP there def test_folder_like_artifact_s3(): study0_data = ln.Artifact("s3://lamindata/iris_studies/study0_raw_images") assert study0_data.hash == "IVKGMfNwi8zKvnpaD_gG7w" assert study0_data._hash_type == "md5-d" assert study0_data.n_files == 51 assert study0_data.size == 658465 def test_single_file_directory_preserved(tmp_path): local_dir = tmp_path / "single_file_dir" local_dir.mkdir() (local_dir / "only.txt").write_text("single file") storage = ln.Storage.get(root="s3://lamindb-test/storage") artifact = ln.Artifact( local_dir, key="tests/single-file-directory", storage=storage ).save() assert artifact.path.as_posix().startswith("s3://lamindb-test/storage") assert artifact.n_files == 1 assert artifact.path.is_dir() assert [file.name for file in 
artifact.path.iterdir()] == ["only.txt"] artifact.delete(permanent=True) ================================================ FILE: tests/storage/test_artifact_zarr.py ================================================ import shutil from pathlib import Path import anndata as ad import lamindb as ln import numpy as np import pandas as pd import pytest from lamindb.core.storage._zarr import identify_zarr_type from lamindb_setup.core.upath import ( CloudPath, ) @pytest.fixture(scope="session") def get_small_adata(): return ad.AnnData( X=np.array([[1, 2, 3], [4, 5, 6]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, ) def test_zarr_upload_cache(get_small_adata): previous_storage = ln.setup.settings.storage.root_as_str ln.settings.storage = "s3://lamindb-test/core" zarr_path = Path("./test_adata.zarr") get_small_adata.write_zarr(zarr_path) artifact = ln.Artifact(zarr_path, key="test_adata.zarr") assert not artifact._storage_ongoing assert artifact.otype == "AnnData" assert artifact.n_files >= 1 artifact.save() assert ln.Artifact.get(path=artifact.path) == artifact assert not artifact._storage_ongoing assert isinstance(artifact.path, CloudPath) assert artifact.path.exists() assert identify_zarr_type(artifact.path) == "anndata" shutil.rmtree(artifact.cache()) cache_path = artifact._cache_path assert isinstance(artifact.load(), ad.AnnData) assert cache_path.is_dir() shutil.rmtree(cache_path) assert not cache_path.exists() artifact.cache() assert cache_path.is_dir() artifact.delete(permanent=True, storage=True) shutil.rmtree(zarr_path) # test zarr from memory artifact = ln.Artifact(get_small_adata, key="test_adata.anndata.zarr") assert not artifact._storage_ongoing assert artifact._local_filepath.is_dir() assert artifact.otype == "AnnData" assert artifact.suffix == ".anndata.zarr" assert artifact.n_files >= 1 ln.save([artifact]) # use bulk save here for testing assert not artifact._storage_ongoing assert isinstance(artifact.path, CloudPath) assert artifact.path.exists() cache_path = artifact._cache_path assert cache_path.is_dir() shutil.rmtree(cache_path) assert not cache_path.exists() artifact._memory_rep = None assert isinstance(artifact.load(), ad.AnnData) assert cache_path.is_dir() artifact.delete(permanent=True, storage=True) ln.settings.storage = previous_storage ================================================ FILE: tests/storage/test_cache.py ================================================ import shutil from pathlib import Path from time import sleep import lamindb as ln import pytest from lamindb.core.loaders import load_h5ad from lamindb_setup._set_managed_storage import set_managed_storage # https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test # switch to cloud storage and back @pytest.fixture def switch_storage(): cloud_storage = "s3://lamindb-ci/lamindb-unit-tests-cloud" set_managed_storage(cloud_storage) yield cloud_storage set_managed_storage("./default_storage_unit_storage") def test_local_cache(): # check that we have local storage local_storage = Path("./default_storage_unit_storage").resolve().as_posix() assert ln.setup.settings.storage.root_as_str == local_storage test_file = ln.examples.datasets.anndata_file_pbmc68k_test() adata = load_h5ad(test_file) artifact = ln.Artifact.from_anndata(adata, key="test_cache.h5ad") temp_path = artifact._local_filepath.resolve() assert temp_path.exists() assert ln.setup.settings.cache_dir in temp_path.parents artifact.save() assert 
artifact.path.exists() assert not temp_path.exists() artifact.delete(permanent=True) # check directories adata_zarr_pth = Path("test_adata.zarr") adata.write_zarr(adata_zarr_pth) assert adata_zarr_pth.exists() artifact = ln.Artifact(adata_zarr_pth, key="test_cache.zarr").save() assert adata_zarr_pth.exists() assert artifact.path.exists() assert artifact.path.name != artifact.key shutil.rmtree(adata_zarr_pth) artifact.delete(permanent=True) # check directories in cache cache_dir = ln.setup.settings.cache_dir adata_zarr_pth = cache_dir / "test_adata.zarr" adata.write_zarr(adata_zarr_pth) artifact = ln.Artifact(adata_zarr_pth, key="test_cache.zarr") assert adata_zarr_pth.exists() artifact.save() assert not adata_zarr_pth.exists() assert artifact.path.exists() assert artifact.path.name != artifact.key artifact.delete(permanent=True) def test_cloud_cache(switch_storage): # check that we have cloud storage assert ln.setup.settings.storage.root_as_str == switch_storage cache_dir = ln.setup.settings.cache_dir assert cache_dir is not None test_file = ln.examples.datasets.anndata_file_pbmc68k_test() # test cache for saving an in-memory object adata = load_h5ad(test_file) artifact = ln.Artifact.from_anndata(adata, key="test_cache.h5ad") temp_path = artifact._local_filepath.resolve() assert cache_dir in temp_path.parents artifact.save() assert not temp_path.exists() cloud_path = artifact.path cache_path = artifact._cache_path assert cache_path.exists() assert ( cache_path == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad" ) assert cloud_path.modified.timestamp() < cache_path.stat().st_mtime artifact.delete(permanent=True) # test cache for saving an on-disk object artifact = ln.Artifact.from_anndata(test_file, key="test_cache.h5ad") artifact.save() cloud_path = artifact.path cache_path = artifact._cache_path assert cache_path.exists() assert ( cache_path == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad" ) assert test_file.stat().st_mtime < cache_path.stat().st_mtime assert cloud_path.modified.timestamp() < cache_path.stat().st_mtime artifact.delete(permanent=True) # test cache for a directory on-disk object outside the cache dir adata_zarr_pth = Path("test_adata.zarr") adata.write_zarr(adata_zarr_pth) artifact = ln.Artifact(adata_zarr_pth, key="test_cache.zarr") artifact.save() assert adata_zarr_pth.is_dir() cache_path = artifact._cache_path assert cache_path.is_dir() assert ( cache_path == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.zarr" ) shutil.rmtree(adata_zarr_pth) artifact.delete(permanent=True) # inside the cache dir adata_zarr_pth = cache_dir / "test_adata.zarr" adata.write_zarr(adata_zarr_pth) artifact = ln.Artifact(adata_zarr_pth, key="test_cache.zarr") assert adata_zarr_pth.exists() artifact.save() assert not adata_zarr_pth.exists() cache_path = artifact._cache_path assert cache_path.is_dir() assert ( cache_path == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.zarr" ) artifact.delete(permanent=True) def test_cloud_cache_versions(switch_storage): adata = load_h5ad(ln.examples.datasets.anndata_file_pbmc68k_test()) cache_dir = ln.setup.settings.cache_dir assert cache_dir is not None artifact = ln.Artifact.from_anndata(adata, key="test_cache.h5ad") assert ln.settings.cache_dir in artifact._local_filepath.parents artifact.save() cache_path_v1 = artifact.cache() assert cache_path_v1.exists() assert ( cache_path_v1 == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad" ) cache_path_v1.unlink() 
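# --- illustrative sketch (added for documentation, not part of the original test) ---
# `artifact.cache()` restores a deleted local copy: after unlinking the cached file,
# calling `cache()` again re-downloads the artifact into ln.setup.settings.cache_dir,
# which is what the surrounding assertions verify. A minimal helper capturing that
# round trip; defined only as a sketch and never called.
def _sketch_recache(some_artifact: ln.Artifact):
    """Drop the local cache copy of a saved artifact and restore it via cache()."""
    local_copy = some_artifact.cache()
    local_copy.unlink()  # remove the local copy only, the cloud object stays
    assert not local_copy.exists()
    restored = some_artifact.cache()  # re-download into the cache directory
    assert restored.exists()
    return restored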
artifact.cache(print_progress=False) assert cache_path_v1.exists() assert ( cache_path_v1 == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad" ) timestamp_v1 = cache_path_v1.stat().st_mtime # hope it is enough to avoid random timestamp problems further sleep(1) # new version adata.obs["test_cache"] = "test" artifact_v2 = ln.Artifact.from_anndata( adata, key="test_cache.h5ad", revises=artifact ) assert ln.settings.cache_dir in artifact_v2._local_filepath.parents artifact_v2.save() assert artifact_v2.is_latest assert not artifact.is_latest cache_path_v2 = artifact_v2.cache() assert cache_path_v2.exists() assert ( cache_path_v2 == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad" ) assert cache_path_v2.stat().st_mtime > timestamp_v1 cache_path_v2.unlink() artifact_v2.cache(mute=True) assert cache_path_v2.exists() assert ( cache_path_v2 == cache_dir / "lamindb-ci/lamindb-unit-tests-cloud/test_cache.h5ad" ) assert "test_cache" in load_h5ad(cache_path_v2).obs.columns cache_mtime = cache_path_v2.stat().st_mtime assert cache_mtime == artifact_v2.path.modified.timestamp() assert cache_mtime > timestamp_v1 # old version cache ignores key cache_path_v1 = artifact.cache() assert cache_path_v1.exists() assert cache_path_v1.name == f"{artifact.uid}.h5ad" artifact_v2.versions.delete(permanent=True) def test_corrupted_cache_local(): filepath = ln.examples.datasets.anndata_file_pbmc68k_test() artifact = ln.Artifact.from_anndata(filepath, key="test_corrupt_cache_local.h5ad") artifact.save() # corrupt cache with open(artifact._cache_path, "r+b") as f: f.write(b"corruption") # just raises an exception, nothing to re-sync on local with pytest.raises(OSError): artifact.load() with pytest.raises(OSError): artifact.open() artifact.delete(permanent=True) def test_corrupted_cache_cloud(switch_storage): # check that we have cloud storage assert ln.setup.settings.storage.root_as_str == switch_storage filepath = ln.examples.datasets.anndata_file_pbmc68k_test() artifact = ln.Artifact.from_anndata(filepath, key="test_corrupt_cache_cloud.h5ad") artifact.save() # corrupt cache # sleep not to reset cache mtime to a smaller value # it is increased artificially on cache copying in save # so due to lower granularity of cloud mtimes and fast code execution # after the change cache mtime can become smaller than cloud mtime sleep(1) with open(artifact._cache_path, "r+b") as f: f.write(b"corruption") assert artifact._cache_path.stat().st_mtime > artifact.path.stat().st_mtime # check that it is indeed corrupted with pytest.raises(OSError): load_h5ad(artifact.cache()) # should load successfully artifact.load() # check open also assert artifact._cache_path.exists() with open(artifact._cache_path, "r+b") as f: f.write(b"corruption") # should open successfully with artifact.open(): pass # corrupted cache has been deleted assert not artifact._cache_path.exists() artifact.delete(permanent=True) ================================================ FILE: tests/storage/test_connect_reconnect.py ================================================ import lamindb as ln import pytest def test_connect_reconnect(): # testuser2 needs write access lamin-site-assets because of a fluke # in the legacy collaborator management, it seems assert ln.setup.settings.user.handle == "testuser2" ln.connect("lamindb-unit-tests-storage") # this is not changing anything count1 = ln.Artifact.filter().count() # a public instance that does not have bionty configured ln.connect("laminlabs/lamin-site-assets") count2 = 
ln.Artifact.filter().count() assert count1 != count2 with pytest.raises(ln.setup.errors.ModuleWasntConfigured): import bionty as bt ln.connect("lamindb-unit-tests-storage") import bionty as bt count3 = bt.Gene.filter().count() assert count2 != count3 ================================================ FILE: tests/storage/test_storage_lifecycle.py ================================================ from pathlib import Path import lamindb as ln import pytest from lamindb_setup.core._hub_core import get_storage_records_for_instance def check_storage_location_on_hub_exists(uid: str): all_storage_records = get_storage_records_for_instance( ln.setup.settings.instance._id ) length = len([r for r in all_storage_records if r["lnid"] == uid]) if length not in {0, 1}: raise AssertionError( f"Expected 0 or 1 storage records for uid {uid}, found {length}." ) return length == 1 def test_reference_storage_location(ccaplog): ln.Artifact("s3://lamindata/iris_studies/study0_raw_images") assert ln.Storage.get(root="s3://lamindata").instance_uid == "4XIuR0tvaiXM" # assert ( # "referenced read-only storage location at s3://lamindata, is managed by instance with uid 4XIuR0tvaiXM" # in ccaplog.text # ) def test_switch_delete_storage_location(): ln.settings.storage = "./default_storage_unit_storage" assert ( ln.settings.storage.root.resolve() == Path("./default_storage_unit_storage").resolve() ) new_storage_location = "s3://lamindb-ci/test-settings-switch-storage" ln.Storage(new_storage_location).save() ln.settings.storage = new_storage_location assert ln.setup.settings.storage.type_is_cloud assert ln.setup.settings.storage.root_as_str == new_storage_location # root.fs contains the underlying fsspec filesystem # the following is set by lamindb to True for s3 by default assert ln.setup.settings.storage.root.fs.cache_regions ln.settings.storage = new_storage_location, {"cache_regions": False} assert not ln.setup.settings.storage.root.fs.cache_regions assert ln.setup.settings.storage.root.exists() # now work with the new storage location new_storage = ln.Storage.get(root=new_storage_location) assert check_storage_location_on_hub_exists(new_storage.uid) artifact = ln.Artifact(".gitignore", key="test_artifact").save() assert new_storage.root in artifact.path.as_posix() # artifacts exist with pytest.raises(AssertionError) as err: new_storage.delete() assert "Cannot delete storage with artifacts in current instance." in err.exconly() artifact.delete(permanent=True, storage=False) # still some files in there with pytest.raises(ln.setup.errors.StorageNotEmpty) as err: new_storage.delete() assert ( "'s3://lamindb-ci/test-settings-switch-storage/.lamindb' contains 1 objects" in err.exconly() ) # now delete the artifact so that the storage location is empty artifact.path.unlink() with pytest.raises(AssertionError) as err: new_storage.delete() assert ( "Cannot delete the current storage location, switch to another." 
in err.exconly() ) # check all attempts unsuccessful so far assert check_storage_location_on_hub_exists(new_storage.uid) # switch back to default storage ln.settings.storage = "./default_storage_unit_storage" storage_marker = ln.UPath(new_storage_location) / ".lamindb/storage_uid.txt" assert storage_marker.exists() new_storage.delete() assert not check_storage_location_on_hub_exists(new_storage.uid) assert not storage_marker.exists() ================================================ FILE: tests/storage/test_streaming.py ================================================ import gzip import shutil from pathlib import Path import anndata as ad import h5py import lamindb as ln import numpy as np import pandas as pd import pytest import zarr from lamindb.core.loaders import load_h5ad from lamindb.core.storage._anndata_accessor import _anndata_n_observations, _to_index from lamindb.core.storage._backed_access import ( _flat_suffixes, backed_access, ) from lamindb.core.storage._polars_lazy_df import _open_polars_lazy_df, _polars_options from lamindb.core.storage._pyarrow_dataset import _open_pyarrow_dataset from lamindb.core.storage._zarr import load_zarr from lamindb.core.storage.objects import infer_suffix, write_to_disk @pytest.fixture def bad_adata_path(): fp = ln.examples.datasets.anndata_file_pbmc68k_test() adata = load_h5ad(fp) to = fp.with_name("pbmc68k_bad.h5ad") shutil.copy(fp, to) fp = to file = h5py.File(fp, mode="r+") for field_name in ("obs", "var"): field = getattr(adata, field_name).to_records() formats = [] for name, (dt, _) in field.dtype.fields.items(): if dt == "O": new_dt = str(field[name].astype(str).dtype).replace(" 50 assert "artifact" in captured.out.lower() def test_transfer_from_remote_to_local(ccaplog): """Test transfer from remote to local instance.""" bt.Gene.filter().delete(permanent=True) bt.Organism.filter().delete(permanent=True) ln.ULabel.filter().delete(permanent=True) bt.CellType.filter().delete(permanent=True) # test transfer from an instance with an extra schema module: pertdb # we also made sure that the artifact here has a pertdb label attached # transfer 1st artifact artifact1 = ln.Artifact.connect("laminlabs/lamin-dev").get("livFRRpM") # test describe postgres result = get_artifact_or_run_with_related( artifact1, include_m2m=True, include_fk=True, include_feature_link=True, include_schema=True, ) assert result["related_data"]["m2m"]["tissues"] == { 2: { "id": 2, "uid": "6VHBo6XsJZqmaQ", "abbr": None, "name": "cortex of kidney", "tissue": 2, "feature": None, "ontology_id": "UBERON:0001225", "tissue_display": "cortex of kidney", } } assert sorted( result["related_data"]["link"]["links_ulabel"], key=lambda d: d["id"] ) == [ { "id": 7, "uid": "ydyPUMjh", "name": "donor_24", "ulabel": 15, "feature": 1, "reference": None, "reference_type": None, "ulabel_display": "donor_24", }, { "id": 8, "uid": "JJ3d8a2v", "name": "na", "ulabel": 10, "feature": 10, "reference": None, "reference_type": None, "ulabel_display": "na", }, ] assert result["related_data"]["m2m_schemas"][615][0] == "obs" assert result["related_data"]["m2m_schemas"][615][1] == { "Feature": [ "donor_id", "development_stage", "disease", "cell_type", "sex", "assay", "tissue", "self_reported_ethnicity", "tissue_type", "suspension_type", "organism", ] } assert result["related_data"]["fk"]["storage"] == { "id": 4, "name": "s3://cellxgene-data-public", } id_remote = artifact1.id run_remote = artifact1.run transform_remote = artifact1.transform created_by_remote = artifact1.created_by storage_remote = 
artifact1.storage organism_remote = artifact1.organisms.get(name="human") artifact1.save(transfer="annotations") # assert MODULE_WASNT_CONFIGURED_MESSAGE_TEMPLATE.format("pertdb") in ccaplog.text # check all ids are adjusted assert id_remote != artifact1.id assert run_remote != artifact1.run assert transform_remote != artifact1.transform assert created_by_remote.handle != artifact1.created_by.handle assert storage_remote.uid == artifact1.storage.uid assert storage_remote.created_at == artifact1.storage.created_at organism = artifact1.organisms.get(name="human") assert organism.created_at != organism_remote.created_at # now check that this is idempotent and we can run it again artifact_repeat = ln.Artifact.connect("laminlabs/lamin-dev").get( "livFRRpMaOgb3y8U2mK2" ) artifact_repeat.save(transfer="annotations") # now prepare a new test case # mimic we have an existing feature with a different uid but same name feature = ln.Feature.get(name="organism") feature.uid = "existing" feature.save() # transfer 2nd artifact artifact2 = ln.Artifact.connect("laminlabs/lamin-dev").get("qz35YaRk") artifact2.save(transfer="annotations") # check the feature name assert artifact2.organisms.get(name="mouse") assert ( artifact1.features.slots["obs"].members.get(name="organism").uid == "existing" ) # test transfer from an instance with fewer modules (laminlabs/lamin-site-assets) artifact3 = ln.Artifact.connect("laminlabs/lamin-site-assets").get( "lgRNHNtMxjU0y8nIagt7" ) # test that implicit saving through `load()` works (also occurs for `cache()` or `open()` for run input tracking) artifact3.load() # delete with storage=False, because these are all stored in the source instances artifact1.delete(storage=False, permanent=True) artifact2.delete(storage=False, permanent=True) artifact3.delete( storage=False ) # there is an issue here with permanent deletion because of schema module mismatch def test_transfer_into_space(): # grab any ulabel from the default space ulabel = ln.ULabel.connect("laminlabs/lamin-dev").filter(space__id=1).first() space = ln.Space(name="space for transfer", uid="00000123").save() with patch.object(ln.context, "_space", new=space): ulabel.save() assert ulabel.space_id == space.id ulabel.delete(permanent=True) space.delete() def test_using_record_organism(): """Test passing record and organism to the using_key instance.""" import bionty as bt release_110_cxg = bt.Source.connect("laminlabs/lamin-dev").get( organism="mouse", entity="bionty.Gene", version="release-110" ) release_112_cxg = bt.Source.connect("laminlabs/lamin-dev").get( organism="mouse", entity="bionty.Gene", version="release-112" ) release_110 = release_110_cxg.save() # transfer source record release_110_cxg = ( # re-fetch bt.Source.connect("laminlabs/lamin-dev").get( organism="mouse", entity="bionty.Gene", version="release-110" ) ) # passing the wrong source inspector = bt.Gene.connect("laminlabs/lamin-dev").inspect( ["ENSMUSG00000102862", "ENSMUSG00000084826"], field=bt.Gene.ensembl_gene_id, source=release_112_cxg, strict_source=True, ) assert len(inspector.validated) == 0 # passing the correct source inspector = bt.Gene.connect("laminlabs/lamin-dev").inspect( ["ENSMUSG00000102862", "ENSMUSG00000084826"], field=bt.Gene.ensembl_gene_id, source=release_110_cxg, strict_source=True, ) assert len(inspector.validated) == 2 # passing the correct source but from the wrong instance with pytest.raises(ValueError) as error: inspector = bt.Gene.connect("laminlabs/lamin-dev").inspect( ["ENSMUSG00000102862", "ENSMUSG00000084826"], 
field=bt.Gene.ensembl_gene_id, source=release_110, ) assert ( "record must be a bionty.Source record from instance 'laminlabs/lamin-dev'" in str(error.value) ) def test_using_query_by_feature(): assert ln.Artifact.connect("laminlabs/cellxgene").filter(n_of_donors__gte=100) # TODO: uncomment after migrations # def test_transfer_features_uid(): # """Test that a new feature is created based on uid.""" # existing_tissue_feature = ( # ln.Feature.connect("laminlabs/lamin-dev").get(name="tissue").save() # ) # artifact = ln.Artifact.connect("laminlabs/pertdata").get("aT2dp4hC6XDwrafN") # artifact.save(transfer="annotations") # # now a new feature called "tissue" is created because the uid is different # newly_transferred_tissue_feature = ln.Feature.get( # name="tissue", schemas__artifacts__uid=artifact.uid # ) # assert existing_tissue_feature.uid != newly_transferred_tissue_feature.uid ================================================ FILE: tests/tiledbsoma/conftest.py ================================================ import os import shutil from pathlib import Path from time import perf_counter import lamindb as ln import lamindb_setup as ln_setup import numpy as np import pandas as pd import pytest from lamin_utils import logger def pytest_sessionstart(): t_execute_start = perf_counter() ln_setup._TESTING = True os.environ["LAMIN_TESTING"] = "true" os.environ["LAMINDB_TEST_DB_VENDOR"] = "sqlite" print("running tests on SQLite") ln.setup.init( storage="./default_storage_tiledbsoma", modules="bionty", name="lamindb-unit-tests-tiledbsoma", ) ln.settings.creation.artifact_silence_missing_run_warning = True # Pre-register remote roots used in tests so `ln.settings.storage = ...` # doesn't prompt for interactive confirmation under pytest capture. ln.Storage("s3://lamindb-test/tiledbsoma").save() total_time_elapsed = perf_counter() - t_execute_start print(f"time to setup the instance: {total_time_elapsed:.1f}s") def pytest_sessionfinish(session: pytest.Session): logger.set_verbosity(1) if Path("./default_storage_tiledbsoma").exists(): shutil.rmtree("./default_storage_tiledbsoma") upath = ln_setup.core.upath.UPath("s3://lamindb-test/tiledbsoma") if upath.exists(): upath.rmdir() ln.setup.delete("lamindb-unit-tests-tiledbsoma", force=True) del os.environ["LAMIN_TESTING"] @pytest.fixture(scope="session") def adata_file(): import anndata as ad adata = ad.AnnData( X=np.array([[1, 2, 3], [4, 5, 6]]), obs={"feat1": ["A", "B"]}, var=pd.DataFrame(index=["MYC", "TCF7", "GATA1"]), obsm={"X_pca": np.array([[1, 2], [3, 4]])}, ) filepath = Path("adata_file.h5ad") adata.write(filepath) yield "adata_file.h5ad" filepath.unlink(missing_ok=True) @pytest.fixture(scope="function") def clean_soma_files(request): path = request.param if hasattr(request, "param") else "small_dataset.tiledbsoma" if Path(path).exists(): shutil.rmtree(path) yield path if Path(path).exists(): shutil.rmtree(path) @pytest.fixture(scope="function") def soma_experiment_file(clean_soma_files): import tiledbsoma.io adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData") tiledbsoma.io.from_anndata("test.tiledbsoma", adata, measurement_name="RNA") yield "test.tiledbsoma" if Path("test.tiledbsoma").exists(): shutil.rmtree("test.tiledbsoma") ================================================ FILE: tests/tiledbsoma/test_artifact_basics.py ================================================ import lamindb as ln import pytest from lamindb.models.artifact import data_is_soma_experiment def test_create_from_soma_experiment(soma_experiment_file, 
adata_file):
    with pytest.raises(ValueError) as error:
        ln.Artifact.from_tiledbsoma(adata_file, description="test1")
    assert (
        "data has to be a SOMA Experiment object or a path to SOMA Experiment store."
        in error.exconly()
    )

    af = ln.Artifact.from_tiledbsoma(soma_experiment_file, description="test1")
    assert af.description == "test1"
    assert af.key is None
    assert af.otype == "tiledbsoma"
    assert af.n_observations == 3


def test_data_is_soma_experiment_paths():
    assert data_is_soma_experiment("something.tiledbsoma")
    assert data_is_soma_experiment(ln.UPath("something.tiledbsoma"))


def test_data_is_soma_experiment(soma_experiment_file):
    import tiledbsoma

    with tiledbsoma.Experiment.open(soma_experiment_file) as store:
        assert data_is_soma_experiment(store)


================================================
FILE: tests/tiledbsoma/test_curators.py
================================================
import shutil

import bionty as bt
import lamindb as ln
import pytest
import tiledbsoma
import tiledbsoma.io


def test_tiledbsoma_curator(clean_soma_files):
    """Test TiledbSomaExperimentCurator with schema."""
    obs_schema = ln.Schema(
        features=[
            ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
            ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
        ],
    ).save()
    var_schema = ln.Schema(
        features=[
            ln.Feature(name="var_id", dtype=bt.Gene.ensembl_gene_id).save(),
        ],
        coerce=True,
    ).save()
    soma_schema = ln.Schema(
        otype="tiledbsoma",
        slots={
            "obs": obs_schema,
            "ms:RNA": var_schema,
        },
    ).save()

    # Convert AnnData to SOMA format
    adata = ln.examples.datasets.mini_immuno.get_dataset1(otype="AnnData")
    tiledbsoma.io.from_anndata(
        "small_dataset.tiledbsoma", adata, measurement_name="RNA"
    )

    # Test with invalid dataset
    with pytest.raises(ln.errors.InvalidArgument) as e:
        ln.curators.TiledbsomaExperimentCurator(adata, soma_schema)
    assert "dataset must be SOMAExperiment-like." in str(e.value)

    # Test with invalid schema
    with tiledbsoma.Experiment.open("small_dataset.tiledbsoma") as experiment:
        with pytest.raises(ln.errors.InvalidArgument) as e:
            ln.curators.TiledbsomaExperimentCurator(experiment, schema=var_schema)
        assert "Schema otype must be 'tiledbsoma'." in str(e.value)

    with tiledbsoma.Experiment.open("small_dataset.tiledbsoma") as experiment:
        curator = ln.curators.TiledbsomaExperimentCurator(experiment, soma_schema)
        assert "obs" in curator.slots
        assert "ms:RNA" in curator.slots
        curator.validate()
        artifact = curator.save_artifact(
            key="examples/soma_experiment.tiledbsoma",
            description="SOMA experiment with schema validation",
        )

    assert artifact.schema == soma_schema
    assert "obs" in artifact.features.slots
    assert "ms:RNA" in artifact.features.slots

    # Check feature values are properly annotated
    assert set(artifact.features.get_values()["cell_type_by_expert"]) == {
        "CD8-positive, alpha-beta T cell",
        "B cell",
    }
    assert set(artifact.features.get_values()["cell_type_by_model"]) == {
        "T cell",
        "B cell",
    }

    # Altered data (gene typo)
    adata_typo = ln.examples.datasets.mini_immuno.get_dataset1(
        otype="AnnData", with_gene_typo=True
    )
    typo_soma_path = "./mini_immuno_dataset1_typo.tiledbsoma"
    tiledbsoma.io.from_anndata(typo_soma_path, adata_typo, measurement_name="RNA")

    with tiledbsoma.Experiment.open(typo_soma_path) as experiment_typo:
        curator_typo = ln.curators.TiledbsomaExperimentCurator(
            experiment_typo, soma_schema
        )
        # Validation should fail due to typo
        with pytest.raises(ln.errors.ValidationError) as error:
            curator_typo.validate()
        assert "GeneTypo" in str(error.value)

    # Clean up
    shutil.rmtree(typo_soma_path)
    artifact.delete(permanent=True)
    soma_schema.delete(permanent=True)
    var_schema.delete(permanent=True)
    obs_schema.delete(permanent=True)


================================================
FILE: tests/tiledbsoma/test_storage.py
================================================
import shutil
from pathlib import Path

import lamindb as ln
import numpy as np
import pytest
import tiledbsoma
import tiledbsoma.io
from lamindb.core.loaders import load_h5ad
from lamindb.core.storage._tiledbsoma import (
    SOMAS3ContextFactory,
    _open_tiledbsoma,
    _soma_store_n_observations,
)
from lamindb.integrations import save_tiledbsoma_experiment


@pytest.mark.parametrize("storage", [None, "s3://lamindb-test/tiledbsoma"])
def test_write_read_tiledbsoma(storage):
    if storage is not None:
        previous_storage = ln.setup.settings.storage.root_as_str
        ln.settings.storage = storage

    test_file = ln.examples.datasets.anndata_file_pbmc68k_test()
    adata = load_h5ad(test_file)
    # write less
    adata = adata[:5, :2].copy()
    del adata.varp
    del adata.obsp
    del adata.layers
    del adata.uns  # seems to cause problems for append

    if storage is None:
        # test local with zarr
        test_file = test_file.with_suffix(".zarr")
        adata.write_zarr(test_file)
    else:
        adata.write_h5ad(test_file)

    create_transform = ln.Transform(key="test create tiledbsoma store").save()
    create_run = ln.Run(create_transform).save()

    # fails with a view
    with pytest.raises(ValueError, match="Can not write an `AnnData` view"):
        save_tiledbsoma_experiment([adata[:2]], run=create_run, measurement_name="RNA")

    artifact_soma = save_tiledbsoma_experiment(
        [test_file],
        description="test tiledbsoma",
        key="scrna/my-big-dataset.tiledbsoma",  # can also be None, but that's trivial
        run=create_run,
        measurement_name="RNA",
    )
    assert artifact_soma.path.stem == artifact_soma.uid[:16]
    assert artifact_soma.key == "scrna/my-big-dataset.tiledbsoma"
    assert artifact_soma.suffix == ".tiledbsoma"
    assert artifact_soma._key_is_virtual
    assert artifact_soma.otype == "tiledbsoma"
    assert artifact_soma.n_observations == adata.n_obs

    with artifact_soma.open() as store:  # mode="r" by default
        assert isinstance(store, tiledbsoma.Experiment)

        obs = store["obs"]
        n_obs = len(obs)
        assert n_obs == adata.n_obs
        assert "lamin_run_uid" in obs.schema.names
        run_ids = (
            obs.read(column_names=["lamin_run_uid"])
            .concat()
            .to_pandas()["lamin_run_uid"]
        )
        assert all(run_ids == create_run.uid)
        assert set(run_ids.cat.categories) == {create_run.uid}

        # test reading X
        ms_rna = store.ms["RNA"]
        n_vars = len(ms_rna.var)
        assert n_vars == adata.n_vars
        X = ms_rna["X"]["data"].read().coos((n_obs, n_vars)).concat().to_scipy()
        assert X.sum() == adata.X.sum()

    cache_path = artifact_soma.cache()
    hash_before_changes = artifact_soma.hash

    with artifact_soma.open(mode="w") as store:
        assert store.__class__.__name__ == "ExperimentTrack"
        tiledbsoma.io.add_matrix_to_collection(
            exp=store,
            measurement_name="RNA",
            collection_name="obsm",
            matrix_name="test_array",
            matrix_data=np.ones((n_obs, 2)),
        )
    assert artifact_soma.hash != hash_before_changes
    assert artifact_soma.uid.endswith("0001")
    if storage is not None:
        # cache should be ignored and deleted after the changes
        assert not cache_path.exists()
    else:
        assert artifact_soma.path == cache_path

    adata_to_append_1 = adata[:3].copy()
    adata_to_append_1.obs["obs_id"] = adata_to_append_1.obs.index.to_numpy() + "***"
    adata_to_append_1.var["var_id"] = adata_to_append_1.var.index
    adata_to_append_2 = adata[3:5].copy()
    adata_to_append_2.obs["obs_id"] = adata_to_append_2.obs.index.to_numpy() + "***"
    adata_to_append_2.var["var_id"] = adata_to_append_2.var.index
    adata_to_append_2.write_h5ad("adata_to_append_2.h5ad")

    append_transform = ln.Transform(key="test append tiledbsoma store").save()
    append_run = ln.Run(append_transform).save()

    # here run should be passed
    with pytest.raises(ValueError, match="Pass `run`"):
        save_tiledbsoma_experiment(
            [adata_to_append_1],
            revises=artifact_soma,
            run=None,
            measurement_name="RNA",
        )

    artifact_soma_append = save_tiledbsoma_experiment(
        [adata_to_append_1, "adata_to_append_2.h5ad"],
        revises=artifact_soma,
        run=append_run,
        measurement_name="RNA",
        append_obsm_varm=True,
    )
    assert artifact_soma_append.uid.endswith("0002")

    artifact_soma.refresh_from_db()
    assert not artifact_soma.is_latest
    match = "its files were overwritten and are no longer available"
    with pytest.raises(ValueError, match=match):
        artifact_soma.open()
    with pytest.raises(ValueError, match=match):
        artifact_soma.load()
    with pytest.raises(ValueError, match=match):
        artifact_soma.cache()

    # below is inherited from "scrna/my-big-dataset.tiledbsoma"
    assert artifact_soma_append.key == "scrna/my-big-dataset.tiledbsoma"

    # wrong mode, should be either r or w for tiledbsoma
    with pytest.raises(ValueError):
        artifact_soma_append.open(mode="p")

    # test running without the context manager
    store = artifact_soma_append.open()
    n_obs_final = adata.n_obs + sum(
        adt.n_obs for adt in [adata_to_append_1, adata_to_append_2]
    )
    obs = store["obs"]
    assert len(obs) == n_obs_final == artifact_soma_append.n_observations
    run_ids = (
        obs.read(column_names=["lamin_run_uid"])
        .concat()
        .to_pandas()["lamin_run_uid"]
        .cat.categories
    )
    assert set(run_ids) == {create_run.uid, append_run.uid}
    store.close()

    # test correctness of deletion for _overwrite_versions=True
    soma_path = artifact_soma_append.path
    assert soma_path.exists()
    # select specific version and delete
    # check that the store is still there
    assert soma_path.exists()
    assert ln.Artifact.filter(description="test tiledbsoma").count() == 3
    artifact_soma_append.versions.filter(uid__endswith="0001").one().delete(
        permanent=True
    )
    assert soma_path.exists()
    assert ln.Artifact.filter(description="test tiledbsoma").count() == 2
    # make sure the store is actually deleted
    artifact_soma_append.delete(permanent=True)
    assert not soma_path.exists()
    assert not ln.Artifact.filter(description="test tiledbsoma").exists()

    Path("adata_to_append_2.h5ad").unlink()

    if storage is not None:
        ln.settings.storage = previous_storage


def test_from_tiledbsoma():
    test_file = ln.examples.datasets.anndata_file_pbmc68k_test()
    soma_path = "mystore.tiledbsoma"
    tiledbsoma.io.from_h5ad(soma_path, test_file, measurement_name="RNA")

    # wrong suffix
    with pytest.raises(ValueError):
        ln.Artifact.from_tiledbsoma("mystore")

    artifact = ln.Artifact.from_tiledbsoma(
        soma_path, description="test soma store"
    ).save()
    assert artifact.n_observations == 30

    with _open_tiledbsoma(artifact.path, mode="r") as store:
        # experiment
        assert _soma_store_n_observations(store) == 30
        # dataframe
        assert _soma_store_n_observations(store.obs) == 30
        # treat as unstructured collection, data + raw
        assert _soma_store_n_observations(store.ms) == 60
        # measurement
        assert _soma_store_n_observations(store.ms["RNA"]) == 30
        # array
        assert _soma_store_n_observations(store.ms["RNA"]["X"]["data"]) == 30

    artifact.delete(permanent=True)
    shutil.rmtree(soma_path)


def test_tiledb_config():
    storepath = ln.UPath("s3://bucket/key?endpoint_url=http://localhost:9000/s3")
    tiledb_config = SOMAS3ContextFactory(storepath).get_context().tiledb_config
    assert tiledb_config["vfs.s3.endpoint_override"] == "localhost:9000/s3"
    assert tiledb_config["vfs.s3.scheme"] == "http"
    assert tiledb_config["vfs.s3.use_virtual_addressing"] == "false"
    assert tiledb_config["vfs.s3.region"] == ""


def test_tiledbsoma_in_managed_storage():
    artifact = ln.Artifact.connect("laminlabs/lamindata").get(
        key="example_datasets/small_dataset1.tiledbsoma"
    )
    path = artifact.path
    assert "session" in path.storage_options

    ctx_factory = SOMAS3ContextFactory(path)
    assert ctx_factory._refreshable_credentials is not None
    ctx = ctx_factory.get_context()
    tiledb_config = ctx.tiledb_config
    assert "vfs.s3.aws_access_key_id" in tiledb_config
    assert "vfs.s3.aws_secret_access_key" in tiledb_config
    assert "vfs.s3.aws_session_token" in tiledb_config

    path_str = path.as_posix()
    # check with managed credentials
    with tiledbsoma.Experiment.open(path_str, mode="r", context=ctx) as store:
        assert _soma_store_n_observations(store) == 3
    # check with anon, s3://lamindata is public
    with _open_tiledbsoma(ln.UPath(path_str, anon=True), mode="r") as store:
        assert _soma_store_n_observations(store) == 3
    # pass credentials manually
    key = tiledb_config["vfs.s3.aws_access_key_id"]
    secret = tiledb_config["vfs.s3.aws_secret_access_key"]
    token = tiledb_config["vfs.s3.aws_session_token"]
    with _open_tiledbsoma(
        ln.UPath(path_str, key=key, secret=secret, token=token), mode="r"
    ) as store:
        assert _soma_store_n_observations(store) == 3