Full Code of xarray-contrib/xarray-tutorial for AI

main 27e9c2c74739 cached

110 files

682.0 KB

202.9k tokens

1 requests

Download .txt

Showing preview only (720K chars total). Download the full file or copy to clipboard to get everything.

Repository: xarray-contrib/xarray-tutorial
Branch: main
Commit: 27e9c2c74739
Files: 110
Total size: 682.0 KB

Directory structure:
gitextract_fn454f6u/

├── .binder/
│   └── environment.yml
├── .devcontainer/
│   ├── Dockerfile
│   ├── devcontainer.json
│   ├── scipy2023/
│   │   ├── devcontainer.json
│   │   ├── jupyter_lab_config.py
│   │   └── tasks.json
│   ├── scipy2024/
│   │   ├── devcontainer.json
│   │   ├── jupyter_lab_config.py
│   │   └── tasks.json
│   └── scipy2025/
│       ├── Dockerfile
│       └── devcontainer.json
├── .gitattributes
├── .github/
│   ├── actions/
│   │   └── setup-pixi/
│   │       └── action.yml
│   ├── dependabot.yml
│   └── workflows/
│       ├── main.yaml
│       ├── nocache.yaml
│       ├── pull_request.yaml
│       ├── qaqc.yaml
│       └── surge_preview.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierrc.toml
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── _config.yml
├── _static/
│   └── style.css
├── _toc.yml
├── advanced/
│   ├── accessors/
│   │   ├── 01_accessor_examples.ipynb
│   │   └── accessors.md
│   ├── apply_ufunc/
│   │   ├── apply_ufunc.md
│   │   ├── automatic-vectorizing-numpy.ipynb
│   │   ├── complex-output-numpy.ipynb
│   │   ├── core-dimensions.ipynb
│   │   ├── dask_apply_ufunc.ipynb
│   │   ├── example-interp.ipynb
│   │   ├── numba-vectorization.ipynb
│   │   └── simple_numpy_apply_ufunc.ipynb
│   ├── backends/
│   │   ├── 1.Backend_without_Lazy_Loading.ipynb
│   │   ├── 2.Backend_with_Lazy_Loading.ipynb
│   │   └── backends.md
│   ├── indexing/
│   │   ├── indexing.md
│   │   └── why-trees.md
│   ├── map_blocks/
│   │   ├── map_blocks.md
│   │   └── simple_map_blocks.ipynb
│   └── parallel-intro.md
├── fundamentals/
│   ├── 01.1_creating_data_structures.ipynb
│   ├── 01.1_io.ipynb
│   ├── 01_data_structures.md
│   ├── 01_datastructures.ipynb
│   ├── 01_datatree_hierarchical_data.ipynb
│   ├── 02.1_indexing_Basic.ipynb
│   ├── 02.2_manipulating_dimensions.ipynb
│   ├── 02.3_aligning_data_objects.ipynb
│   ├── 02_labeled_data.md
│   ├── 03.1_computation_with_xarray.ipynb
│   ├── 03.2_groupby_with_xarray.ipynb
│   ├── 03.3_windowed.ipynb
│   ├── 03.4_weighted.ipynb
│   ├── 03_computation.md
│   ├── 04.0_plotting.md
│   ├── 04.1_basic_plotting.ipynb
│   ├── 04.2_faceting.ipynb
│   ├── 04.3_geographic_plotting.ipynb
│   ├── 05_intro_to_dask.ipynb
│   └── README.md
├── intermediate/
│   ├── BiologyDataset.ipynb
│   ├── computation/
│   │   ├── 01-high-level-computation-patterns.ipynb
│   │   ├── hierarchical_computation.ipynb
│   │   └── index.md
│   ├── data_cleaning/
│   │   ├── 05.1_intro.md
│   │   ├── 05.2_examples.md
│   │   ├── 05.3_ice_velocity.ipynb
│   │   ├── 05.4_contributing.md
│   │   ├── 05.5_scipy_talk.md
│   │   └── 05_data_cleaning.md
│   ├── datastructures-intermediate.ipynb
│   ├── hvplot.ipynb
│   ├── indexing/
│   │   ├── advanced-indexing.ipynb
│   │   ├── boolean-masking-indexing.ipynb
│   │   └── indexing.md
│   ├── intro-to-zarr.ipynb
│   ├── remote_data/
│   │   ├── cmip6-cloud.ipynb
│   │   ├── index.md
│   │   └── remote-data.ipynb
│   ├── storage_formats.ipynb
│   ├── xarray_and_dask.ipynb
│   └── xarray_ecosystem.ipynb
├── intro.md
├── overview/
│   ├── fundamental-path/
│   │   ├── README.md
│   │   └── index.ipynb
│   ├── get-started.md
│   ├── intermediate-path/
│   │   ├── README.md
│   │   └── index.ipynb
│   ├── learning-paths.md
│   └── xarray-in-45-min.ipynb
├── pyproject.toml
├── reference/
│   ├── glossary.md
│   ├── references.bib
│   └── resources.md
└── workshops/
    ├── oceanhackweek2020/
    │   └── README.md
    ├── online-tutorial-series/
    │   ├── 01_xarray_fundamentals.ipynb
    │   ├── 02_indexing.ipynb
    │   ├── 03_computation.ipynb
    │   └── README.md
    ├── scipy2023/
    │   ├── README.md
    │   └── index.ipynb
    ├── scipy2024/
    │   └── index.ipynb
    ├── scipy2025/
    │   └── index.ipynb
    └── thinking-like-xarray/
        └── README.md

================================================
FILE CONTENTS
================================================

================================================
FILE: .binder/environment.yml
================================================
name: default
channels:
  - conda-forge
  - nodefaults
dependencies:
  - jupyter-book >=1.0.4.post1,<2
  - pre-commit >=4.2.0,<5
  - dask-labextension >=7.0.0,<8
  - jupyterlab >=4.4.4,<5
  - jupyter_bokeh >=4.0.5,<5
  - jupyterlab-myst >=2.4.2,<3
  - jupyter-resource-usage >=1.1.1,<2
  - cartopy >=0.24.0,<0.25
  - cf_xarray >=0.10.6,<0.11
  - dask >=2025.5.1,<2026
  - datashader >=0.18.1,<0.19
  - distributed >=2025.5.1,<2026
  - gcsfs >=2025.5.1,<2026
  - geoviews-core >=1.14.0,<2
  - gsw >=3.6.19,<4
  - hvplot >=0.11.3,<0.12
  - h5netcdf >=1.6.3,<2
  - ipykernel >=6.29.5,<7
  - matplotlib-base >=3.10.3,<4
  - netcdf4 >=1.7.2,<2
  - numpy >=2.2.6,<3
  - pint-xarray >=0.5.0,<0.6
  - pydap >=3.5.5,<4
  - python-graphviz >=0.21,<0.22
  - pooch >=1.8.2,<2
  - rioxarray >=0.19.0,<0.20
  - scipy >=1.16.0,<2
  - sphinx-codeautolink >=0.17.4,<0.18
  - sphinxcontrib-mermaid >=1.0.0,<2
  - sphinx-notfound-page >=1.1.0,<2
  - sphinxext-rediraffe >=0.2.7,<0.3
  - s3fs >=2025.5.1,<2026
  - xarray >=2025.7.0,<2026
  - zarr >=3.0.10,<4
  - flox >=0.10.4,<0.11
  - numbagg >=0.9.0,<0.10
  - rich >=14.0.0,<15
  - python >=3.10


================================================
FILE: .devcontainer/Dockerfile
================================================
FROM mcr.microsoft.com/devcontainers/base:noble

ARG PIXI_VERSION=v0.49.0

RUN curl -L -o /usr/local/bin/pixi -fsSL --compressed "https://github.com/prefix-dev/pixi/releases/download/${PIXI_VERSION}/pixi-$(uname -m)-unknown-linux-musl" \
    && chmod +x /usr/local/bin/pixi \
    && pixi info

# set some user and workdir settings to work nicely with vscode
USER vscode
WORKDIR /home/vscode

RUN echo 'eval "$(pixi completion -s bash)"' >> /home/vscode/.bashrc


================================================
FILE: .devcontainer/devcontainer.json
================================================
// https://pixi.sh/latest/integration/editor/vscode/#devcontainer-extension
{
  "name": "xarray-tutorial",
  "build": {
    "dockerfile": "Dockerfile",
    "context": ".."
  },
  "customizations": {
    "vscode": {
      "settings": {},
      "extensions": [
        "ms-toolsai.jupyter",
        "ms-python.python",
        "executablebookproject.myst-highlight"
      ]
    }
  },
  "features": {
    // ensure GitHub Codespace 'Open with JupyterLab' works
    // TODO: figure out why it doesn't work w/ jupyterlab in the pixi environment
    "ghcr.io/devcontainers/features/python:1": {
      "version": "3.12",
      "installTools": false,
      // NOTE: not working, so install with pip in onCreateCommand
      // "toolsToInstall":"jupyterlab_myst,pixi-kernel",
      "installJupyterlab": true
    }
  },
  "mounts": [
    "source=${localWorkspaceFolderBasename}-pixi,target=${containerWorkspaceFolder}/.pixi,type=volume"
  ],
  // These should execute in order below
  "onCreateCommand": {
    "configure_jupyterlab": "pip install jupyterlab_myst pixi-kernel"
  },
  "postCreateCommand": {
    "set_pixi_permissions": "sudo chown vscode .pixi"
  },
  "postStartCommand": {
    "configure_jupyterlab": "pixi install"
  }
}


================================================
FILE: .devcontainer/scipy2023/devcontainer.json
================================================
{
  "image": "quay.io/pangeo/pangeo-notebook:2023.07.05",
  "postCreateCommand": {
    "jupyterlab": "mkdir /home/jovyan/.jupyter && cp ${containerWorkspaceFolder}/.devcontainer/scipy2023/jupyter_lab_config.py /home/jovyan/.jupyter/jupyter_lab_config.py",
    "vscode": "mkdir ${containerWorkspaceFolder}/.vscode && cp ${containerWorkspaceFolder}/.devcontainer/scipy2023/tasks.json ${containerWorkspaceFolder}/.vscode/tasks.json"
  },
  "hostRequirements": {
    "cpus": 2
  },
  "customizations": {
    "codespaces": {
      "openFiles": ["workshops/scipy2023/README.md"]
    },
    "vscode": {
      "extensions": ["ms-toolsai.jupyter", "ms-python.python"]
    }
  }
}


================================================
FILE: .devcontainer/scipy2023/jupyter_lab_config.py
================================================
# JupyterLab settings for the scipy2023 devcontainer; its postCreateCommand
# copies this file to /home/jovyan/.jupyter/jupyter_lab_config.py.
# `get_config` is neither defined nor imported here (hence the noqa) --
# presumably it is provided by Jupyter when it loads this config file.
c = get_config()  # noqa
# Open the scipy2023 workshop index notebook by default.
c.LabApp.default_url = '/lab/tree/workshops/scipy2023/index.ipynb'
# Accept requests from any origin.
# NOTE(review): '*' is permissive; presumably required for the Codespaces proxy -- confirm.
c.ServerApp.allow_origin = '*'


================================================
FILE: .devcontainer/scipy2023/tasks.json
================================================
{
  "version": "2.0.0",
  "tasks": [
    {
      "label": "jupyterlab",
      "type": "shell",
      "command": "/srv/conda/envs/notebook/bin/jupyter lab --no-browser",
      "presentation": {
        "reveal": "always"
      },
      "runOptions": {
        "runOn": "folderOpen"
      }
    }
  ]
}


================================================
FILE: .devcontainer/scipy2024/devcontainer.json
================================================
{
  "image": "quay.io/pangeo/pangeo-notebook:2024.07.08",
  "postCreateCommand": {
    "jupyterlab": "mkdir /home/jovyan/.jupyter && cp ${containerWorkspaceFolder}/.devcontainer/scipy2024/jupyter_lab_config.py /home/jovyan/.jupyter/jupyter_lab_config.py",
    "vscode": "mkdir ${containerWorkspaceFolder}/.vscode && cp ${containerWorkspaceFolder}/.devcontainer/scipy2024/tasks.json ${containerWorkspaceFolder}/.vscode/tasks.json"
  },
  "hostRequirements": {
    "cpus": 2
  },
  "customizations": {
    "codespaces": {
      "openFiles": ["workshops/scipy2024/index.ipynb"]
    },
    "vscode": {
      "extensions": ["ms-toolsai.jupyter", "ms-python.python"]
    }
  }
}


================================================
FILE: .devcontainer/scipy2024/jupyter_lab_config.py
================================================
# JupyterLab settings for the scipy2024 devcontainer; its postCreateCommand
# copies this file to /home/jovyan/.jupyter/jupyter_lab_config.py.
# `get_config` is neither defined nor imported here (hence the noqa) --
# presumably it is provided by Jupyter when it loads this config file.
c = get_config()  # noqa
# Open the scipy2024 workshop index notebook by default.
c.LabApp.default_url = '/lab/tree/workshops/scipy2024/index.ipynb'
# Accept requests from any origin.
# NOTE(review): '*' is permissive; presumably required for the Codespaces proxy -- confirm.
c.ServerApp.allow_origin = '*'


================================================
FILE: .devcontainer/scipy2024/tasks.json
================================================
{
  "version": "2.0.0",
  "tasks": [
    {
      "label": "jupyterlab",
      "type": "shell",
      "command": "/srv/conda/envs/notebook/bin/jupyter lab --no-browser",
      "presentation": {
        "reveal": "always"
      },
      "runOptions": {
        "runOn": "folderOpen"
      }
    }
  ]
}


================================================
FILE: .devcontainer/scipy2025/Dockerfile
================================================
FROM mcr.microsoft.com/devcontainers/base:noble

ARG PIXI_VERSION=v0.49.0

RUN curl -L -o /usr/local/bin/pixi -fsSL --compressed "https://github.com/prefix-dev/pixi/releases/download/${PIXI_VERSION}/pixi-$(uname -m)-unknown-linux-musl" \
    && chmod +x /usr/local/bin/pixi \
    && pixi info

# set some user and workdir settings to work nicely with vscode
USER vscode
WORKDIR /home/vscode

RUN echo 'eval "$(pixi completion -s bash)"' >> /home/vscode/.bashrc


================================================
FILE: .devcontainer/scipy2025/devcontainer.json
================================================
// https://pixi.sh/latest/integration/editor/vscode/#devcontainer-extension
{
  "name": "scipy2025-xarray-tutorial",
  "build": {
    "dockerfile": "Dockerfile",
    "context": "../../"
  },
  "customizations": {
    "vscode": {
      "settings": {},
      "extensions": [
        "ms-toolsai.jupyter",
        "ms-python.python",
        "executablebookproject.myst-highlight"
      ]
    }
  },
  "features": {
    // ensure GitHub Codespace 'Open with JupyterLab' works
    // TODO: figure out why it doesn't work w/ jupyterlab in the pixi environment
    "ghcr.io/devcontainers/features/python:1": {
      "version": "3.12",
      "installTools": false,
      // NOTE: not working, so install with pip in onCreateCommand
      // "toolsToInstall":"jupyterlab_myst,pixi-kernel",
      "installJupyterlab": true
    }
  },
  "mounts": [
    "source=${localWorkspaceFolderBasename}-pixi,target=${containerWorkspaceFolder}/.pixi,type=volume"
  ],
  // These should execute in order below
  "onCreateCommand": {
    "configure_jupyterlab": "pip install jupyterlab_myst pixi-kernel"
  },
  "postCreateCommand": {
    "set_pixi_permissions": "sudo chown vscode .pixi"
  },
  "postStartCommand": {
    "configure_jupyterlab": "pixi install"
  }
}


================================================
FILE: .gitattributes
================================================
# SCM syntax highlighting & preventing 3-way merges
pixi.lock merge=binary linguist-language=YAML linguist-generated=true


================================================
FILE: .github/actions/setup-pixi/action.yml
================================================
name: "Setup Pixi"
description: "Create Python environment for GitHub Action Job"

runs:
  using: "composite"
  steps:
    - uses: prefix-dev/setup-pixi@v0.8.10
      with:
        manifest-path: pyproject.toml
        cache: true
        activate-environment: true


================================================
FILE: .github/dependabot.yml
================================================
# Regularly update the GitHub Actions referenced by workflow steps
version: 2
updates:
  - package-ecosystem: "github-actions"
    # For the github-actions ecosystem, "/" is the value that makes Dependabot
    # scan .github/workflows; any other directory is searched only for YAML
    # files located directly inside it.
    directory: "/"
    schedule:
      interval: "monthly"


================================================
FILE: .github/workflows/main.yaml
================================================
name: Deploy Website to GitHub Pages

on:
  push:
    branches: main
    paths-ignore:
      - ".devcontainer/**"

# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
permissions:
  contents: write
  pages: write
  id-token: write

# Allow one concurrent deployment
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup JupyterBook Cache
        uses: actions/cache@v4
        with:
          path: _build
          # NOTE: change key to "jupyterbook-DATE" to force rebuilding cache
          key: jupyterbook-20250701

      - uses: ./.github/actions/setup-pixi

      - name: Build JupyterBook
        run: |
          jupyter-book build ./ --warningiserror --keep-going

      # Print every report log produced by the build, even if the build failed.
      # `find` copes with zero, one or many logs and also descends into
      # sub-folders (a bare glob handed to `test -a` errors out as soon as
      # it expands to more than one file, so nothing would be printed).
      - name: Dump Build Logs
        if: always()
        run: |
          if [ -d _build/html/reports ]; then find _build/html/reports -type f -name '*log' -exec cat {} + ; fi

      - name: Save Build Folder
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: build
          path: _build/

      - name: Upload Pages Artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: _build/html

  # Publish Website to GitHub Pages if built successfully
  deploy:
    needs: build
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}

    steps:
      - name: Setup Pages
        uses: actions/configure-pages@v5

      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4


================================================
FILE: .github/workflows/nocache.yaml
================================================
name: Rebuild Entire Jupyter Book on all Platforms

on:
  workflow_dispatch:

# Allow one concurrent deployment
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build:
    name: Build on ${{ matrix.runs-on }}
    runs-on: ${{ matrix.runs-on }}
    strategy:
      fail-fast: false
      matrix:
        runs-on: [ubuntu-latest, macos-latest, windows-latest]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - uses: ./.github/actions/setup-pixi

      # https://github.com/xarray-contrib/xarray-tutorial/issues/311
      - name: Configure graphviz
        if: matrix.runs-on == 'macos-latest'
        run: |
          dot -c

      - name: Build JupyterBook
        id: jb-build
        continue-on-error: true
        run: |
          jupyter-book build ./ --warningiserror --keep-going

      - name: Dump Build Logs
        if: steps.jb-build.outcome == 'failure'
        run: |
          cat _build/html/reports/**/*.log


================================================
FILE: .github/workflows/pull_request.yaml
================================================
name: Pull Request Build

on:
  pull_request:
    types: [opened, synchronize, reopened, closed]
    paths-ignore:
      - ".devcontainer/**"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  preview:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        if: github.event.action != 'closed'
        uses: actions/checkout@v4

      - name: Setup JupyterBook Cache
        if: github.event.action != 'closed'
        uses: actions/cache@v4
        with:
          path: _build
          # NOTE: change key to "jupyterbook-DATE" to force rebuilding cache
          key: jupyterbook-20250701

      # Skip on 'closed' events like every other step: the checkout is skipped
      # then, so this local action would not exist and the job would fail,
      # which in turn prevents the preview workflow (it requires a successful
      # run of this workflow) from tearing down the deployment.
      - uses: ./.github/actions/setup-pixi
        if: github.event.action != 'closed'

      - name: Build JupyterBook
        if: github.event.action != 'closed'
        run: |
          jupyter-book build ./ --warningiserror --keep-going

      # Print every report log produced by the build. `always()` keeps this
      # step running after a failed build, which is when the logs matter most.
      # `find` copes with zero, one or many logs and also descends into
      # sub-folders (a bare glob handed to `test -a` errors out as soon as
      # it expands to more than one file, so nothing would be printed).
      - name: Dump Build Logs
        if: always() && github.event.action != 'closed'
        run: |
          if [ -d _build/html/reports ]; then find _build/html/reports -type f -name '*log' -exec cat {} + ; fi

      - name: Upload artifact
        if: github.event.action != 'closed'
        uses: actions/upload-artifact@v4
        with:
          name: html
          path: _build/html


================================================
FILE: .github/workflows/qaqc.yaml
================================================
# Spellcheck and external link check, run on pull requests to main or manually
name: QualityControl

on:
  workflow_dispatch:
  pull_request:
    branches:
      - main
    paths-ignore:
      - ".devcontainer/**"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  quality-control:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - uses: ./.github/actions/setup-pixi

      # NOTE: this isn't a comprehensive spellcheck, just common typos
      - name: Spellcheck
        if: always()
        uses: codespell-project/actions-codespell@v2
        with:
          check_filenames: true
          check_hidden: true
          # Skip this workflow file itself; the entry must match its real
          # name, which ends in .yaml (not .yml)
          skip: ".git,*.js,qaqc.yaml"
          ignore_words_list: hist,nd

      # borrowed from https://github.com/ProjectPythia/pythia-foundations/blob/main/.github/workflows/link-checker.yaml
      - name: Disable Notebook Execution Before Linkcheck
        if: always()
        shell: python
        run: |
          import yaml
          with open('./_config.yml') as f:
            data = yaml.safe_load(f)
            data['execute']['execute_notebooks'] = 'off'
          with open('./_config.yml', 'w') as f:
            yaml.dump(data, f)

      # Checking links is flaky, so continue-on-error: true
      - name: Check External Links
        timeout-minutes: 5
        continue-on-error: true
        if: always()
        run: |
          jupyter-book build ./ --builder linkcheck


================================================
FILE: .github/workflows/surge_preview.yml
================================================
name: Pull Request Preview

on:
  workflow_run:
    workflows: ["Pull Request Build"]
    types:
      - completed

permissions:
  pull-requests: write # allow surge-preview to create/update PR comments

concurrency:
  group: ${{ github.workflow }}-${{ github.event.workflow_run.id }}
  cancel-in-progress: true

jobs:
  # NOTE: match job name in pull_request.yaml
  preview:
    runs-on: ubuntu-latest
    if: ${{ github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'success' }}

    steps:
      # Ensure folder exists for PR 'closed' case
      - run: mkdir html

      # Download built HTML from PR Build workflow
      - uses: actions/download-artifact@v4
        continue-on-error: true
        with:
          github-token: ${{ github.token }}
          run-id: ${{ github.event.workflow_run.id }}

      - name: Manage Surge.sh Deployment
        id: preview_step
        uses: afc163/surge-preview@v1
        with:
          surge_token: ${{ secrets.SURGE_TOKEN }}
          github_token: ${{ secrets.GITHUB_TOKEN }}
          build: echo 'Uploading html/ folder contents to Surge.sh...'
          dist: html # NOTE: match upload_artifact name in pull_request.yaml
          failOnError: true
          teardown: true


================================================
FILE: .gitignore
================================================
# project/repo specific
conf.py
advanced/backends/*.bin
scipy-tutorial/dask-report-large-chunk.html
mydask.png
dask-report.html
_build/
*.zarr
*.nc
*.tiff
*.tif
dask-worker-space/
.jupyter_cache

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# misc
.DS_Store
.vscode/

# pixi environments
.pixi
*.egg-info


================================================
FILE: .pre-commit-config.yaml
================================================
ci:
  autoupdate_schedule: monthly

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v6.0.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-docstring-first
      - id: check-json
        exclude: ^.devcontainer/
      - id: check-yaml
      - id: double-quote-string-fixer

  - repo: https://github.com/codespell-project/codespell
    rev: "v2.4.1"
    hooks:
      - id: codespell

  - repo: https://github.com/psf/black-pre-commit-mirror
    rev: 26.1.0
    hooks:
      - id: black
      - id: black-jupyter

  - repo: https://github.com/keewis/blackdoc
    rev: v0.4.6
    hooks:
      - id: blackdoc

  - repo: https://github.com/PyCQA/flake8
    rev: 7.3.0
    hooks:
      - id: flake8

  - repo: https://github.com/PyCQA/isort
    rev: 8.0.1
    hooks:
      - id: isort

  - repo: https://github.com/pre-commit/mirrors-prettier
    rev: v4.0.0-alpha.8
    hooks:
      - id: prettier

  - repo: https://github.com/kynan/nbstripout
    rev: 0.9.1
    hooks:
      - id: nbstripout
        args: [--extra-keys=metadata.kernelspec metadata.language_info.version]


================================================
FILE: .prettierrc.toml
================================================
tabWidth = 2
semi = false
singleQuote = true


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing Guide

This tutorial repository is a great opportunity to start contributing to Xarray.

- Report bugs, request features or submit feedback as a [GitHub Issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/about-issues). First check the existing [issues](https://github.com/xarray-contrib/xarray-tutorial/issues)!

- Make fixes, add content or improvements using [GitHub Pull Requests](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests). The sections below go over this process in more detail.

```{seealso}
The Project Pythia Foundations material on [GitHub](https://foundations.projectpythia.org/foundations/getting-started-github.html) and GitHub workflows is a great place to start if you are new to this.
```

## Content Guidelines

Please note that examples submitted to this repository should follow these
guidelines:

1. Run top-to-bottom without intervention from the user
1. Not require external data sources that may disappear over time (external data sources that are highly unlikely to disappear are fine). Small datasets for tutorial purposes can be added [here](https://github.com/pydata/xarray-data/) if necessary.
1. Not be resource intensive, and should run within 2GB of memory
1. Be clear and contain enough prose to explain the topic at hand
1. Be concise and limited to one or two topics, such that a reader can get through the example within a few minutes of reading
1. Be of general relevance to Xarray users, and so not too specific on a particular problem or use case.

## Contribution process

### Fork this repository

We recommend first forking this repository and creating a local copy:

```
git clone https://github.com/YOURACCOUNT/xarray-tutorial.git
cd xarray-tutorial
```

### Create a Python environment

You'll need either [`pixi`](https://pixi.sh), or `conda`/`mamba` (which can be installed from https://github.com/conda-forge/miniforge).

We also use [pre-commit hooks](https://pre-commit.com) to run styling and other checks before committing code.

#### Using pixi (recommended)

```
pixi install
pixi shell  # activate environment, "exit" to deactivate
```

#### Using conda

```
mamba env create -f .binder/environment.yml -n xarray-tutorial
conda activate xarray-tutorial  # conda deactivate
pre-commit install
```

### Add content

Develop your new content on a branch. See [JupyterBook Docs](https://jupyterbook.org/en/stable/intro.html) for guides on adding `.md`, `.ipynb` and other content.

```
git checkout -b newcontent
git add .
git commit -m "added pages x,y and improved z"
```

### Preview your changes

Running jupyterbook will execute notebooks and render HTML pages for the website. Be sure to fix any execution errors and preview the website in your web browser to make sure everything looks good!

```
jupyter-book build ./ --warningiserror --keep-going
# Or "pixi run build"
```

### Open a pull request

```
git push
```

Follow the link printed in the terminal to open a pull request!

## Instructions for environment management

[`pixi`](https://pixi.sh) can be used to create and update a multi-platform lockfile, so a reproducible set of package versions is installed across different operating systems.

Dependencies (with optional pins) are specified in the `pyproject.toml` file, and specific locked versions for all platforms are kept in `pixi.lock`.

Install environment from the lockfile

```
pixi install
pixi shell # activate environment, "exit" to deactivate
```

Upgrade all packages to latest versions:

```
pixi upgrade
```

## Render conda/mamba environment files

```
pixi project export conda-environment -p linux-64 .binder/environment.yml
pixi project export conda-explicit-spec -p linux-64 /tmp
```


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        https://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       https://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# Xarray Tutorial

[![Deploy Website to GitHub Pages](https://github.com/xarray-contrib/xarray-tutorial/actions/workflows/main.yaml/badge.svg)](https://github.com/xarray-contrib/xarray-tutorial/actions/workflows/main.yaml)
[![Jupyter Book Badge](https://jupyterbook.org/badge.svg)](https://tutorial.xarray.dev)
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/xarray-contrib/xarray-tutorial/HEAD?labpath=workshops/scipy2025/index.ipynb)

This is the repository for a Jupyter Book website with tutorial material for [Xarray](https://github.com/pydata/xarray), _an open source project and Python package that makes working with labelled multi-dimensional arrays simple, efficient, and fun!_

The website is hosted at https://tutorial.xarray.dev

Tutorials are written as interactive Jupyter Notebooks with executable code examples that you can easily run and modify:

#### On the Cloud

All notebooks can be run via the Mybinder.org 'Launch Binder' badge at the top of this page. This will load a pre-configured JupyterLab interface with all tutorial notebooks for you to run. _You have minimal computing resources and any changes you make will not be saved._

#### GitHub Codespaces

This tutorial is available to run within [GitHub Codespaces](https://github.com/features/codespaces) - "a development environment that's hosted in the cloud"

[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/xarray-contrib/xarray-tutorial/tree/main)

☝️ Click the button above to open the options window and launch a GitHub codespace.

GitHub currently gives every user [120 vCPU hours per month for free](https://docs.github.com/en/billing/managing-billing-for-github-codespaces/about-billing-for-github-codespaces#monthly-included-storage-and-core-hours-for-personal-accounts), beyond that you must pay. **So be sure to explicitly stop or shut down your codespace when you are done by going to this page (https://github.com/codespaces).**

You can also use the GitHub CLI to launch a codespace

```
# This will output a URL to use VSCode in the browser
gh codespace create --repo xarray-contrib/xarray-tutorial
# Optionally launch JupyterLab instead of vscode (after codespace has been created)
gh codespace jupyter
```

#### Locally

You can also run these notebooks on your own computer! We recommend using [`pixi`](https://pixi.sh/latest/#installation) to ensure a fully reproducible Python environment:

```bash
git clone https://github.com/xarray-contrib/xarray-tutorial.git
cd xarray-tutorial
pixi run tutorial
```

### Building the Documentation Locally

To build and serve the tutorial website locally with live reload:

```bash
pixi run watch
```

This watches for changes, rebuilds, and serves at http://localhost:8000.

To build without serving:

```bash
pixi run build
```

## Contributing

Contributions are welcome and greatly appreciated! See our [CONTRIBUTING.md](./CONTRIBUTING.md) document.

Thanks to our contributors so far!

[![Contributors](https://contrib.rocks/image?repo=xarray-contrib/xarray-tutorial)](https://github.com/xarray-contrib/xarray-tutorial/graphs/contributors)

## Acknowledgements

This website is the result of many contributions from the Xarray community! We're very grateful for everyone's volunteered effort as well as [sponsored development](https://xarray.dev/#sponsors). Development of the SciPy 2022 and SciPy 2023 tutorial material specifically was supported by NASA's Open Source Tools, Frameworks, and Libraries Program (award 80NSSC22K0345).


================================================
FILE: _config.yml
================================================
# Learn more at https://jupyterbook.org/customize/config.html
title: ""
author: The Xarray Community
copyright: "2025"
logo: images/logo.png
only_build_toc_files: true
exclude_patterns: [.github, .pixi]

# See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository
html:
  # NOTE: this announcement shows up on all pages
  #announcement: '<a href="https://forms.gle/KEq7WviCdz9xTaJX6">The Xarray 2024 User Survey is live. Please take ~5 minutes to fill it out and help us improve Xarray.</a>'
  #announcement: 'ℹ️ SciPy 2025 Tutorial Attendees. <a href="https://tutorial.xarray.dev/workshops/scipy2025/index.html">Click here! </a>.'
  home_page_in_navbar: false
  use_edit_page_button: true
  use_issues_button: true
  use_repository_button: true
  extra_footer: '<p>Xarray is a fiscally sponsored project of <a href="https://numfocus.org">NumFOCUS</a>, a nonprofit dedicated to supporting the open-source scientific computing community.<br> Theme by the <a href="https://ebp.jupyterbook.org">Executable Book Project</a>.</p> Content licensed under the terms of the <a href="https://www.apache.org/licenses/LICENSE-2.0.html">Apache 2.0 License</a>.'
  analytics:
    google_analytics_id: G-JRQHYVFQR7
  favicon: "images/favicon.ico"

parse:
  # https://jupyterbook.org/content/content-blocks.html?highlight=myst%20substitution#define-substitutions-for-your-whole-book
  # https://jupyterbook.org/content/content-blocks.html#using-substitutions-in-links
  myst_substitutions:
    xarray_homepage: https://xarray.dev
    xarray_docs: https://docs.xarray.dev
    xarray_repo: https://github.com/pydata/xarray
    xarray_forum: https://github.com/pydata/xarray/discussions
  myst_enable_extensions:
    # Defaults
    - dollarmath
    - linkify
    - substitution
    - colon_fence
    # Extras
    - html_image

# Execute notebooks at build time and cache the outputs; a notebook is only
# re-executed when its cached outputs are missing or out of date.
# See https://jupyterbook.org/content/execute.html
execute:
  execute_notebooks: "cache"
  allow_errors: false
  # Per-cell notebook execution limit (seconds)
  timeout: 300

# Define the name of the latex output file for PDF builds
latex:
  latex_documents:
    targetname: book.tex

# Configure your Binder links, such as the URL of the BinderHub.
launch_buttons:
  notebook_interface: jupyterlab
  binderhub_url: "https://mybinder.org"

# Information about where the book exists on the web
repository:
  url: "https://github.com/xarray-contrib/xarray-tutorial"
  branch: main

# Bibliography
bibtex_bibfiles:
  - reference/references.bib

# Advanced configuration
sphinx:
  extra_extensions:
    # 404 not found page
    - notfound.extension
    # maintain old paths and redirect them (so Google results don't go to 404)
    # https://github.com/wpilibsuite/sphinxext-rediraffe
    - sphinxext.rediraffe
    - sphinx_codeautolink
    - sphinxcontrib.mermaid

  config:
    language: en # accessibility
    # application/vnd.holoviews_load.v0+json, application/vnd.holoviews_exec.v0+json
    suppress_warnings: ["mystnb.unknown_mime_type", "misc.highlighting_failure"]
    codeautolink_concat_default: True
    notfound_context:
      body: "<h1>Whoops! 404 Page Not Found</h1>\n\n<p>Sorry, this page doesn't exist. Many sections of this book have been updated recently.</p><p> Try the search box 🔎 to find what you're looking for!</p>"
    notfound_urls_prefix: /
    rediraffe_redirects:
      scipy-tutorial/00_overview.ipynb: overview/get-started.md
      workshops/scipy2022/README.md: overview/fundamental-path/README.md
      fundamentals/02.1_working_with_labeled_data.ipynb: fundamentals/02.1_indexing_Basic.ipynb

    bibtex_reference_style: author_year # or label, super, \supercite

    intersphinx_mapping:
      xarray:
        - https://docs.xarray.dev/en/stable
        - null
      numpy:
        - https://numpy.org/doc/stable
        - null
      pandas:
        - https://pandas.pydata.org/docs
        - null
      scipy:
        - https://docs.scipy.org/doc/scipy
        - null
      python:
        - https://docs.python.org/3
        - null
      dask:
        - https://docs.dask.org/en/stable
        - null
      matplotlib:
        - https://matplotlib.org/stable
        - null


================================================
FILE: _static/style.css
================================================
.bd-header-announcement {
  background-color: var(--pst-color-accent);
}

/* workaround Pydata Sphinx theme using light colors for widget cell outputs in dark-mode */
/* works for many widgets but not for Xarray html reprs */
/* https://github.com/pydata/pydata-sphinx-theme/issues/2189 */
html[data-theme="dark"] div.cell_output .text_html:has(div.xr-wrap) {
  background-color: var(--pst-color-on-background) !important;
  color: var(--pst-color-text-base) !important;
}


================================================
FILE: _toc.yml
================================================
# Learn more at https://jupyterbook.org/customize/toc.html
root: intro
format: jb-book
parts:
  - caption: Overview
    chapters:
      - file: overview/get-started.md
      - file: overview/xarray-in-45-min
      - file: overview/learning-paths.md
        sections:
          - file: overview/fundamental-path/README.md
          - file: overview/intermediate-path/README.md

  - caption: Fundamentals
    chapters:
      - file: fundamentals/01_data_structures.md
        sections:
          - file: fundamentals/01_datastructures
          - file: fundamentals/01.1_creating_data_structures
          - file: fundamentals/01.1_io
          - file: fundamentals/01_datatree_hierarchical_data.ipynb
      - file: fundamentals/02_labeled_data.md
        sections:
          - file: fundamentals/02.1_indexing_Basic.ipynb
          - file: fundamentals/02.2_manipulating_dimensions
      - file: fundamentals/03_computation.md
        sections:
          - file: fundamentals/03.1_computation_with_xarray
          - file: fundamentals/02.3_aligning_data_objects
          - file: fundamentals/03.2_groupby_with_xarray
          - file: fundamentals/03.3_windowed
          - file: fundamentals/03.4_weighted
      - file: fundamentals/04.0_plotting.md
        sections:
          - file: fundamentals/04.1_basic_plotting
          - file: fundamentals/04.2_faceting
          - file: fundamentals/04.3_geographic_plotting

  - caption: Intermediate
    chapters:
      - file: intermediate/computation/index
        sections:
          - file: intermediate/computation/01-high-level-computation-patterns.ipynb
          - file: intermediate/computation/hierarchical_computation.ipynb
      - file: intermediate/indexing/indexing
        sections:
          - file: intermediate/indexing/advanced-indexing.ipynb
          - file: intermediate/indexing/boolean-masking-indexing.ipynb
      - file: intermediate/xarray_and_dask
      - file: intermediate/intro-to-zarr.ipynb
      - file: intermediate/storage_formats.ipynb
      - file: intermediate/xarray_ecosystem
      - file: intermediate/hvplot
      - file: intermediate/datastructures-intermediate.ipynb
      - file: intermediate/BiologyDataset.ipynb
      - file: intermediate/remote_data/index
        sections:
          - file: intermediate/remote_data/cmip6-cloud.ipynb
          - file: intermediate/remote_data/remote-data.ipynb
      - file: intermediate/data_cleaning/05.1_intro.md
        sections:
          - file: intermediate/data_cleaning/05.2_examples.md
          - file: intermediate/data_cleaning/05.3_ice_velocity
          - file: intermediate/data_cleaning/05.4_contributing.md
          - file: intermediate/data_cleaning/05.5_scipy_talk.md

  - caption: Advanced
    chapters:
      - file: advanced/indexing/indexing.md
        sections:
          - file: advanced/indexing/why-trees.md
      - file: advanced/parallel-intro.md
      - file: advanced/apply_ufunc/apply_ufunc.md
        sections:
          - file: advanced/apply_ufunc/simple_numpy_apply_ufunc
          - file: advanced/apply_ufunc/core-dimensions
          - file: advanced/apply_ufunc/complex-output-numpy
          - file: advanced/apply_ufunc/automatic-vectorizing-numpy
          - file: advanced/apply_ufunc/dask_apply_ufunc
          - file: advanced/apply_ufunc/numba-vectorization
          - file: advanced/apply_ufunc/example-interp
      - file: advanced/map_blocks/map_blocks.md
        sections:
          - file: advanced/map_blocks/simple_map_blocks
      - file: advanced/backends/backends.md
        sections:
          - file: advanced/backends/1.Backend_without_Lazy_Loading.ipynb
          - file: advanced/backends/2.Backend_with_Lazy_Loading.ipynb
      - file: advanced/accessors/accessors.md
        sections:
          - file: advanced/accessors/01_accessor_examples.ipynb

  - caption: Workshops
    chapters:
      - file: workshops/scipy2025/index.ipynb
      - file: workshops/scipy2024/index.ipynb
      - file: workshops/scipy2023/README
      - file: workshops/thinking-like-xarray/README
        sections:
          - url: https://tutorial.xarray.dev/intermediate/01-high-level-computation-patterns
            title: High-level computation patterns
      - file: workshops/oceanhackweek2020/README
        sections:
          - url: https://tutorial.xarray.dev/overview/xarray-in-45-min
            title: Xarray in 45 minutes
      - file: workshops/online-tutorial-series/README
        sections:
          - file: workshops/online-tutorial-series/01_xarray_fundamentals
          - file: workshops/online-tutorial-series/02_indexing
          - file: workshops/online-tutorial-series/03_computation

  - caption: Reference
    chapters:
      - file: CONTRIBUTING
      - file: reference/resources
      - file: reference/glossary


================================================
FILE: advanced/accessors/01_accessor_examples.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Creating custom accessors"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Introduction\n",
    "\n",
    "An accessor is a way of attaching a custom function to xarray objects so that it can be called as if it were a method while retaining a clear separation between the \"core\" xarray API and custom API. It enables you to easily *extend* (which is why you'll sometimes see it referred to as an extension) and customize xarray's functionality while limiting naming conflicts and minimizing the chances of your code breaking with xarray upgrades.\n",
    "\n",
    "If you've used [rioxarray](https://corteva.github.io/rioxarray/stable/) (e.g. `da.rio.crs`) or [hvplot](https://hvplot.holoviz.org/) (e.g. `ds.hvplot()`), you may have already used an xarray accessor without knowing it!\n",
    "\n",
    "The [Xarray documentation](https://docs.xarray.dev/en/stable/internals/extending-xarray.html) has some more technical details, and this tutorial provides example custom accessors and their uses."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Why create a custom accessor\n",
    "\n",
    "- You can easily create a custom suite of tools that work on Xarray objects\n",
    "- It keeps your workflows cleaner and simpler\n",
    "- Your project-specific code is easy to share\n",
    "- It's easy to implement: you don't need to integrate any code into Xarray\n",
    "- It makes it easier to perform checks and write code documentation because you only have to create them once!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Easy steps to create your own accessor\n",
    "\n",
    "1. Create your custom class, including the mandatory `__init__` method\n",
    "2. Decorate the class with `xr.register_dataarray_accessor()` or `xr.register_dataset_accessor()`\n",
    "3. Use your custom functions "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example 1: accessing scipy functionality"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For example, imagine you're a statistician who regularly uses a special `skewness` function which acts on dataarrays but is only of interest to people in your specific field.\n",
    "\n",
    "You can create a method which applies this skewness function to an xarray object and then register the method under a custom `stats` accessor like this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xarray as xr\n",
    "from scipy.stats import skew\n",
    "\n",
    "xr.set_options(display_expand_attrs=False, display_expand_coords=False)\n",
    "\n",
    "\n",
    "@xr.register_dataarray_accessor(\"stats\")\n",
    "class StatsAccessor:\n",
    "    \"\"\"Statistics helpers exposed on DataArrays under the `stats` namespace.\"\"\"\n",
    "\n",
    "    def __init__(self, da):\n",
    "        # the DataArray this accessor is attached to\n",
    "        self._da = da\n",
    "\n",
    "    def skewness(self, dim):\n",
    "        \"\"\"Reduce the DataArray along `dim` using `scipy.stats.skew`.\"\"\"\n",
    "        reduced = self._da.reduce(skew, dim=dim)\n",
    "        return reduced"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can conveniently access this functionality via the `stats` accessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = xr.tutorial.load_dataset(\"air_temperature\")\n",
    "ds[\"skewair\"] = ds['air'].stats.skewness(dim=\"time\")\n",
    "ds"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notice how the presence of `.stats` clearly differentiates our new \"accessor method\" from core xarray methods."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example 2: creating your own workflows\n",
    "\n",
    "Perhaps you find yourself running similar code for multiple xarray objects or across related projects. Packing your code into an extension makes it easy to repeat the same operation while reducing the likelihood of human-introduced errors.\n",
    "\n",
    "Here we wrap the reorganization of InSAR ice velocity data illustrated in [this tutorial](https://tutorial.xarray.dev/intermediate/data_cleaning/05.3_ice_velocity.html) into a custom Xarray extension that makes it easy to re-apply each time you begin working with a new InSAR velocity dataset. Please see the linked tutorial for details on the data, applications, and each step in this process."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import os\n",
    "import pandas as pd\n",
    "import xarray as xr\n",
    "\n",
    "\n",
    "@xr.register_dataset_accessor(\"insar_vel\")\n",
    "class InsarReorg:\n",
    "    \"\"\"\n",
    "    An extension for an XArray dataset that will prepare InSAR data for analysis.\n",
    "\n",
    "    Re-organize the data from its native structure to have x and y velocity and error along a time dimension.\n",
    "    \"\"\"\n",
    "\n",
    "    # ----------------------------------------------------------------------\n",
    "    # Constructors\n",
    "\n",
    "    def __init__(self, xrds):\n",
    "        self._xrds = xrds\n",
    "\n",
    "    # ----------------------------------------------------------------------\n",
    "    # Methods\n",
    "\n",
    "    @staticmethod\n",
    "    def _validate(self, req_dim=None, req_vars=None):\n",
    "        '''\n",
    "        Make sure the xarray dataset has the correct dimensions and variables.\n",
    "\n",
    "        Running this function will check that my dataset has all the needed dimensions and variables\n",
    "        for a given function, saving time and headache later if they were missing and the computation fails\n",
    "        partway through.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        req_dim : list of str\n",
    "            List of all required dimension names\n",
    "        req_vars : list of str\n",
    "            List of all required variable  names\n",
    "        '''\n",
    "\n",
    "        if req_dim is not None:\n",
    "            if all([dim not in list(self._xrds.dims) for dim in req_dim]):\n",
    "                raise AttributeError(\"Required dimensions are missing\")\n",
    "        if req_vars is not None:\n",
    "            if all([var not in self._xrds.variables for var in req_vars.keys()]):\n",
    "                raise AttributeError(\"Required variables are missing\")\n",
    "        # print(\"successfully validated your dataset\")\n",
    "\n",
    "    # ----------------------------------------------------------------------\n",
    "    # Functions\n",
    "\n",
    "    def change_vars_to_coords(\n",
    "        self,\n",
    "        req_dim=['ny', 'nx'],\n",
    "        req_vars={'xaxis': ['nx'], 'yaxis': ['ny']},\n",
    "    ):\n",
    "        \"\"\"\n",
    "        Turn the xaxis and y axis variables into coordinates.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        req_dim : list of str\n",
    "            List of all required dimension names.\n",
    "        req_vars : list of str\n",
    "            List of all required variable  names\n",
    "        \"\"\"\n",
    "\n",
    "        self._validate(self, req_dim, req_vars)\n",
    "\n",
    "        self._xrds = self._xrds.swap_dims({'ny': 'yaxis', 'nx': 'xaxis'})\n",
    "        self._xrds = self._xrds.rename({'xaxis': 'x', 'yaxis': 'y'})\n",
    "\n",
    "        return self._xrds\n",
    "\n",
    "    def reorg_dataset(self):\n",
    "        \"\"\"\n",
    "        Reorganize the data by time for each of the desired end variables (here vx, vy, err)\n",
    "\n",
    "        \"\"\"\n",
    "\n",
    "        reorged = []\n",
    "        for reorg_var in ['vx', 'vy', 'err']:\n",
    "            ds = self.reorg_var_time(reorg_var)\n",
    "            reorged.append(ds)\n",
    "\n",
    "        reorged_ds = xr.merge(reorged)\n",
    "\n",
    "        return reorged_ds\n",
    "\n",
    "    def reorg_var_time(self, reorg_var):\n",
    "        \"\"\"\n",
    "        Repeat the process for a given variable.\n",
    "\n",
    "        Figure out which of the original variables are time steps for this variable and turn each one into a dataarray.\n",
    "        Add a time dimension and update the variable name for each dataarray.\n",
    "        Combine the modified data arrays back into a single dataset.\n",
    "        \"\"\"\n",
    "\n",
    "        # create storage list for reorganizing\n",
    "        var_ls = list(self._xrds)\n",
    "        to_reorg = [var for var in var_ls if reorg_var in var]\n",
    "\n",
    "        # list the arrays from the original dataset that correspond to the variable\n",
    "        das_to_reorg = [self._xrds[var] for var in to_reorg]\n",
    "\n",
    "        # add the time dimension\n",
    "        das_to_reorg = [das_to_reorg[var].expand_dims('time') for var in range(len(das_to_reorg))]\n",
    "\n",
    "        # update variable name to remove time\n",
    "        das_to_reorg = [das_to_reorg[var].rename(reorg_var) for var in range(len(das_to_reorg))]\n",
    "\n",
    "        ds = xr.concat(das_to_reorg, dim='time')\n",
    "\n",
    "        return ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = xr.tutorial.open_dataset('ASE_ice_velocity.nc')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = ds.insar_vel.change_vars_to_coords()\n",
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = ds.insar_vel.reorg_dataset()\n",
    "ds"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example 3: creating your own workflows with locally stored corrections\n",
    "\n",
    "Consider someone who frequently converts their elevations to be relative to the geoid (rather than the ellipsoid) using a custom, local conversion (otherwise, we'd recommend using an established conversion library like [pyproj](https://pypi.org/project/pyproj/) to switch between datums).\n",
    "\n",
    "An accessor provides an elegant way to build (once) and apply (as often as needed!) this custom conversion on top of the existing xarray ecosystem without the need to copy-paste the code into the start of each project. By standardizing our approach and adding a few sanity checks within the accessor, we also eliminate the risk of accidentally applying the correction multiple times."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import rasterio\n",
    "import xarray as xr\n",
    "\n",
    "\n",
    "@xr.register_dataset_accessor(\"geoidxr\")\n",
    "class GeoidXR:\n",
    "    \"\"\"\n",
    "    An extension for an XArray dataset that will calculate geoidal elevations from a local source file.\n",
    "    \"\"\"\n",
    "\n",
    "    # ----------------------------------------------------------------------\n",
    "    # Constructors\n",
    "\n",
    "    def __init__(\n",
    "        self,\n",
    "        xrds,\n",
    "    ):\n",
    "        self._xrds = xrds\n",
    "        # Running this function on init will check that my dataset has all the needed dimensions and variables\n",
    "        # as specific to my workflow, saving time and headache later if they were missing and the computation fails\n",
    "        # partway through.\n",
    "        self._validate(\n",
    "            self, req_dim=['x', 'y', 'dtime'], req_vars={'elevation': ['x', 'y', 'dtime']}\n",
    "        )\n",
    "\n",
    "    # ----------------------------------------------------------------------\n",
    "    # Methods\n",
    "\n",
    "    @staticmethod\n",
    "    def _validate(self, req_dim=None, req_vars=None):\n",
    "        '''\n",
    "        Make sure the xarray dataset has the correct dimensions and variables\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        req_dim : list of str\n",
    "            List of all required dimension names\n",
    "        req_vars : list of str\n",
    "            List of all required variable  names\n",
    "        '''\n",
    "\n",
    "        if req_dim is not None:\n",
    "            if all([dim not in list(self._xrds.dims) for dim in req_dim]):\n",
    "                raise AttributeError(\"Required dimensions are missing\")\n",
    "        if req_vars is not None:\n",
    "            if all([var not in self._xrds.variables for var in req_vars.keys()]):\n",
    "                raise AttributeError(\"Required variables are missing\")\n",
    "\n",
    "    # Notice that 'geoid' has been added to the req_vars list\n",
    "    def to_geoid(\n",
    "        self,\n",
    "        req_dim=['dtime', 'x', 'y'],\n",
    "        req_vars={'elevation': ['x', 'y', 'dtime', 'geoid']},\n",
    "        source=None,\n",
    "    ):\n",
    "        \"\"\"\n",
    "        Get geoid layer from your local file, which is provided to the function as \"source\",\n",
    "        and apply the offset to all elevation values.\n",
    "        Adds 'geoid_offset' keyword to \"offsets\" attribute so you know the geoid offset was applied.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        req_dim : list of str\n",
    "            List of all required dimension names.\n",
    "        req_vars : list of str\n",
    "            List of all required variable  names\n",
    "        source : str\n",
    "            Full path to your source file containing geoid offsets\n",
    "        \"\"\"\n",
    "\n",
    "        # check to make sure you haven't already run this function (and are thus applying the offset twice)\n",
    "        try:\n",
    "            values = self._xrds.attrs['offset_names']\n",
    "            assert 'geoid_offset' not in values, \"You've already applied the geoid offset!\"\n",
    "            values = list([values]) + ['geoid_offset']\n",
    "        except KeyError:\n",
    "            values = ['geoid_offset']\n",
    "\n",
    "        self._validate(self, req_dim, req_vars)\n",
    "\n",
    "        # read in your geoid values\n",
    "        # WARNING: this implementation assumes your geoid values are in the same CRS and grid as the data you are applying\n",
    "        # them to. If not, you will need to reproject and/or resample them to match the data to which you are applying them.\n",
    "        # That step is not included here to emphasize the accessor aspect of the workflow.\n",
    "        with rasterio.open(source) as src:\n",
    "            geoid = src['geoid_varname']\n",
    "\n",
    "        # As noted above, this step will fail or produce unreliable results if your data is not properly gridded\n",
    "        self._xrds['elevation'] = self._xrds.elevation - geoid\n",
    "\n",
    "        self._xrds.attrs['offset_names'] = values\n",
    "\n",
    "        return self._xrds"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, each time we want to convert our ellipsoid data to the geoid, we only have to run one line of code, and it will also perform a multitude of checks for us to make sure we're performing exactly the operation we expect. Imagine the possibilities (and decrease in frustration)!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "ds = ds.geoidxr.to_geoid(source='/Path/to/Custom/source/file.nc')\n",
    "ds"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  },
  "vscode": {
   "interpreter": {
    "hash": "eeef546aa85c5aee566c457bd2890cafb9e11a3b514b94bbf230bf44d1caf251"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: advanced/accessors/accessors.md
================================================
```{tableofcontents}

```


================================================
FILE: advanced/apply_ufunc/apply_ufunc.md
================================================
# apply_ufunc

```{tableofcontents}

```


================================================
FILE: advanced/apply_ufunc/automatic-vectorizing-numpy.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {
    "tags": []
   },
   "source": [
    "(vectorize)=\n",
    "# Automatic Vectorization"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1",
   "metadata": {
    "tags": []
   },
   "source": [
    "Previously we looked at [applying functions](gentle-intro) on numpy arrays, and the concept of [core dimensions](core-dimensions).\n",
    "We learned that functions commonly support specifying \"core dimensions\" through the `axis` keyword\n",
    "argument. \n",
    "\n",
    "However many functions exist, that implicitly have core dimensions, but do not provide an `axis` keyword\n",
    "argument. Applying such functions to a nD array usually involves one or multiple loops over the other dimensions\n",
    "--- termed \"loop dimensions\" or \"broadcast dimensions\".\n",
    "\n",
    "\n",
    "A good example is numpy's 1D interpolate function `numpy.interp`:\n",
    "\n",
    "```\n",
    "    Signature: np.interp(x, xp, fp, left=None, right=None, period=None)\n",
    "    Docstring:\n",
    "        One-dimensional linear interpolation.\n",
    "\n",
    "    Returns the one-dimensional piecewise linear interpolant to a function\n",
    "    with given discrete data points (`xp`, `fp`), evaluated at `x`.\n",
    "```\n",
    "\n",
    "This function expects 1D arrays as input, so there is one core dimension and we cannot easily apply \n",
    "it to a nD array since there is no `axis` keyword argument. \n",
    "\n",
    "\n",
    "Our goal here is to \n",
    "1. Understand the difference between core dimensions and loop dimensions\n",
    "1. Understand vectorization\n",
    "1. Learn how to apply such functions without loops using `apply_ufunc` by providing the `vectorize` keyword argument.\n",
    "\n",
    "## Core dimensions and looping\n",
    "\n",
    "Let's say we want to\n",
    "interpolate an array with two dimensions (`space`, `time`) over the `time` dimension, we might \n",
    "1. loop over the `space` dimension, \n",
    "1. subset the array to a 1D array at that `space` location, \n",
    "1. Interpolate the 1D arrays to the new `time` vector, and\n",
    "1. Assign that new interpolated 1D array to the appropriate location of a 2D output array\n",
    "\n",
    "In pseudo-code this might look like\n",
    "\n",
    "```python\n",
    "for index in range(size_of_space_axis):\n",
    "    out[index, :] = np.interp(..., array[index, :], ...)\n",
    "```\n",
    "\n",
    "::::{admonition} Exercise\n",
    ":class: tip\n",
    "Consider the example problem of interpolating a 2D array with dimensions `space` and `time` along the `time` dimension.\n",
    "Which dimension is the core dimension, and which is the \"loop dimension\"?\n",
    "\n",
    ":::{admonition} Solution\n",
    ":class: dropdown\n",
    "\n",
    "`time` is the core dimension, and `space` is the loop dimension.\n",
    ":::\n",
    "::::\n",
    "\n",
    "## Vectorization\n",
    "\n",
    "The pattern of looping over any number of \"loop dimensions\" and applying a function along \"core dimensions\" \n",
    "is so common that numpy provides wrappers that automate these steps: \n",
    "1. [numpy.apply_along_axis](https://numpy.org/doc/stable/reference/generated/numpy.apply_along_axis.html)\n",
    "1. [numpy.apply_over_axes](https://numpy.org/doc/stable/reference/generated/numpy.apply_over_axes.html)\n",
    "1. [numpy.vectorize](https://numpy.org/doc/stable/reference/generated/numpy.vectorize.html)\n",
    "\n",
    "\n",
    "`apply_ufunc` provides an easy interface to `numpy.vectorize` through the keyword argument `vectorize`. Here we see how to use\n",
    "that to automatically apply `np.interp` along a single axis of a nD array\n",
    "\n",
    "## Load data\n",
    "\n",
    "First lets load an example dataset\n",
    "\n",
    "```{tip}\n",
    "We'll reduce the length of error messages using `%xmode minimal` See the [ipython documentation](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-xmode) for details.\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%xmode minimal\n",
    "\n",
    "import xarray as xr\n",
    "import numpy as np\n",
    "\n",
    "xr.set_options(display_expand_data=False)\n",
    "\n",
    "air = (\n",
    "    xr.tutorial.load_dataset(\"air_temperature\")\n",
    "    .air.sortby(\"lat\")  # np.interp needs coordinate in ascending order\n",
    "    .isel(time=slice(4), lon=slice(3))  # choose a small subset for convenience\n",
    ")\n",
    "air"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Review\n",
    "\n",
    "\n",
    "We'll work with the `apply_ufunc` call from the section on [handling dimensions that change size](complex-output-change-size). See the \"Handling Complex Output\" section for how to get here.\n",
    "\n",
    "This version only works with 1D vectors. We will expand that to work with inputs of any number of dimensions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "newlat = np.linspace(15, 75, 100)\n",
    "\n",
    "xr.apply_ufunc(\n",
    "    np.interp,  # first the function\n",
    "    newlat,\n",
    "    air.lat,\n",
    "    air.isel(lon=0, time=0),  # this version only works with 1D vectors\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], [\"lat\"]],\n",
    "    output_core_dims=[[\"lat\"]],\n",
    "    exclude_dims={\"lat\"},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Try nD input\n",
    "\n",
    "Our goal is to interpolate latitude at every longitude and time, such that we go from a dataset with dimensions `(time: 4, lat: 25, lon: 3)` to `(time: 4, lat: 100, lon: 3)`. \n",
    "\n",
    "If we blindly try passing `air` (a 3D DataArray), we get a hard-to-understand error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "newlat = np.linspace(15, 75, 100)\n",
    "\n",
    "xr.apply_ufunc(\n",
    "    np.interp,  # first the function\n",
    "    newlat,\n",
    "    air.lat,\n",
    "    air,\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], [\"lat\"]],\n",
    "    output_core_dims=[[\"lat\"]],\n",
    "    exclude_dims={\"lat\"},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7",
   "metadata": {
    "tags": []
   },
   "source": [
    "We will use a \"wrapper\" function `debug_interp` to examine what gets passed to `numpy.interp`.\n",
    "\n",
    "```{tip}\n",
    "Such wrapper functions are a great way to understand and debug `apply_ufunc` use cases.\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8",
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "def debug_interp(xi, x, data):\n",
    "    print(f\"data: {data.shape} | x: {x.shape} | xi: {xi.shape}\")\n",
    "    return np.interp(xi, x, data)\n",
    "\n",
    "\n",
    "interped = xr.apply_ufunc(\n",
    "    debug_interp,  # first the function\n",
    "    newlat,\n",
    "    air.lat,\n",
    "    air,\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], [\"lat\"]],\n",
    "    output_core_dims=[[\"lat\"]],\n",
    "    exclude_dims={\"lat\"},  # dimensions allowed to change size. Must be set!\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9",
   "metadata": {
    "tags": []
   },
   "source": [
    "That's a hard-to-interpret error from NumPy but our `print` call helpfully printed the shapes of the input data: \n",
    "\n",
    "    data: (4, 3, 25) | x: (25,) | xi: (100,)\n",
    "\n",
    "We see that `apply_ufunc` passes the full 3D array to `interp1d_np` which in turn passes that on to `numpy.interp`. But `numpy.interp` requires a 1D input, and thus the error.\n",
    "\n",
    "Instead of passing the full 3D array we want loop over all combinations of `lon` and `time`; and apply our function to each corresponding vector of data along `lat`."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Vectorization with `np.vectorize`\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "11",
   "metadata": {
    "tags": []
   },
   "source": [
    "`apply_ufunc` makes it easy to loop over the loop dimensions by specifying `vectorize=True`:\n",
    "\n",
    "    vectorize : bool, optional\n",
    "        If True, then assume ``func`` only takes arrays defined over core\n",
    "        dimensions as input and vectorize it automatically with\n",
    "        :py:func:`numpy.vectorize`. This option exists for convenience, but is\n",
    "        almost always slower than supplying a pre-vectorized function.\n",
    "        Using this option requires NumPy version 1.12 or newer.\n",
    "        \n",
    "\n",
    "```{warning}\n",
    "Also see the numpy documentation for [numpy.vectorize](https://numpy.org/doc/stable/reference/generated/numpy.vectorize.html). Most importantly\n",
    "\n",
    "    The vectorize function is provided primarily for convenience, not for performance. \n",
    "    The implementation is essentially a for loop.\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12",
   "metadata": {
    "tags": [],
    "user_expressions": []
   },
   "outputs": [],
   "source": [
    "interped = xr.apply_ufunc(\n",
    "    debug_interp,  # first the function\n",
    "    newlat,\n",
    "    air.lat,\n",
    "    air,\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], [\"lat\"]],\n",
    "    output_core_dims=[[\"lat\"]],\n",
    "    exclude_dims={\"lat\"},  # dimensions allowed to change size. Must be set!\n",
    "    vectorize=True,\n",
    ")\n",
    "interped"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13",
   "metadata": {
    "tags": []
   },
   "source": [
    "Wow that worked!\n",
    "\n",
    "Notice that \n",
    "1. the printed input shapes are all 1D and correspond to one vector of size 25 along the `lat` dimension.\n",
    "2. `debug_interp` was called 4x3 = 12 times which is the total number `lat` vectors since the size along `time` is 4, and the size along `lon` is 3.\n",
    "3. The result `interped` is now an xarray object with coordinate values copied over from `data`. \n",
    "\n",
    "\n",
    "```{note}\n",
    "`lat` is now the *last* dimension in `interped`. This is a \"property\" of core dimensions: they are moved to the end before being sent to `interp1d_np` as noted in the docstring for `input_core_dims`\n",
    "\n",
    "        Core dimensions are automatically moved to the last axes of input\n",
    "        variables before applying ``func``, which facilitates using NumPy style\n",
    "        generalized ufuncs [2]_.\n",
    "```\n",
    "\n",
    "## Conclusion\n",
    "This is why `apply_ufunc` is so convenient; it takes care of a lot of code necessary to apply functions that consume and produce numpy arrays to xarray objects.\n",
    "\n",
    "The `vectorize` keyword argument, when set to True, will use `numpy.vectorize` to apply the function by looping over the \"loop dimensions\" --- dimensions that are not the core dimensions for the applied function."
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: advanced/apply_ufunc/complex-output-numpy.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {
    "tags": []
   },
   "source": [
    "(complex-output)=\n",
    "# Handling complex output\n",
    "\n",
    "We've seen how to use `apply_ufunc` to handle relatively simple functions that transform every element, or reduce along a single dimension.\n",
    "\n",
    "This lesson will show you how to handle cases where the output is more complex in two ways:\n",
    "1. Handle adding a new dimension by specifying `output_core_dims`\n",
    "1. Handling the change in size of an existing dimension by specifying `exclude_dims` in addition to `output_core_dims`\n",
    "\n",
    "\n",
    "## Introduction\n",
    "\n",
    "A good example of a function that returns relatively complex output is numpy's 1D interpolate function `numpy.interp`:\n",
    "\n",
    "```\n",
    "    Signature: np.interp(x, xp, fp, left=None, right=None, period=None)\n",
    "    Docstring:\n",
    "        One-dimensional linear interpolation.\n",
    "\n",
    "    Returns the one-dimensional piecewise linear interpolant to a function\n",
    "    with given discrete data points (`xp`, `fp`), evaluated at `x`.\n",
    "```\n",
    "\n",
    "This function expects a 1D array as input, and returns a 1D array as output. That is, `numpy.interp` has one core dimension.\n",
    "\n",
    "\n",
    "```{tip}\n",
    "We'll reduce the length of error messages using `%xmode minimal` See the [ipython documentation](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-xmode) for details.\n",
    "```\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%xmode minimal\n",
    "\n",
    "import xarray as xr\n",
    "import numpy as np\n",
    "\n",
    "np.set_printoptions(threshold=10, edgeitems=2)\n",
    "xr.set_options(display_expand_data=False)\n",
    "\n",
    "air = (\n",
    "    xr.tutorial.load_dataset(\"air_temperature\")\n",
    "    .air.sortby(\"lat\")  # np.interp needs coordinate in ascending order\n",
    "    .isel(time=-0, lon=0)  # choose a 1D subset\n",
    ")\n",
    "air"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Our goal is to densify from 25 to 100 coordinate values:s\n",
    "newlat = np.linspace(15, 75, 100)\n",
    "np.interp(newlat, air.lat.data, air.data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3",
   "metadata": {
    "tags": []
   },
   "source": [
    "(interp-add-new-dim)=\n",
    "## Adding a new dimension\n",
    "\n",
    "1D interpolation transforms the size of the input along a single dimension.\n",
    "\n",
    "Logically, we can think of this as removing the old dimension and adding a new dimension.\n",
    "\n",
    "We provide this information to `apply_ufunc` using the `output_core_dims` keyword argument\n",
    "\n",
    "```\n",
    "   output_core_dims : List[tuple], optional\n",
    "        List of the same length as the number of output arguments from\n",
    "        ``func``, giving the list of core dimensions on each output that were\n",
    "        not broadcast on the inputs. By default, we assume that ``func``\n",
    "        outputs exactly one array, with axes corresponding to each broadcast\n",
    "        dimension.\n",
    "\n",
    "        Core dimensions are assumed to appear as the last dimensions of each\n",
    "        output in the provided order.\n",
    "```\n",
    "\n",
    "For `interp` we expect one returned output with one new core dimension that we will call `\"lat_interp\"`.\n",
    "\n",
    "Specify this using `output_core_dims=[[\"lat_interp\"]]`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "newlat = np.linspace(15, 75, 100)\n",
    "\n",
    "xr.apply_ufunc(\n",
    "    np.interp,  # function to apply\n",
    "    newlat,  # 1st input to np.interp\n",
    "    air.lat,  # 2nd input to np.interp\n",
    "    air,  # 3rd input to np.interp\n",
    "    input_core_dims=[[\"lat_interp\"], [\"lat\"], [\"lat\"]],  # one entry per function input, 3 in total!\n",
    "    output_core_dims=[[\"lat_interp\"]],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5",
   "metadata": {
    "tags": []
   },
   "source": [
    "::::{admonition} Exercise\n",
    ":class: tip\n",
    "\n",
    "Apply the following function using `apply_ufunc`. It adds a new dimension to the input array, let's call it `newdim`. Specify the new dimension using `output_core_dims`. Do you need any `input_core_dims`?\n",
    "\n",
    "```python\n",
    "def add_new_dim(array):\n",
    "    return np.expand_dims(array, axis=-1)\n",
    "```\n",
    "\n",
    ":::{admonition} Solution\n",
    ":class: dropdown\n",
    "\n",
    "```python\n",
    "def add_new_dim(array):\n",
    "    return np.expand_dims(array, axis=-1)\n",
    "\n",
    "\n",
    "xr.apply_ufunc(\n",
    "    add_new_dim,\n",
    "    air,\n",
    "    output_core_dims=[[\"newdim\"]],\n",
    ")\n",
    "```\n",
    ":::\n",
    "::::"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6",
   "metadata": {
    "tags": [],
    "user_expressions": []
   },
   "source": [
    "(complex-output-change-size)=\n",
    "## Dimensions that change size\n",
    "\n",
    "Imagine that you want the output to have the same dimension name `\"lat\"` i.e. applying`np.interp` changes the size of the `\"lat\"` dimension.\n",
    "\n",
    "We get an a error if we specify `\"lat\"` in `output_core_dims`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "newlat = np.linspace(15, 75, 100)\n",
    "\n",
    "xr.apply_ufunc(\n",
    "    np.interp,  # first the function\n",
    "    newlat,\n",
    "    air.lat,\n",
    "    air,\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], [\"lat\"]],\n",
    "    output_core_dims=[[\"lat\"]],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8",
   "metadata": {
    "tags": [],
    "user_expressions": []
   },
   "source": [
    "As the error message points out,\n",
    "```\n",
    "Only dimensions specified in ``exclude_dims`` with xarray.apply_ufunc are allowed to change size.\n",
    "```\n",
    "\n",
    "Looking at the docstring we need to specify `exclude_dims` as a \"set\":\n",
    "\n",
    "```\n",
    "exclude_dims : set, optional\n",
    "        Core dimensions on the inputs to exclude from alignment and\n",
    "        broadcasting entirely. Any input coordinates along these dimensions\n",
    "        will be dropped. Each excluded dimension must also appear in\n",
    "        ``input_core_dims`` for at least one argument. Only dimensions listed\n",
    "        here are allowed to change size between input and output objects.\n",
    "```\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "newlat = np.linspace(15, 75, 100)\n",
    "\n",
    "xr.apply_ufunc(\n",
    "    np.interp,  # first the function\n",
    "    newlat,\n",
    "    air.lat,\n",
    "    air,\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], [\"lat\"]],\n",
    "    output_core_dims=[[\"lat\"]],\n",
    "    exclude_dims={\"lat\"},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Returning multiple variables\n",
    "\n",
    "Another common, but more complex, case is to handle multiple outputs returned by the function.\n",
    "\n",
    "As an example we will write a function that returns the minimum and maximum value along the last axis of the array.\n",
    "\n",
    "We will work with a 2D array, and apply the function `minmax` along the `\"lat\"` dimension:\n",
    "```python\n",
    "def minmax(array):\n",
    "    return array.min(axis=-1), array.max(axis=-1)\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def minmax(array):\n",
    "    return array.min(axis=-1), array.max(axis=-1)\n",
    "\n",
    "\n",
    "air2d = xr.tutorial.load_dataset(\"air_temperature\").air.isel(time=0)\n",
    "air2d"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12",
   "metadata": {
    "tags": [],
    "user_expressions": []
   },
   "source": [
    "By default, Xarray assumes one array is returned by the applied function.\n",
    "\n",
    "Here we have two returned arrays, and the input core dimension `\"lat\"` is removed (or reduced over).\n",
    "\n",
    "So we provide `output_core_dims=[[], []]` i.e. an empty list of core dimensions for each of the two returned arrays."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "minda, maxda = xr.apply_ufunc(\n",
    "    minmax,\n",
    "    air2d,\n",
    "    input_core_dims=[[\"lat\"]],\n",
    "    output_core_dims=[[], []],\n",
    ")\n",
    "minda"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14",
   "metadata": {
    "tags": []
   },
   "source": [
    "::::{admonition} Exercise\n",
    ":class: tip\n",
    "\n",
    "We presented the concept of \"core dimensions\" as the \"smallest unit of data the function could handle.\" Do you understand how the above use of `apply_ufunc` generalizes to an array with more than one dimension? \n",
    "\n",
    "Try applying the minmax function to a 3d air temperature dataset \n",
    "```python\n",
    "air3d = xr.tutorial.load_dataset(\"air_temperature\").air\n",
    "``` \n",
    "Your goal is to have a minimum and maximum value of temperature across all latitudes for a given time and longitude.\n",
    "\n",
    ":::{admonition} Solution\n",
    ":class: dropdown\n",
    "\n",
    "We want to use `minmax` to compute the minimum and maximum along the \"lat\" dimension always, regardless of how many dimensions are on the input. So we specify `input_core_dims=[[\"lat\"]]`. The output does not contain the \"lat\" dimension, but we expect two returned variables. So we pass an empty list `[]` for each returned array, so `output_core_dims=[[], []]` just as before.\n",
    "\n",
    "\n",
    "```python\n",
    "minda, maxda = xr.apply_ufunc(\n",
    "    minmax,\n",
    "    air3d,\n",
    "    input_core_dims=[[\"lat\"]],\n",
    "    output_core_dims=[[],[]],\n",
    ")\n",
    ":::\n",
    "::::"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: advanced/apply_ufunc/core-dimensions.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Core dimensions\n",
    "\n",
    "[Previously](gentle-intro) we learned to use `apply_ufunc` on simple functions that acted element by element. \n",
    "\n",
    "Here we move on to slightly more complex functions like `np.mean` that can act along a subset of an input array's dimensions.\n",
    "\n",
    "Such operations involve the concept of \"core dimensions\". \n",
    "\n",
    "Our learning goals are:\n",
    "- Learn how to identify \"core dimensions\" for the function you're applying.\n",
    "- Learn that \"core dimensions\" are automatically moved or transposed to the end of the array.\n",
    "\n",
    "\n",
    "## Introduction\n",
    "\n",
    "For using more complex operations that consider some array values collectively,\n",
    "it’s important to understand the idea of **core dimensions**. \n",
    "Usually, they correspond to the fundamental dimensions over\n",
    "which an operation is defined, e.g., the summed axis in `np.sum`. One way to think about core dimensions \n",
    "is to consider the smallest dimensionality of data that the function acts on.\n",
    "\n",
    "```{important}\n",
    "\n",
    "A good clue that core dimensions are needed is the presence of an `axis` argument on the\n",
    "corresponding NumPy function.\n",
    "\n",
    "```\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {},
   "outputs": [],
   "source": [
    "%xmode minimal\n",
    "\n",
    "import numpy as np\n",
    "import xarray as xr\n",
    "\n",
    "# limit the amount of information printed to screen\n",
    "xr.set_options(display_expand_data=False)\n",
    "np.set_printoptions(threshold=10, edgeitems=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3",
   "metadata": {},
   "source": [
    "Let's load a dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = xr.tutorial.load_dataset(\"air_temperature\")\n",
    "ds"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5",
   "metadata": {},
   "source": [
    "## Reducing with `np.mean`\n",
    "\n",
    "Let's write a function that computes the mean along `time` for a provided xarray object. \n",
    "\n",
    "This function requires one core dimension `time`. For `ds.air` note that `time` is the 0th axis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "ds.air.dims"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7",
   "metadata": {
    "tags": []
   },
   "source": [
    "`get_axis_num` is a useful method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "ds.air.get_axis_num(\"time\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(ds.air, axis=ds.air.get_axis_num(\"time\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(ds.air.data, axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "11",
   "metadata": {},
   "source": [
    "Let's try to use `apply_ufunc` to replicate `np.mean(ds.air.data, axis=0)`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12",
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "xr.apply_ufunc(\n",
    "    # function to apply\n",
    "    np.mean,\n",
    "    # object with data to pass to function\n",
    "    ds,\n",
    "    # keyword arguments to pass to np.mean\n",
    "    kwargs={\"axis\": 0},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13",
   "metadata": {
    "tags": []
   },
   "source": [
    "The error here\n",
    "```\n",
    "applied function returned data with unexpected number of dimensions. \n",
    "Received 2 dimension(s) but expected 3 dimensions with names: ('time', 'lat', 'lon')\n",
    "```\n",
    "\n",
    "means that while `np.mean` did indeed reduce one dimension, we did not tell `apply_ufunc` that this would happen. That is, we need to specify the core dimensions on the input.\n",
    "\n",
    "Do that by passing a list of dimension names for each input object. For this function we have one input, `ds`, with a single core dimension `\"time\"`, so we pass `input_core_dims=[[\"time\"]]`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14",
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "xr.apply_ufunc(\n",
    "    np.mean,\n",
    "    ds,\n",
    "    # specify core dimensions as a list of lists\n",
    "    # here 'time' is the core dimension on `ds`\n",
    "    input_core_dims=[\n",
    "        [\"time\"],  # core dimension for ds\n",
    "    ],\n",
    "    kwargs={\"axis\": 0},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15",
   "metadata": {
    "tags": []
   },
   "source": [
    "This next error is a little confusing.\n",
    "\n",
    "```\n",
    "size of dimension 'lat' on inputs was unexpectedly changed by applied function from 25 to 53. \n",
    "Only dimensions specified in ``exclude_dims`` with xarray.apply_ufunc are allowed to change size.\n",
    "```\n",
    "\n",
    "\n",
    "A good trick here is to pass a little wrapper function to `apply_ufunc` instead and inspect the shapes of data received by the wrapper.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16",
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "def wrapper(array, **kwargs):\n",
    "    print(f\"received {type(array)} shape: {array.shape}, kwargs: {kwargs}\")\n",
    "    result = np.mean(array, **kwargs)\n",
    "    print(f\"result.shape: {result.shape}\")\n",
    "    return result\n",
    "\n",
    "\n",
    "xr.apply_ufunc(\n",
    "    wrapper,\n",
    "    ds,\n",
    "    # specify core dimensions as a list of lists\n",
    "    # here 'time' is the core dimension on `ds`\n",
    "    input_core_dims=[[\"time\"]],\n",
    "    kwargs={\"axis\": 0},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17",
   "metadata": {},
   "source": [
    "Now we see the issue:\n",
    "\n",
    "    received <class 'numpy.ndarray'> shape: (25, 53, 2920), kwargs: {'axis': 0}\n",
    "    result.shape: (53, 2920)\n",
    "    \n",
    "The `time` dimension is of size `2920` and is now the last axis of the array, but it was initially the first axis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds.air.get_axis_num(\"time\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19",
   "metadata": {
    "tags": []
   },
   "source": [
    "```{important}\n",
    "This illustrates an important concept. Arrays are transposed so that core dimensions are at the end.\n",
    "```\n",
    "\n",
    "With `apply_ufunc`, core dimensions are recognized by name, and then moved to\n",
    "the last dimension of any input arguments before applying the given function.\n",
    "This means that for functions that accept an `axis` argument, you usually need\n",
    "to set `axis=-1`.\n",
    "\n",
    "Such behaviour means that our functions (like `wrapper` or `np.mean`) do not need to know the exact order of dimensions. They can rely on the core dimensions being at the end allowing us to write very general code! \n",
    "\n",
    "We can fix our `apply_ufunc` call by specifying `axis=-1` instead."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20",
   "metadata": {},
   "outputs": [],
   "source": [
    "def wrapper(array, **kwargs):\n",
    "    print(f\"received {type(array)} shape: {array.shape}, kwargs: {kwargs}\")\n",
    "    result = np.mean(array, **kwargs)\n",
    "    print(f\"result.shape: {result.shape}\")\n",
    "    return result\n",
    "\n",
    "\n",
    "xr.apply_ufunc(\n",
    "    wrapper,\n",
    "    ds,\n",
    "    input_core_dims=[[\"time\"]],\n",
    "    kwargs={\"axis\": -1},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21",
   "metadata": {
    "tags": []
   },
   "source": [
    "::::{admonition} Exercise\n",
    ":class: tip\n",
    "\n",
    "Use `apply_ufunc` to apply `scipy.integrate.trapezoid` along the `time` axis.\n",
    "\n",
    ":::{admonition} Solution\n",
    ":class: dropdown\n",
    "\n",
    "```python\n",
    "import scipy as sp\n",
    "import scipy.integrate\n",
    "\n",
    "xr.apply_ufunc(scipy.integrate.trapezoid, ds, input_core_dims=[[\"time\"]], kwargs={\"axis\": -1})\n",
    "```\n",
    ":::\n",
    "::::"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: advanced/apply_ufunc/dask_apply_ufunc.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Handling dask arrays\n",
    "\n",
    "We have previously worked through applying functions to NumPy arrays contained in Xarray objects.\n",
    "`apply_ufunc` also lets you easily perform many of the steps involved in applying \n",
    "functions that expect and return Dask arrays.\n",
    "\n",
    "Learning goals:\n",
    "- Learn that `apply_ufunc` can automate aspects of applying computation functions on dask arrays\n",
    "- It is possible to automatically parallelize certain operations by providing `dask=\"parallelized\"`\n",
    "- In some cases, extra information needs to be provided such as sizes of any new dimensions added, or data types for output variables.\n",
    "- Learn that all the concepts from the numpy lessons carry over: like [automatic vectorization](vectorize) and specifying input and\n",
    "  output core dimensions.\n",
    "\n",
    "\n",
    "```{tip}\n",
    "We'll reduce the length of error messages using `%xmode minimal`. See the [ipython documentation](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-xmode) for details.\n",
    "```\n",
    "\n",
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {},
   "outputs": [],
   "source": [
    "%xmode minimal\n",
    "\n",
    "import dask\n",
    "import numpy as np\n",
    "import xarray as xr\n",
    "\n",
    "# limit the amount of information printed to screen\n",
    "xr.set_options(display_expand_data=False)\n",
    "np.set_printoptions(threshold=10, edgeitems=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2",
   "metadata": {},
   "source": [
    "First let's set up a `LocalCluster` using [dask.distributed](https://distributed.dask.org/).\n",
    "\n",
    "You can use any kind of dask cluster. This step is completely independent of\n",
    "xarray. While not strictly necessary, the dashboard provides a nice learning\n",
    "tool.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.distributed import Client\n",
    "\n",
    "client = Client()\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4",
   "metadata": {},
   "source": [
    "<p>&#128070</p> Click the Dashboard link above. Or click the \"Search\" button in the dashboard.\n",
    "\n",
    "Let's test that the dashboard is working..\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import dask.array\n",
    "\n",
    "dask.array.ones((1000, 4), chunks=(2, 1)).compute()  # should see activity in dashboard"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6",
   "metadata": {},
   "source": [
    "Let's open a dataset. We specify `chunks` so that the DataArrays are backed by dask arrays."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = xr.tutorial.open_dataset(\"air_temperature\", chunks={\"time\": 100})\n",
    "ds"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8",
   "metadata": {
    "tags": []
   },
   "source": [
    "## A simple example\n",
    "\n",
    "All the concepts from applying numpy functions carry over.\n",
    "\n",
    "However the handling of dask arrays needs to be explicitly activated.\n",
    "\n",
    "There are three options for the `dask` kwarg.\n",
    "\n",
    "```\n",
    "    dask : {\"forbidden\", \"allowed\", \"parallelized\"}, default: \"forbidden\"\n",
    "        How to handle applying to objects containing lazy data in the form of\n",
    "        dask arrays:\n",
    "\n",
    "        - 'forbidden' (default): raise an error if a dask array is encountered.\n",
    "        - 'allowed': pass dask arrays directly on to ``func``. Prefer this option if\n",
    "          ``func`` natively supports dask arrays.\n",
    "        - 'parallelized': automatically parallelize ``func`` if any of the\n",
    "          inputs are a dask array by using :py:func:`dask.array.apply_gufunc`. Multiple output\n",
    "          arguments are supported. Only use this option if ``func`` does not natively\n",
    "          support dask arrays (e.g. converts them to numpy arrays).\n",
    "```\n",
    "\n",
    "We will work through the following two:\n",
    "\n",
    "1. `dask=\"allowed\"` Dask arrays are passed to the user function. This is a good\n",
    "   choice if your function can handle dask arrays and won't compute the result unless \n",
    "   explicitly requested.\n",
    "2. `dask=\"parallelized\"`. This applies the user function over blocks of the dask\n",
    "   array using `dask.array.apply_gufunc`. This is useful when your function cannot\n",
    "   handle dask arrays natively (e.g. scipy API)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9",
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "# Expect an error here\n",
    "def squared_error(x, y):\n",
    "    return (x - y) ** 2\n",
    "\n",
    "\n",
    "xr.apply_ufunc(squared_error, ds.air, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10",
   "metadata": {
    "tags": []
   },
   "source": [
    "  \n",
    "A good thing to check is whether the applied function (here `squared_error`) can handle pure dask arrays. \n",
    "To do this call  `squared_error(ds.air.data, 1)` and make sure of the following:\n",
    "1. That you don't see any activity on the dask dashboard\n",
    "2. That the returned result is a dask array."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "squared_error(ds.air.data, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12",
   "metadata": {
    "tags": []
   },
   "source": [
    "Since `squared_error` can handle dask arrays without computing them, we specify\n",
    "`dask=\"allowed\"`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13",
   "metadata": {},
   "outputs": [],
   "source": [
    "sqer = xr.apply_ufunc(\n",
    "    squared_error,\n",
    "    ds.air,\n",
    "    1,\n",
    "    dask=\"allowed\",\n",
    ")\n",
    "sqer  # dask-backed DataArray! with nice metadata!"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Understanding what's happening\n",
    "\n",
    "Let's again use the wrapper trick to understand what `squared_error` receives.\n",
    "\n",
    "We see that it receives a dask array (analogous to the numpy array in the previous example)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15",
   "metadata": {},
   "outputs": [],
   "source": [
    "def wrapper(x, y):\n",
    "    print(f\"received x of type {type(x)}, shape {x.shape}\")\n",
    "    print(f\"received y of type {type(y)}\")\n",
    "    return squared_error(x, y)\n",
    "\n",
    "\n",
    "xr.apply_ufunc(wrapper, ds.air, 1, dask=\"allowed\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Core dimensions\n",
    "\n",
    "`squared_error` operated on a per-element basis. How about a reduction like `np.mean`?\n",
    "\n",
    "Such functions involve the concept of \"core dimensions\". This concept is independent of the underlying array type, and is a property of the applied function. See the [core dimensions with NumPy](core-dimensions) tutorial for more.\n",
    "\n",
    "\n",
    "::::{admonition} Exercise\n",
    ":class: tip\n",
    "\n",
    "Use `dask.array.mean` as an example of a function that can handle dask\n",
    "arrays and uses an `axis` kwarg. \n",
    "\n",
    ":::{admonition} Solution\n",
    ":class: dropdown\n",
    "\n",
    "```python\n",
    "def time_mean(da):\n",
    "    return xr.apply_ufunc(\n",
    "        dask.array.mean,\n",
    "        da,\n",
    "        input_core_dims=[[\"time\"]],\n",
    "        dask=\"allowed\",\n",
    "        kwargs={\"axis\": -1},  # core dimensions are moved to the end\n",
    "    )\n",
    "    \n",
    "time_mean(ds.air)\n",
    "```\n",
    ":::\n",
    "::::\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17",
   "metadata": {
    "tags": []
   },
   "source": [
    "Again, this is identical to the built-in `mean`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18",
   "metadata": {},
   "outputs": [],
   "source": [
    "def time_mean(da):\n",
    "    return xr.apply_ufunc(\n",
    "        dask.array.mean,\n",
    "        da,\n",
    "        input_core_dims=[[\"time\"]],\n",
    "        dask=\"allowed\",\n",
    "        kwargs={\"axis\": -1},  # core dimensions are moved to the end\n",
    "    )\n",
    "\n",
    "\n",
    "ds.air.mean(\"time\").identical(time_mean(ds.air))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Automatically parallelizing dask-unaware functions\n",
    "\n",
    "### Basics\n",
    "\n",
    "Not all functions can handle dask arrays appropriately by default.\n",
    "\n",
    "A very useful `apply_ufunc` feature is the ability to apply arbitrary functions\n",
    "in parallel to each block. This ability can be activated using\n",
    "`dask=\"parallelized\"`. \n",
    "\n",
    "We will use `scipy.integrate.trapezoid` as an example of a function that cannot\n",
    "handle dask arrays and requires a core dimension. If we call `trapezoid` with a dask\n",
    "array, we get a numpy array back; that is, the values have been eagerly computed.\n",
    "This is undesirable behaviour.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20",
   "metadata": {},
   "outputs": [],
   "source": [
    "import scipy as sp\n",
    "import scipy.integrate\n",
    "\n",
    "sp.integrate.trapezoid(\n",
    "    ds.air.data, axis=ds.air.get_axis_num(\"lon\")\n",
    ")  # does NOT return a dask array, you should see activity on the dashboard"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21",
   "metadata": {
    "tags": []
   },
   "source": [
    "Let's activate automatic parallelization by using `apply_ufunc` with `dask=\"parallelized\"`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22",
   "metadata": {},
   "outputs": [],
   "source": [
    "integrated = xr.apply_ufunc(\n",
    "    sp.integrate.trapezoid,\n",
    "    ds,\n",
    "    input_core_dims=[[\"lon\"]],\n",
    "    kwargs={\"axis\": -1},\n",
    "    dask=\"parallelized\",\n",
    ")\n",
    "integrated"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "23",
   "metadata": {
    "tags": []
   },
   "source": [
    "And make sure the returned data is a dask array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "integrated.air.data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "25",
   "metadata": {},
   "source": [
    "Now you have control over executing this parallel computation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dask -> Numpy array of integrated values\n",
    "parallelized_results = integrated.compute()\n",
    "parallelized_results"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Understanding `dask=\"parallelized\"`\n",
    "\n",
    "It is very important to understand what `dask=\"parallelized\"` does. Fully understanding it requires understanding some core concepts.\n",
    "\n",
    "```{seealso}\n",
    "For `dask=\"parallelized\"` `apply_ufunc` will call `dask.array.apply_gufunc`. See the dask documentation on [generalized ufuncs](https://docs.dask.org/en/stable/array-gufunc.html) and [`apply_gufunc`](https://docs.dask.org/en/stable/generated/dask.array.gufunc.apply_gufunc.html) for more.\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### Embarrassingly parallel or blockwise operations\n",
    "\n",
    "`dask=\"parallelized\"` works well for \"blockwise\" or \"embarrassingly parallel\" operations ([Wikipedia](https://en.wikipedia.org/wiki/Embarrassingly_parallel)).\n",
    "\n",
    "These are operations where one block or chunk of the output array corresponds to one block or chunk of the input array. Specifically, the blocks or chunks of the _core dimension_ are what matter. Importantly, no communication between blocks is necessary to create the output, which makes parallelization quite simple or \"embarrassing\".\n",
    "\n",
    "Let's look at the dask repr for `ds` and note that the chunksizes are (100,25,53) for an array with shape (2920, 25, 53). This means that each block or chunk of the array contains all `lat`, `lon` points and a subset of `time` points."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "ds.air.data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "30",
   "metadata": {
    "tags": []
   },
   "source": [
    "The core dimension for `trapezoid` is `lon`, and there is only one chunk along `lon`. This means that integrating along `lon` is a \"blockwise\" or \"embarrassingly parallel\" operation and `dask=\"parallelized\"` works quite well. \n",
    "\n",
    "```{caution} Question\n",
    "Do you understand why `integrate(ds)` when `ds` has a single chunk along `lon` is an \"embarrassingly parallel\" operation?\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31",
   "metadata": {
    "tags": []
   },
   "source": [
    "::::{admonition} Exercise\n",
    ":class: tip\n",
    "Apply the integrate function to `ds` after rechunking to have a different chunksize along `lon` using `ds.chunk(lon=4)` (for example). What happens?\n",
    "\n",
    ":::{admonition} Solution\n",
    ":class: dropdown\n",
    "\n",
    "`apply_ufunc` complains that it cannot automatically parallelize because the dataset `ds` is now chunked along the core dimension `lon`. You should see the following error:\n",
    "\n",
    "    ValueError: dimension lon on 0th function argument to apply_ufunc with dask='parallelized' \n",
    "    consists of multiple chunks, but is also a core dimension. To fix, either rechunk \n",
    "    into a single array chunk along this dimension, i.e., \n",
    "    ``.chunk(dict(lon=-1))``, or pass ``allow_rechunk=True`` in ``dask_gufunc_kwargs`` \n",
    "    but beware that this may significantly increase memory usage.\n",
    "\n",
    ":::\n",
    "::::"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "32",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### Understanding execution\n",
    "\n",
    "We are layering many concepts together here, so it is important to understand how the function is executed, and what input it will receive. Again we will use our wrapper trick."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def integrate_wrapper(array, **kwargs):\n",
    "    print(f\"received array of type {type(array)}, shape {array.shape}\")\n",
    "    result = sp.integrate.trapezoid(array, **kwargs)\n",
    "    print(f\"received array of type {type(result)}, shape {result.shape}\")\n",
    "    return result\n",
    "\n",
    "\n",
    "integrated = xr.apply_ufunc(\n",
    "    integrate_wrapper,\n",
    "    ds,\n",
    "    input_core_dims=[[\"lon\"]],\n",
    "    kwargs={\"axis\": -1},\n",
    "    dask=\"parallelized\",\n",
    ")\n",
    "integrated"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "34",
   "metadata": {},
   "source": [
    "Note that we received an Xarray object back (`integrated`) but our wrapper function was called with a numpy array of shape `(1,1,1)`.\n",
    "\n",
    "```{important}\n",
    "the full 3D array has **not yet been** passed to `integrate_wrapper`. Yet dask needs to know the shape and dtype of the result. This is key. \n",
    "```\n",
    "\n",
    "The `integrate_wrapper` function is treated like a black box, and its effect on the inputs has to either be described through additional keyword arguments, or inferred by passing dummy inputs.\n",
    "\n",
    "To do so, `dask.array.apply_gufunc` calls the user function with dummy inputs (here a numpy array of shape `(1,1,1)`), and inspects the returned value to understand that one dimension was removed (it returned a numpy array of shape `(1,1)`).\n",
    "\n",
    "````{caution}\n",
    ":class: dropdown\n",
    "\n",
    "Some functions can have trouble handling such dummy inputs. Alternatively you can pass `meta = np.ones((1,1))` in `dask_gufunc_kwargs` to prevent dask from providing dummy inputs to the function.\n",
    "```python\n",
    "xr.apply_ufunc(\n",
    "    integrate_wrapper,\n",
    "    ds,\n",
    "    input_core_dims=[[\"lon\"]],\n",
    "    kwargs={\"axis\": -1},\n",
    "    dask=\"parallelized\",\n",
    "    dask_gufunc_kwargs={\"meta\": np.ones((1,1))},\n",
    ")\n",
    "```\n",
    "````\n",
    "\n",
    "Since no errors were raised we proceed as-is.\n",
    "\n",
    "Let's compute the array to get real values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35",
   "metadata": {
    "tags": [
     "output-scroll"
    ]
   },
   "outputs": [],
   "source": [
    "integrated.compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "36",
   "metadata": {},
   "source": [
    "We see that `integrate_wrapper` is called many times! As many times as there are blocks in the array in fact, which is 30 here (`ds.air.data.numblocks`).\n",
    "\n",
    "Our function is independently executed on each block of the array, and then the results are concatenated to form the final result.\n",
    "\n",
    "Conceptually, there is a two-way flow of information between various packages when executing `integrated.compute()`:\n",
    "\n",
    "`xarray.apply_ufunc` ↔ `dask.array.apply_gufunc` ↔ `integrate_wrapper` ↔ `scipy.integrate.trapezoid` ↔ `ds.air.data`\n",
    "\n",
    "\n",
    "When executed\n",
    "\n",
    "1. Xarray loops over all data variables.\n",
    "1. Xarray unwraps the underlying dask array (e.g. `ds.air.data`) and passes that to dask's `apply_gufunc`.\n",
    "1. `apply_gufunc` calls `integrate_wrapper` on each block of the array.\n",
    "1. For each block, `integrate_wrapper` calls `scipy.integrate.trapezoid` and returns one block of the output array.\n",
    "1. dask stitches all the output blocks to form the output array.\n",
    "1. `xarray.apply_ufunc` wraps the output array with Xarray metadata to give the final result.\n",
    "\n",
    "Phew!\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "37",
   "metadata": {
    "tags": []
   },
   "source": [
    "## More complex situations\n",
    "\n",
    "Here we quickly demonstrate that all the concepts from the numpy material earlier carry over.\n",
    "\n",
    "Xarray needs a lot of extra metadata, so depending\n",
    "on the function, extra arguments such as `output_dtypes` and `output_sizes` may\n",
    "be necessary for supporting dask arrays. We demonstrate this below."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Adding new dimensions\n",
    "\n",
    "We use `np.expand_dims` to add a new dimension of size 1 to the input.\n",
    "\n",
    "```python\n",
    "def add_new_dim(array):\n",
    "    return np.expand_dims(array, axis=-1)\n",
    "```\n",
    "\n",
    "When automatically parallelizing with `dask`, we need to provide some more information about the outputs.\n",
    "1. When adding new dimensions, we need to provide their sizes in `dask_gufunc_kwargs` using the key `output_sizes`\n",
    "2. Usually we need to provide the datatype or `dtype` of the returned array. Usually the dtype of the input is a good guess."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39",
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "def add_new_dim(array):\n",
    "    return np.expand_dims(array, axis=-1)\n",
    "\n",
    "\n",
    "xr.apply_ufunc(\n",
    "    add_new_dim,  # first the function\n",
    "    ds.air.chunk({\"time\": 2, \"lon\": 2}),\n",
    "    output_core_dims=[[\"newdim\"]],\n",
    "    dask=\"parallelized\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "40",
   "metadata": {},
   "source": [
    "Provide the size of the newly added dimension `newdim` in `output_sizes` as part of the `dask_gufunc_kwargs` keyword argument:\n",
    "\n",
    "    dask_gufunc_kwargs (dict, optional) – Optional keyword arguments passed to dask.array.apply_gufunc() \n",
    "    if dask=’parallelized’. Possible keywords are output_sizes, allow_rechunk and meta.\n",
    "    \n",
    "The syntax is \n",
    "```python\n",
    "dask_gufunc_kwargs={\n",
    "    \"output_sizes\": {\"newdim\": 1}\n",
    "}\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "xr.apply_ufunc(\n",
    "    add_new_dim,  # first the function\n",
    "    ds.air.chunk({\"time\": 2, \"lon\": 2}),\n",
    "    output_core_dims=[[\"newdim\"]],\n",
    "    dask=\"parallelized\",\n",
    "    dask_gufunc_kwargs={\"output_sizes\": {\"newdim\": 1}},\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42",
   "metadata": {},
   "source": [
    "### Dimensions that change size"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "43",
   "metadata": {},
   "source": [
    "We will now repeat the [interpolation example from earlier](interp-add-new-dim) with `\"lat\"` as the output core dimension. See the numpy notebook on [complex output](complex-output) for more.\n",
    "\n",
    "```python\n",
    "newlat = np.linspace(15, 75, 100)\n",
    "\n",
    "xr.apply_ufunc(\n",
    "    np.interp,\n",
    "    newlat,\n",
    "    ds.air.lat,\n",
    "    ds.air.chunk({\"time\": 2, \"lon\": 2}),\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], [\"lat\"]],\n",
    "    output_core_dims=[[\"lat\"]],\n",
    "    exclude_dims={\"lat\"},\n",
    ")\n",
    "```\n",
    "\n",
    "We will first add `dask=\"parallelized\"` and provide `output_sizes` in `dask_gufunc_kwargs`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44",
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "newlat = np.linspace(15, 75, 100)\n",
    "\n",
    "xr.apply_ufunc(\n",
    "    np.interp,  # first the function\n",
    "    newlat,\n",
    "    ds.air.lat,\n",
    "    ds.air.chunk({\"time\": 2, \"lon\": 2}),\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], [\"lat\"]],\n",
    "    output_core_dims=[[\"lat\"]],\n",
    "    exclude_dims={\"lat\"},\n",
    "    # The following are dask-specific\n",
    "    dask=\"parallelized\",\n",
    "    dask_gufunc_kwargs=dict(output_sizes={\"lat\": len(newlat)}),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "45",
   "metadata": {},
   "source": [
    "This error means that we need to provide `output_dtypes`\n",
    "\n",
    "    output_dtypes (list of dtype, optional) – Optional list of output dtypes. \n",
    "    Only used if dask='parallelized' or vectorize=True."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "newlat = np.linspace(15, 75, 100)\n",
    "\n",
    "xr.apply_ufunc(\n",
    "    np.interp,  # first the function\n",
    "    newlat,\n",
    "    ds.air.lat,\n",
    "    ds.air.chunk({\"time\": 100, \"lon\": -1}),\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], [\"lat\"]],\n",
    "    output_core_dims=[[\"lat\"]],\n",
    "    exclude_dims={\"lat\"},\n",
    "    # The following are dask-specific\n",
    "    dask=\"parallelized\",\n",
    "    dask_gufunc_kwargs=dict(output_sizes={\"lat\": len(newlat)}),\n",
    "    output_dtypes=[ds.air.dtype],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47",
   "metadata": {},
   "source": [
    "```{tip}\n",
    "Dask can sometimes figure out the output sizes and dtypes. The usual workflow is to read the error messages and iteratively pass more information to `apply_ufunc`.\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "48",
   "metadata": {},
   "source": [
    "### Automatic Vectorizing\n",
    "\n",
    "[Automatic vectorizing](vectorize) with `vectorize=True` also carries over!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "interped = xr.apply_ufunc(\n",
    "    np.interp,  # first the function\n",
    "    newlat,\n",
    "    ds.air.lat,\n",
    "    ds.chunk({\"time\": 100, \"lon\": -1}),\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], [\"lat\"]],\n",
    "    output_core_dims=[[\"lat\"]],\n",
    "    exclude_dims={\"lat\"},  # dimensions allowed to change size. Must be set!\n",
    "    dask=\"parallelized\",\n",
    "    dask_gufunc_kwargs=dict(output_sizes={\"lat\": len(newlat)}),\n",
    "    output_dtypes=[ds.air.dtype],\n",
    "    vectorize=True,\n",
    ")\n",
    "interped"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "50",
   "metadata": {},
   "source": [
    "Again, it is important to understand the conceptual flow of information between the variuus packages when executing `interped.compute()` which looks ilke\n",
    "\n",
    "`xarray.apply_ufunc` ↔ `dask.array.apply_gufunc` ↔ `numpy.vectorize` ↔ `numpy.interp`\n",
    "\n",
    "\n",
    "When executed\n",
    "\n",
    "1. Xarray loops over all data variables.\n",
    "1. Xarray unwraps the underlying dask array (e.g. `ds.air`) and passes that to dask's `apply_gufunc`.\n",
    "1. `apply_gufunc` calls the vectorized function on each block of the array.\n",
    "1. For each block, `numpy.vectorize` handles looping over the loop dimensions \n",
    "   and passes 1D vectors along the core dimension to `numpy.interp`\n",
    "1. The 1D results for each block are concatenated by `numpy.vectorize` to create one output block.\n",
    "1. dask stitches all the output blocks to form the output array.\n",
    "1. `xarray.apply_ufunc` wraps the output array with Xarray metadata to give the final result.\n",
    "\n",
    "Phew!\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "51",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Clean up the cluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52",
   "metadata": {
    "tags": [
     "remove-output"
    ]
   },
   "outputs": [],
   "source": [
    "client.close();"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: advanced/apply_ufunc/example-interp.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# np.interp : An end-to-end example\n",
    "\n",
    "**Author** [Deepak Cherian (NCAR)](https://cherian.net)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "This example will illustrate how to conveniently apply an unvectorized function `func` to xarray objects using `apply_ufunc`. `func` expects 1D numpy arrays and returns a 1D numpy array. Our goal is to conveniently apply this function along a dimension of xarray objects that may or may not wrap dask arrays with a signature.\n",
    "\n",
    "We will illustrate this using [`np.interp`](https://numpy.org/doc/stable/reference/generated/numpy.interp.html): \n",
    "\n",
    "    Signature: np.interp(x, xp, fp, left=None, right=None, period=None)\n",
    "    Docstring:\n",
    "        One-dimensional linear interpolation.\n",
    "\n",
    "    Returns the one-dimensional piecewise linear interpolant to a function\n",
    "    with given discrete data points (`xp`, `fp`), evaluated at `x`.\n",
    "\n",
    "and write an `xr_interp` function with signature\n",
    "\n",
    "    xr_interp(xarray_object, dimension_name, new_coordinate_to_interpolate_to)\n",
    "    \n",
    "    \n",
    "## Learning goals \n",
    "\n",
    "Our goal is to use `apply_ufunc` with a general function so that we can reuse our code to apply to different xarray datasets or along different dimensions. Specifically, this example will illustrate \n",
    "1. Specifying core dimensions with `input_core_dims`\n",
    "1. Handling core dimensions of the output with `output_core_dims`\n",
    "1. Handling core dimensions that change size using `exclude_dims`\n",
    "1. Automatic vectorizing or looping over dimensions that are not core dimensions using `vectorize=True`\n",
    "1. Automatically parallelization with dask arrays using `dask=\"parallelized\"`\n",
    "1. High-performance vectorization with numba and `vectorize=False`.\n",
    "\n",
    "It puts together all the concepts covered earlier.\n",
    "\n",
    "\n",
    "```{tip}\n",
    "We'll reduce the length of error messages using in this tutorial using `%xmode minimal` See the [ipython documentation](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-xmode) for details.\n",
    "```\n",
    "\n",
    "## Load data\n",
    "\n",
    "First lets load an example dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%xmode minimal\n",
    "\n",
    "import xarray as xr\n",
    "import numpy as np\n",
    "\n",
    "# limit the amount of information printed to screen\n",
    "xr.set_options(display_expand_data=False)\n",
    "np.set_printoptions(threshold=10, edgeitems=2)\n",
    "\n",
    "air = (\n",
    "    xr.tutorial.load_dataset(\"air_temperature\")\n",
    "    .air.sortby(\"lat\")  # np.interp needs coordinate in ascending order\n",
    "    .isel(time=slice(4), lon=slice(3))\n",
    ")  # choose a small subset for convenience\n",
    "air"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "The function we will apply is `np.interp` which expects 1D numpy arrays. This functionality is already implemented in xarray so we use that capability to make sure we are not making mistakes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "newlat = np.linspace(15, 75, 100)\n",
    "air.interp(lat=newlat)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's define a function that works with one vector of data along `lat` at a time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def interp1d_np(data, x, xi):\n",
    "    return np.interp(xi, x, data)\n",
    "\n",
    "\n",
    "interped = interp1d_np(air.isel(time=0, lon=0), air.lat, newlat)\n",
    "expected = air.interp(lat=newlat)\n",
    "\n",
    "# no errors are raised if values are equal to within floating point precision\n",
    "np.testing.assert_allclose(expected.isel(time=0, lon=0).values, interped)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "No errors are raised so our interpolation is working.\n",
    "\n",
    "This function consumes and returns numpy arrays, which means we need to do a lot of work to convert the result back to an xarray object with meaningful metadata. This is where `apply_ufunc` is very useful.\n",
    "\n",
    "## `apply_ufunc`\n",
    "\n",
    "    Apply a vectorized function for unlabeled arrays on xarray objects.\n",
    "\n",
    "    The function will be mapped over the data variable(s) of the input arguments using \n",
    "    xarray’s standard rules for labeled computation, including alignment, broadcasting, \n",
    "    looping over GroupBy/Dataset variables, and merging of coordinates.\n",
    "    \n",
    "`apply_ufunc` has many capabilities but for simplicity this example will focus on the common task of vectorizing 1D functions over nD xarray objects. We will iteratively build up the right set of arguments to `apply_ufunc` and read through many error messages in doing so."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "xr.apply_ufunc(\n",
    "    interp1d_np,  # first the function\n",
    "    air.isel(time=0, lon=0),  # now arguments in the order expected by 'interp1_np'\n",
    "    air.lat,\n",
    "    newlat,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`apply_ufunc` needs to know a lot of information about what our function does so that it can reconstruct the outputs. In this case, the size of dimension lat has changed and we need to explicitly specify that this will happen. xarray helpfully tells us that we need to specify the kwarg `exclude_dims`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## `exclude_dims`\n",
    "\n",
    "\n",
    "```\n",
    "exclude_dims : set, optional\n",
    "        Core dimensions on the inputs to exclude from alignment and\n",
    "        broadcasting entirely. Any input coordinates along these dimensions\n",
    "        will be dropped. Each excluded dimension must also appear in\n",
    "        ``input_core_dims`` for at least one argument. Only dimensions listed\n",
    "        here are allowed to change size between input and output objects.\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "xr.apply_ufunc(\n",
    "    interp1d_np,  # first the function\n",
    "    air.isel(time=0, lon=0),  # now arguments in the order expected by 'interp1_np'\n",
    "    air.lat,\n",
    "    newlat,\n",
    "    exclude_dims=set((\"lat\",)),  # dimensions allowed to change size. Must be set!\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Core dimensions\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Core dimensions are central to using `apply_ufunc`. In our case, our function expects to receive a 1D vector along `lat` &#x2014; this is the dimension that is \"core\" to the function's functionality. Multiple core dimensions are possible. `apply_ufunc` needs to know which dimensions of each variable are core dimensions.\n",
    "\n",
    "    input_core_dims : Sequence[Sequence], optional\n",
    "        List of the same length as ``args`` giving the list of core dimensions\n",
    "        on each input argument that should not be broadcast. By default, we\n",
    "        assume there are no core dimensions on any input arguments.\n",
    "\n",
    "        For example, ``input_core_dims=[[], ['time']]`` indicates that all\n",
    "        dimensions on the first argument and all dimensions other than 'time'\n",
    "        on the second argument should be broadcast.\n",
    "\n",
    "        Core dimensions are automatically moved to the last axes of input\n",
    "        variables before applying ``func``, which facilitates using NumPy style\n",
    "        generalized ufuncs [2]_.\n",
    "        \n",
    "    output_core_dims : List[tuple], optional\n",
    "        List of the same length as the number of output arguments from\n",
    "        ``func``, giving the list of core dimensions on each output that were\n",
    "        not broadcast on the inputs. By default, we assume that ``func``\n",
    "        outputs exactly one array, with axes corresponding to each broadcast\n",
    "        dimension.\n",
    "\n",
    "        Core dimensions are assumed to appear as the last dimensions of each\n",
    "        output in the provided order.\n",
    "        \n",
    "Next we specify `\"lat\"` as `input_core_dims` on both `air` and `air.lat`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "xr.apply_ufunc(\n",
    "    interp1d_np,  # first the function\n",
    "    air.isel(time=0, lon=0),  # now arguments in the order expected by 'interp1_np'\n",
    "    air.lat,\n",
    "    newlat,\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], []],\n",
    "    exclude_dims=set((\"lat\",)),  # dimensions allowed to change size. Must be set!\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "xarray is telling us that it expected to receive back a numpy array with 0 dimensions but instead received an array with 1 dimension corresponding to `newlat`. We can fix this by specifying `output_core_dims`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "xr.apply_ufunc(\n",
    "    interp1d_np,  # first the function\n",
    "    air.isel(time=0, lon=0),  # now arguments in the order expected by 'interp1_np'\n",
    "    air.lat,\n",
    "    newlat,\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], []],  # list with one entry per arg\n",
    "    output_core_dims=[[\"lat\"]],\n",
    "    exclude_dims=set((\"lat\",)),  # dimensions allowed to change size. Must be set!\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally we get some output! Let's check that this is right\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "interped = xr.apply_ufunc(\n",
    "    interp1d_np,  # first the function\n",
    "    air.isel(time=0, lon=0),  # now arguments in the order expected by 'interp1_np'\n",
    "    air.lat,\n",
    "    newlat,\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], []],  # list with one entry per arg\n",
    "    output_core_dims=[[\"lat\"]],\n",
    "    exclude_dims=set((\"lat\",)),  # dimensions allowed to change size. Must be set!\n",
    ")\n",
    "interped[\"lat\"] = newlat  # need to add this manually\n",
    "xr.testing.assert_allclose(expected.isel(time=0, lon=0), interped)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "No errors are raised so it is right!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Automatic vectorization with `np.vectorize`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now our function currently only works on one vector of data which is not so useful given our 3D dataset.\n",
    "Let's try passing the whole dataset. We add a `print` statement so we can see what our function receives."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "def interp1d_np(data, x, xi):\n",
    "    print(f\"data: {data.shape} | x: {x.shape} | xi: {xi.shape}\")\n",
    "    return np.interp(xi, x, data)\n",
    "\n",
    "\n",
    "interped = xr.apply_ufunc(\n",
    "    interp1d_np,  # first the function\n",
    "    air.isel(lon=slice(3), time=slice(4)),  # now arguments in the order expected by 'interp1_np'\n",
    "    air.lat,\n",
    "    newlat,\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], []],  # list with one entry per arg\n",
    "    output_core_dims=[[\"lat\"]],\n",
    "    exclude_dims=set((\"lat\",)),  # dimensions allowed to change size. Must be set!\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "That's a hard-to-interpret error but our `print` call helpfully printed the shapes of the input data: \n",
    "\n",
    "    data: (10, 53, 25) | x: (25,) | xi: (100,)\n",
    "\n",
    "We see that `air` has been passed as a 3D numpy array which is not what `np.interp` expects. Instead we want loop over all combinations of `lon` and `time`; and apply our function to each corresponding vector of data along `lat`.\n",
    "\n",
    "\n",
    "`apply_ufunc` makes this easy by specifying `vectorize=True`:\n",
    "\n",
    "    vectorize : bool, optional\n",
    "        If True, then assume ``func`` only takes arrays defined over core\n",
    "        dimensions as input and vectorize it automatically with\n",
    "        :py:func:`numpy.vectorize`. This option exists for convenience, but is\n",
    "        almost always slower than supplying a pre-vectorized function.\n",
    "        Using this option requires NumPy version 1.12 or newer.\n",
    "        \n",
    "```{caution}\n",
    "The documentation for [`np.vectorize`](https://numpy.org/doc/stable/reference/generated/numpy.vectorize.html) points out that\n",
    "\"The vectorize function is provided primarily for convenience, not for performance. The implementation is essentially a for loop.\"\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "def interp1d_np(data, x, xi):\n",
    "    print(f\"data: {data.shape} | x: {x.shape} | xi: {xi.shape}\")\n",
    "    return np.interp(xi, x, data)\n",
    "\n",
    "\n",
    "interped = xr.apply_ufunc(\n",
    "    interp1d_np,  # first the function\n",
    "    air,  # now arguments in the order expected by 'interp1_np'\n",
    "    air.lat,  # as above\n",
    "    newlat,  # as above\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], []],  # list with one entry per arg\n",
    "    output_core_dims=[[\"lat\"]],  # returned data has one dimension\n",
    "    exclude_dims=set((\"lat\",)),  # dimensions allowed to change size. Must be set!\n",
    "    vectorize=True,  # loop over non-core dims\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This unfortunately is another cryptic error from numpy. \n",
    "\n",
    "Notice that `newlat` is not an xarray object. Let's add a dimension name `new_lat` and modify the call. Note this cannot be `lat` because xarray expects dimensions to be the same size (or broadcastable) among all inputs. `output_core_dims` needs to be modified appropriately. We'll manually rename `new_lat` back to `lat` for easy checking."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def interp1d_np(data, x, xi):\n",
    "    print(f\"data: {data.shape} | x: {x.shape} | xi: {xi.shape}\")\n",
    "    return np.interp(xi, x, data)\n",
    "\n",
    "\n",
    "interped = xr.apply_ufunc(\n",
    "    interp1d_np,  # first the function\n",
    "    air,  # now arguments in the order expected by 'interp1_np'\n",
    "    air.lat,  # as above\n",
    "    newlat,  # as above\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], [\"new_lat\"]],  # list with one entry per arg\n",
    "    output_core_dims=[[\"new_lat\"]],  # returned data has one dimension\n",
    "    exclude_dims=set((\"lat\",)),  # dimensions allowed to change size. Must be a set!\n",
    "    vectorize=True,  # loop over non-core dims\n",
    ")\n",
    "interped = interped.rename({\"new_lat\": \"lat\"})\n",
    "interped[\"lat\"] = newlat  # need to add this manually\n",
    "xr.testing.assert_allclose(\n",
    "    expected.transpose(*interped.dims), interped\n",
    ")  # order of dims is different\n",
    "interped"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notice that the printed input shapes are all 1D and correspond to one vector along the `lat` dimension.\n",
    "\n",
    "The result is now an xarray object with coordinate values copied over from `data`. This is why `apply_ufunc` is so convenient; it takes care of a lot of boilerplate necessary to apply functions that consume and produce numpy arrays to xarray objects.\n",
    "\n",
    "One final point: `lat` is now the *last* dimension in `interped`. This is a \"property\" of core dimensions: they are moved to the end before being sent to `interp1d_np` as was noted in the docstring for `input_core_dims`\n",
    "\n",
    "        Core dimensions are automatically moved to the last axes of input\n",
    "        variables before applying ``func``, which facilitates using NumPy style\n",
    "        generalized ufuncs [2]_."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parallelization with dask\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "So far our function can only handle numpy arrays. A real benefit of `apply_ufunc` is the ability to easily parallelize over dask chunks _when needed_. \n",
    "\n",
    "We want to apply this function in a vectorized fashion over each chunk of the dask array. This is possible using dask's `blockwise`, `map_blocks`, or `apply_gufunc`. Xarray's `apply_ufunc` wraps dask's `apply_gufunc` and asking it to map the function over chunks using `apply_gufunc` is as simple as specifying `dask=\"parallelized\"`. With this level of flexibility we need to provide dask with some extra information: \n",
    "  1. `output_dtypes`: dtypes of all returned objects, and \n",
    "  2. `output_sizes`: lengths of any new dimensions. \n",
    "  \n",
    "Here we need to specify `output_dtypes` since `apply_ufunc` can infer the size of the new dimension `new_lat` from the argument corresponding to the third element in `input_core_dims`. \n",
    "\n",
    "Here I choose the chunk sizes to illustrate that `np.vectorize` is still applied so that our function receives 1D vectors even though the blocks are 3D."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def interp1d_np(data, x, xi):\n",
    "    print(f\"data: {data.shape} | x: {x.shape} | xi: {xi.shape}\")\n",
    "    return np.interp(xi, x, data)\n",
    "\n",
    "\n",
    "interped = xr.apply_ufunc(\n",
    "    interp1d_np,  # first the function\n",
    "    air.chunk({\"time\": 2, \"lon\": 2}),  # now arguments in the order expected by 'interp1_np'\n",
    "    air.lat,  # as above\n",
    "    newlat,  # as above\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], [\"new_lat\"]],  # list with one entry per arg\n",
    "    output_core_dims=[[\"new_lat\"]],  # returned data has one dimension\n",
    "    exclude_dims=set((\"lat\",)),  # dimensions allowed to change size. Must be a set!\n",
    "    vectorize=True,  # loop over non-core dims\n",
    "    dask=\"parallelized\",\n",
    "    output_dtypes=[air.dtype],  # one per output\n",
    ").rename({\"new_lat\": \"lat\"})\n",
    "interped[\"lat\"] = newlat  # need to add this manually\n",
    "xr.testing.assert_allclose(expected.transpose(*interped.dims), interped)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Yay! our function is receiving 1D vectors, so we've successfully parallelized applying a 1D function over a block. If you have a distributed dashboard up, you should see computes happening as equality is checked.\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## High performance vectorization: gufuncs, numba & guvectorize\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "`np.vectorize` is a very convenient function but is unfortunately slow. It is only marginally faster than writing a for loop in Python and looping. A common way to get around this is to write a base interpolation function that can handle nD arrays in a compiled language like Fortran and then pass that to `apply_ufunc`.\n",
    "\n",
    "Another option is to use the numba package which provides a very [convenient `guvectorize` decorator](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator). Any decorated function gets compiled and will loop over any non-core dimension in parallel when necessary. \n",
    "\n",
    "We need to specify some extra information:\n",
    "\n",
    "   1. Our function cannot return a variable any more. Instead it must receive a variable (the last argument) whose contents the function will modify. So we change from `def interp1d_np(data, x, xi)` to `def interp1d_np_gufunc(data, x, xi, out)`. Our computed results must be assigned to `out`. All values of `out` must be assigned explicitly.\n",
    "   \n",
    "   2. `guvectorize` needs to know the dtypes of the input and output. This is specified in string form as the first argument. Each element of the tuple corresponds to each argument of the function. In this case, we specify `float64` for all inputs and outputs: `\"(float64[:], float64[:], float64[:], float64[:])\"` corresponding to `data, x, xi, out`\n",
    "   \n",
    "   3. Now we need to tell numba the size of the dimensions the function takes as inputs and returns as output i.e. _core dimensions_. This is done in symbolic form i.e. `data` and `x` are vectors of the same length, say `n`; `xi` and the output `out` have a different length, say `m`. So the second argument is (again as a string)\n",
    "         `\"(n), (n), (m) -> (m).\"` corresponding again to `data, x, xi, out`\n",
    "         \n",
    "```{seealso}\n",
    "\n",
    "Read the [numba documentation](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator) for more details.\n",
    "```\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from numba import float64, guvectorize\n",
    "\n",
    "\n",
    "@guvectorize(\"(float64[:], float64[:], float64[:], float64[:])\", \"(n), (n), (m) -> (m)\")\n",
    "def interp1d_np_gufunc(data, x, xi, out):\n",
    "    # numba doesn't really like this.\n",
    "    print(\"data: \" + str(data.shape) + \" | x:\" + str(x.shape) + \" | xi: \" + str(xi.shape))\n",
    "    out[:] = np.interp(xi, x, data)\n",
    "    # gufuncs don't return data\n",
    "    # instead you assign to a the last arg\n",
    "    # return np.interp(xi, x, data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "The warnings are about [object-mode compilation](https://numba.readthedocs.io/en/stable/user/performance-tips.html) relating to the `print` statement. This means we don't get much speed up. We'll keep the `print` statement temporarily to make sure that `guvectorize` acts like we want it to."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "interped = xr.apply_ufunc(\n",
    "    interp1d_np_gufunc,  # first the function\n",
    "    air.chunk({\"time\": 2, \"lon\": 2}),  # now arguments in the order expected by 'interp1_np'\n",
    "    air.lat,  # as above\n",
    "    newlat,  # as above\n",
    "    input_core_dims=[[\"lat\"], [\"lat\"], [\"new_lat\"]],  # list with one entry per arg\n",
    "    output_core_dims=[[\"new_lat\"]],  # returned data has one dimension\n",
    "    exclude_dims=set((\"lat\",)),  # dimensions allowed to change size. Must be a set!\n",
    "    # vectorize=True,  # not needed since numba takes care of vectorizing\n",
    "    dask=\"parallelized\",\n",
    "    output_dtypes=[air.dtype],  # one per output\n",
    ").rename({\"new_lat\": \"lat\"})\n",
    "interped[\"lat\"] = newlat  # need to add this manually\n",
    "xr.testing.assert_allclose(expected.transpose(*interped.dims), interped)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "Yay! Our function is receiving 1D vectors and is working automatically with dask arrays. \n",
    "\n",
    "Finally let's comment out the print line and wrap everything up in a nice reusable function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numba import float64, guvectorize\n",
    "\n",
    "\n",
    "@guvectorize(\n",
    "    \"(float64[:], float64[:], float64[:], float64[:])\",\n",
    "    \"(n), (n), (m) -> (m)\",\n",
    "    nopython=True,\n",
    ")\n",
    "def interp1d_np_gufunc(data, x, xi, out):\n",
    "    out[:] = np.interp(xi, x, data)\n",
    "\n",
    "\n",
    "def xr_interp(data, dim, newdim):\n",
    "    interped = xr.apply_ufunc(\n",
    "        interp1d_np_gufunc,  # first the function\n",
    "        data,  # now arguments in the order expected by 'interp1_np'\n",
    "        data[dim],  # as above\n",
    "        newdim,  # as above\n",
    "        input_core_dims=[[dim], [dim], [\"__newdim__\"]],  # list with one entry per arg\n",
    "        output_core_dims=[[\"__newdim__\"]],  # returned data has one dimension\n",
    "        exclude_dims=set((dim,)),  # dimensions allowed to change size. Must be a set!\n",
    "        # vectorize=True,  # not needed since numba takes care of vectorizing\n",
    "        dask=\"parallelized\",\n",
    "        output_dtypes=[data.dtype],  # one per output; could also be float or np.dtype(\"float64\")\n",
    "    ).rename({\"__newdim__\": dim})\n",
    "    interped[dim] = newdim  # need to add this manually\n",
    "\n",
    "    return interped\n",
    "\n",
    "\n",
    "xr.testing.assert_allclose(\n",
    "    expected.transpose(*interped.dims),\n",
    "    xr_interp(air.chunk({\"time\": 2, \"lon\": 2}), \"lat\", newlat),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Summary\n",
    "\n",
    "This technique is generalizable to any 1D function that [can be compiled](https://numba.readthedocs.io/en/stable/reference/pysupported.html#pysupported) by Numba."
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  },
  "nbsphinx": {
   "allow_errors": true
  },
  "org": null,
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: advanced/apply_ufunc/numba-vectorization.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Fast vectorization with Numba\n",
    "\n",
    "<img src=\"https://numba.pydata.org/_static/numba-blue-horizontal-rgb.svg\" width=\"40%\" align=\"right\">"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1",
   "metadata": {
    "tags": []
   },
   "source": [
    "`np.vectorize` is a very convenient function but is unfortunately slow. It is only marginally faster than writing an explicit for loop in Python. \n",
    "\n",
    "A common way to get around this is to write a base interpolation function that can handle nD arrays in a compiled language like C or Fortran and then pass that to `apply_ufunc`.\n",
    "\n",
    "Another option is to use the [numba package](https://numba.pydata.org/) which provides two very convenient decorators to build [numpy universal functions or ufuncs](https://numba.readthedocs.io/en/stable/user/vectorize.html):\n",
    "1. [`vectorize`](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-vectorize-decorator) for functions that act on scalars, and \n",
    "2. [`guvectorize`](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator) for functions that operate on subsets of the array along core-dimensions. Any decorated function gets compiled and will loop over the loop dimensions in parallel when necessary. \n",
    "\n",
    "For `apply_ufunc` the key concept is that we must provide `vectorize=False` (the default) when using Numba vectorized functions. \n",
    "Numba handles the vectorization (or looping) and `apply_ufunc` handles converting Xarray objects to bare arrays and handling metadata."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%xmode minimal\n",
    "\n",
    "import numpy as np\n",
    "import xarray as xr\n",
    "\n",
    "da = xr.DataArray(\n",
    "    np.arange(12).reshape(3, 4),\n",
    "    dims=(\"x\", \"y\"),\n",
    "    coords={\"x\": [12, 13, 14]},\n",
    "    attrs={\"foo\": \"bar\"},\n",
    ")\n",
    "da"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4",
   "metadata": {},
   "source": [
    "## `vectorize`\n",
    "\n",
    "Our `squared_error` example from earlier works element-by-element, and is a great example for `vectorize`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from numba import vectorize, float64\n",
    "\n",
    "\n",
    "@vectorize([float64(float64, float64)])\n",
    "def squared_error(x, y):\n",
    "    return (x - y) ** 2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6",
   "metadata": {},
   "source": [
    "See the numba documentation to understand `@vectorize([float64(float64, float64)])`\n",
    "\n",
    "Now use `apply_ufunc` to apply it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "xr.apply_ufunc(squared_error, da, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8",
   "metadata": {},
   "source": [
    "## `guvectorize`\n",
    "\n",
    "`guvectorize` is for functions that work on small subsets of the data. Quoting the Numba documentation\n",
    "> While `vectorize()` allows you to write ufuncs that work on one element at a time, the `guvectorize()` decorator takes the concept one step further and allows you to write ufuncs that will work on an arbitrary number of elements of input arrays, and take and return arrays of differing dimensions. The typical example is a running median or a convolution filter.\n",
    "\n",
    "This description should remind you of `apply_ufunc`!\n",
    "\n",
    "We will use the example function `g` from the [numba docs](https://numba.readthedocs.io/en/stable/user/vectorize.html#the-guvectorize-decorator), which adds a scalar `y` to a 1D vector `x`. The `res` argument here will contain the output (this is a Numba detail).\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from numba import guvectorize, int64\n",
    "\n",
    "\n",
    "@guvectorize([(int64[:], int64, int64[:])], '(n),()->(n)')\n",
    "def g(x, y, res):\n",
    "    for i in range(x.shape[0]):\n",
    "        res[i] = x[i] + y\n",
    "\n",
    "\n",
    "a = np.arange(5)\n",
    "g(a, 2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10",
   "metadata": {},
   "source": [
    "Unlike `squared_error` we cannot pass an Xarray object to `g` directly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11",
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "g(da, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12",
   "metadata": {},
   "source": [
    "Now use `apply_ufunc`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "xr.apply_ufunc(\n",
    "    g,\n",
    "    da,\n",
    "    1,\n",
    "    input_core_dims=[[\"x\"], []],\n",
    "    output_core_dims=[[\"x\"]],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14",
   "metadata": {},
   "source": [
    "Notice the following:\n",
    "1. The `guvectorize` decorator includes the concept of \"core dimensions\": `'(n),()->(n)'`. This string means that the `g` takes a 1D vector of size `n`, a scalar, and returns a 1D vector of size `n`. There is one core dimension for the input, and one core dimension for the output. Both core dimensions have the same size.\n",
    "2. That string translates to `input_core_dims=[[\"x\"], []], output_core_dims=[[\"x\"]]` in `apply_ufunc`.\n",
    "3. We don't provide `vectorize=True` to `apply_ufunc` since `numba` will handle the vectorization in compiled code automatically."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15",
   "metadata": {},
   "source": [
    "## With dask\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16",
   "metadata": {},
   "source": [
    "Use the chunked DataArray"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "da_dask = da.chunk({\"y\": 1})\n",
    "da_dask"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18",
   "metadata": {},
   "source": [
    "::::{admonition} Exercise\n",
    ":class: tip\n",
    "\n",
    "Apply `g` to `da_dask`\n",
    "\n",
    ":::{admonition} Solution\n",
    ":class: dropdown\n",
    "\n",
    "```python\n",
    "xr.apply_ufunc(\n",
    "    g,\n",
    "    da_dask, \n",
    "    1, \n",
    "    input_core_dims=[[\"x\"], []], \n",
    "    output_core_dims=[[\"x\"]],\n",
    "    dask=\"parallelized\",\n",
    ")\n",
    "```\n",
    ":::\n",
    "::::"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19",
   "metadata": {},
   "source": [
    "## Next\n",
    "\n",
    "For more, see the numpy.interp end-to-end example in the left sidebar."
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: advanced/apply_ufunc/simple_numpy_apply_ufunc.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0",
   "metadata": {
    "tags": []
   },
   "source": [
    "(gentle-intro)=\n",
    "# A gentle introduction\n",
    "\n",
    "Many, but not all, useful array methods are wrapped by Xarray and accessible\n",
    "as methods on Xarray objects. For example `DataArray.mean` calls `numpy.nanmean`.\n",
    "A very common use-case is to apply functions that expect and return NumPy \n",
    "(or other array types) on Xarray objects.  For example, this would include all of SciPy's API. \n",
    "Applying many of these functions to Xarray objects involves a series of repeated steps.\n",
    "`apply_ufunc` provides a convenient wrapper function that generalizes the steps\n",
    "involved in applying such functions to Xarray objects.\n",
    "\n",
    "```{tip}\n",
    "Xarray uses `apply_ufunc` internally to implement much of its API, meaning that it is quite powerful!\n",
    "```\n",
    "\n",
    "Our goals are to learn that `apply_ufunc` automates aspects of applying computation functions that are designed for pure arrays (like numpy arrays) on xarray objects including\n",
    "- Propagating dimension names, coordinate variables, and (optionally) attributes.\n",
    "- Handling Dataset input by looping over data variables.\n",
    "- Passing arbitrary positional and keyword arguments.\n",
    "\n",
    "\n",
    "```{tip}\n",
    "We'll reduce the length of error messages using `%xmode minimal`. See the [ipython documentation](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-xmode) for details.\n",
    "```\n",
    "\n",
    "\n",
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {},
   "outputs": [],
   "source": [
    "%xmode minimal\n",
    "\n",
    "import numpy as np\n",
    "import xarray as xr\n",
    "\n",
    "# limit the amount of information printed to screen\n",
    "xr.set_options(display_expand_data=False)\n",
    "np.set_printoptions(threshold=10, edgeitems=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2",
   "metadata": {},
   "source": [
    "Let's load a dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = xr.tutorial.load_dataset(\"air_temperature\")\n",
    "ds"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4",
   "metadata": {
    "tags": []
   },
   "source": [
    "## A simple example: pure numpy\n",
    "\n",
    "Simple functions that act independently on each value should work without any\n",
    "additional arguments. \n",
    "\n",
    "Consider the following `squared_error` function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {
    "tags": [
     "raises-exception"
    ]
   },
   "outputs": [],
   "source": [
    "def squared_error(x, y):\n",
    "    return (x - y) ** 2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6",
   "metadata": {},
   "source": [
    "````{tip}\n",
    "\n",
    "This function uses only arithmetic operations. For such simple functions, you can pass Xarray objects directly and receive Xarray objects back.\n",
    "Try\n",
    "```python\n",
    "squared_error(ds.air, 1)\n",
    "```\n",
    "\n",
    "We use it here as a very simple example\n",
    "````"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7",
   "metadata": {},
   "source": [
    "We can apply `squared_error` manually by extracting the underlying numpy array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8",
   "metadata": {},
   "outputs": [],
   "source": [
    "numpy_result = squared_error(ds.air.data, 1)\n",
    "numpy_result"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9",
   "metadata": {},
   "source": [
    "To convert this result to a DataArray, we could do it manually"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10",
   "metadata": {},
   "outputs": [],
   "source": [
    "xr.DataArray(\n",
    "    data=numpy_result,\n",
    "    # propagate all the Xarray metadata manually\n",
    "    dims=ds.air.dims,\n",
    "    coords=ds.air.coords,\n",
    "    attrs=ds.air.attrs,\n",
    "    name=ds.air.name,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "11",
   "metadata": {},
   "source": [
    "A shorter version uses [DataArray.copy](https://docs.xarray.dev/en/stable/generated/xarray.DataArray.copy.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds.air.copy(data=numpy_result)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13",
   "metadata": {
    "tags": []
   },
   "source": [
    "```{caution}\n",
    "Using `DataArray.copy` works for such simple cases but doesn't generalize that well. \n",
    "\n",
    "For example, consider a function that removed one dimension and added a new dimension.\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14",
   "metadata": {
    "tags": []
   },
   "source": [
    "## apply_ufunc\n",
    "\n",
    "`apply_ufunc` can handle more complicated functions. Here's how to use it with `squared_error`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15",
   "metadata": {},
   "outputs": [],
   "source": [
    "xr.apply_ufunc(squared_error, ds.air, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16",
   "metadata": {
    "tags": []
   },
   "source": [
    "## How does apply_ufunc work?\n",
    "\n",
    "\n",
    "This line\n",
    "```python\n",
    "xr.apply_ufunc(squared_error, ds.air, 1)\n",
    "```\n",
    "is equivalent to `squared_error(ds.air.data, 1)` with automatic propagation of xarray metadata like dimension names, coordinate values etc.\n",
    "\n",
    "\n",
    "To illustrate how `apply_ufunc` works, let us write a small wrapper function. This will let us examine what data is received and returned from the applied function. \n",
    "\n",
    "```{tip}\n",
    "This trick is very useful for debugging\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17",
   "metadata": {},
   "outputs": [],
   "source": [
    "def wrapper(x, y):\n",
    "    print(f\"received x of type {type(x)}, shape {x.shape}\")\n",
    "    print(f\"received y of type {type(y)}\")\n",
    "    return squared_error(x, y)\n",
    "\n",
    "\n",
    "xr.apply_ufunc(wrapper, ds.air, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18",
   "metadata": {
    "tags": []
   },
   "source": [
    "We see that `wrapper` receives the underlying numpy array (`ds.air.data`), and the integer `1`. \n",
    "\n",
    "Essentially, `apply_ufunc` does the following:\n",
    "1. extracts the underlying array data (`.data`), \n",
    "2. passes it to the user function, \n",
    "3. receives the returned values, and \n",
    "4. then wraps that back up as a DataArray\n",
    "\n",
    "```{tip}\n",
    "`apply_ufunc` always takes in at least one DataArray or Dataset and returns one DataArray or Dataset\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19",
   "metadata": {},
   "source": [
    "## Handling attributes\n",
    "\n",
    "By default, attributes are omitted since they may now be inaccurate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "result = xr.apply_ufunc(wrapper, ds.air, 1)\n",
    "result.attrs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21",
   "metadata": {},
   "source": [
    "To propagate attributes, pass `keep_attrs=True`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "result = xr.apply_ufunc(wrapper, ds.air, 1, keep_attrs=True)\n",
    "result.attrs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "23",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Handling datasets\n",
    "\n",
    "`apply_ufunc` easily handles both DataArrays and Datasets. \n",
    "\n",
    "When passed a Dataset, `apply_ufunc` will loop over the data variables and sequentially pass those to `squared_error`.\n",
    "\n",
    "So `squared_error` always receives a _single_ numpy array.\n",
    "\n",
    "To illustrate that, let's create a new `Dataset` with two arrays. We'll create a new array `air2` that is 2D `time, lat`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "ds2 = ds.copy()\n",
    "ds2[\"air2\"] = ds2.air.isel(lon=0) ** 2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "25",
   "metadata": {},
   "source": [
    "We see that `wrapper` is called twice"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26",
   "metadata": {},
   "outputs": [],
   "source": [
    "xr.apply_ufunc(wrapper, ds2, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27",
   "metadata": {},
   "outputs": [],
   "source": [
    "xr.apply_ufunc(squared_error, ds2, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Passing positional and keyword arguments\n",
    "\n",
    "```{seealso}\n",
    "See the Python tutorial on [defining functions](https://docs.python.org/3/tutorial/controlflow.html#defining-functions) for more on positional and keyword arguments.\n",
    "```\n",
    "\n",
    "`squared_error` takes two arguments named `x` and `y`.\n",
    "\n",
    "In `xr.apply_ufunc(squared_error, ds.air, 1)`, the value of `1` for `y` was passed positionally. \n",
    "\n",
    "To use the keyword argument form, pass it using the `kwargs` keyword argument to `apply_ufunc`\n",
    "> kwargs (dict, optional) – Optional keyword arguments passed directly on to call func."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "xr.apply_ufunc(squared_error, ds.air, kwargs={\"y\": 1})"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: advanced/backends/1.Backend_without_Lazy_Loading.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Binary data without lazy loading\n",
    "\n",
    "\n",
    "**Author**: Aureliana Barghini ([B-Open](https://www.bopen.eu/))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BackendEntrypoint\n",
    "Implement a subclass of `BackendEntrypoint` that exposes an `open_dataset` method:\n",
    "\n",
    "```python\n",
    "from xarray.backends import BackendEntrypoint\n",
    "\n",
    "class MyBackendEntrypoint(BackendEntrypoint):\n",
    "    def open_dataset(\n",
    "        self,\n",
    "        filename_or_obj,\n",
    "        *,\n",
    "        drop_variables=None,\n",
    "    ):\n",
    "\n",
    "        return my_open_dataset(filename_or_obj, drop_variables=drop_variables)\n",
    "\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BackendEntrypoint integration\n",
    "Declare this class as an external plugin in your `setup.py`:\n",
    "\n",
    "```python\n",
    "setuptools.setup(\n",
    "    entry_points={\n",
    "        'xarray.backends': ['engine_name=package.module:MyBackendEntrypoint'],\n",
    "    },\n",
    ")\n",
    "```\n",
    "or pass it in `xr.open_dataset`:\n",
    "\n",
    "```python\n",
    "xr.open_dataset(filename, engine=MyBackendEntrypoint)\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example backend for binary files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import xarray as xr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create sample files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `tofile` writes raw array bytes, so the files are opened in binary mode (\"wb\");\n",
    "# text mode may translate newline bytes on some platforms and corrupt the data.\n",
    "arr = np.arange(30000000, dtype=np.int64)\n",
    "with open(\"foo.bin\", \"wb\") as f:\n",
    "    arr.tofile(f)\n",
    "\n",
    "arr = np.arange(30000000, dtype=np.float64)\n",
    "with open(\"foo_float.bin\", \"wb\") as f:\n",
    "    arr.tofile(f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define the entrypoint\n",
    "Example of backend to open binary files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class BinaryBackend(xr.backends.BackendEntrypoint):\n",
    "    \"\"\"Minimal backend that eagerly reads a flat binary file into a Dataset.\"\"\"\n",
    "\n",
    "    def open_dataset(\n",
    "        self,\n",
    "        filename_or_obj,\n",
    "        *,\n",
    "        # accepted as part of the backend signature; not used by this minimal example\n",
    "        drop_variables=None,\n",
    "        # backend specific parameter\n",
    "        dtype=np.int64,\n",
    "    ):\n",
    "        # the file contains raw bytes, so it has to be opened in binary mode\n",
    "        with open(filename_or_obj, \"rb\") as f:\n",
    "            arr = np.fromfile(f, dtype)\n",
    "\n",
    "        # note the trailing comma: (\"x\") would just be the string \"x\"\n",
    "        var = xr.Variable(dims=(\"x\",), data=arr)\n",
    "        coords = {\"x\": np.arange(arr.size) * 10}\n",
    "        return xr.Dataset({\"foo\": var}, coords=coords)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### It Works! \n",
    "But it may be memory demanding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = xr.open_dataarray(\"foo.bin\", engine=BinaryBackend)\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = xr.open_dataarray(\"foo_float.bin\", engine=BinaryBackend, dtype=np.float64)\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.sel(x=slice(0, 100))"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: advanced/backends/2.Backend_with_Lazy_Loading.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Binary data with lazy loading\n",
    "\n",
    "**Author**: Aureliana Barghini ([B-Open](https://www.bopen.eu/))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you want to make your backend effective with big datasets, then you should\n",
    "support lazy loading. <br/>\n",
    "To do that you need to:\n",
    "-  Implement `_raw_indexing_method` for reading blocks from disk<br/><br/>\n",
    "- Implement some glue code to make it work with Xarray:<br/><br/>\n",
    "    - put your `_raw_indexing_method` in a `BackendArray` subclass <br/><br/>\n",
    "    - replace the `numpy.ndarray` inside your **dataset** with your subclass of `BackendArray`\n",
    "    \n",
    "    \n",
    "<br/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create sample files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import dask\n",
    "import numpy as np\n",
    "import xarray as xr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `tofile` writes raw array bytes, so the files are opened in binary mode (\"wb\");\n",
    "# text mode may translate newline bytes on some platforms and corrupt the data.\n",
    "arr = np.arange(30000000, dtype=np.int64)\n",
    "with open(\"foo.bin\", \"wb\") as f:\n",
    "    arr.tofile(f)\n",
    "\n",
    "arr = np.arange(30000000, dtype=np.float64)\n",
    "with open(\"foo_float.bin\", \"wb\") as f:\n",
    "    arr.tofile(f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BinaryBackendArray"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The BackendArray subclass shall implement the following methods and attributes:\n",
    "\n",
    "- `_raw_indexing_method` method, supporting **item selection** and **slicing**\n",
    "\n",
    "- `__getitem__` that wraps `_raw_indexing_method` with an xarray helper function `explicit_indexing_adapter` (threadsafe)\n",
    "\n",
    "- `shape` attribute\n",
    "\n",
    "- `dtype` attribute.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class BinaryBackendArray(xr.backends.BackendArray):\n",
    "    \"\"\"Lazily indexable 1D array whose values live in a flat binary file.\"\"\"\n",
    "\n",
    "    def __init__(\n",
    "        self,\n",
    "        filename_or_obj,\n",
    "        shape,\n",
    "        dtype,\n",
    "        lock,\n",
    "    ):\n",
    "        self.filename_or_obj = filename_or_obj\n",
    "        self.shape = shape\n",
    "        self.dtype = dtype\n",
    "        self.lock = lock\n",
    "\n",
    "    def __getitem__(self, key: tuple):\n",
    "        # delegate to xarray's adapter, declaring that only basic\n",
    "        # (integer / slice) indexing is implemented by `_raw_indexing_method`\n",
    "        return xr.core.indexing.explicit_indexing_adapter(\n",
    "            key,\n",
    "            self.shape,\n",
    "            xr.core.indexing.IndexingSupport.BASIC,\n",
    "            self._raw_indexing_method,\n",
    "        )\n",
    "\n",
    "    def _raw_indexing_method(self, key: tuple):\n",
    "        \"\"\"Read only the requested elements of the first dimension from disk.\"\"\"\n",
    "        key0 = key[0]\n",
    "        size = np.dtype(self.dtype).itemsize\n",
    "        length = self.shape[0]\n",
    "\n",
    "        if isinstance(key0, slice):\n",
    "            # resolve None / negative bounds and the step against the array length\n",
    "            selected = range(*key0.indices(length))\n",
    "            step = selected.step\n",
    "            if len(selected) == 0:\n",
    "                first, count = 0, 0\n",
    "            else:\n",
    "                # contiguous block that covers every selected element\n",
    "                first = min(selected[0], selected[-1])\n",
    "                count = abs(selected[-1] - selected[0]) + 1\n",
    "        else:\n",
    "            # single item; a negative position counts from the end\n",
    "            first = key0 + length if key0 < 0 else key0\n",
    "            count = 1\n",
    "\n",
    "        # the lock serializes file access; the file holds raw bytes, hence \"rb\"\n",
    "        with self.lock, open(self.filename_or_obj, \"rb\") as f:\n",
    "            # decode with the dtype this array was created with\n",
    "            arr = np.fromfile(f, self.dtype, offset=size * first, count=count)\n",
    "\n",
    "        if isinstance(key0, slice):\n",
    "            # apply the step (and the direction, for a negative step) in memory\n",
    "            arr = arr[::step]\n",
    "        else:\n",
    "            # integer indexing drops the dimension\n",
    "            arr = arr.squeeze()\n",
    "\n",
    "        return arr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BinaryBackend Entrypoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class BinaryBackend(xr.backends.BackendEntrypoint):\n",
    "    \"\"\"Backend that exposes a flat binary file as a lazily loaded Dataset.\"\"\"\n",
    "\n",
    "    def open_dataset(self, filename_or_obj, *, drop_variables=None, dtype=np.int64):\n",
    "        # number of elements = file size in bytes // bytes per element\n",
    "        size = np.dtype(dtype).itemsize\n",
    "        shape = os.stat(filename_or_obj).st_size // size\n",
    "\n",
    "        backend_array = BinaryBackendArray(\n",
    "            filename_or_obj=filename_or_obj,\n",
    "            shape=(shape,),\n",
    "            dtype=dtype,\n",
    "            lock=dask.utils.SerializableLock(),\n",
    "        )\n",
    "        # wrap the backend array so that values are only read when indexed\n",
    "        data = xr.core.indexing.LazilyIndexedArray(backend_array)\n",
    "\n",
    "        # note the trailing comma: (\"x\") would just be the string \"x\"\n",
    "        var = xr.Variable(dims=(\"x\",), data=data)\n",
    "        return xr.Dataset({\"foo\": var})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "## Reduced memory usage with dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = xr.open_dataarray(\"foo.bin\", engine=BinaryBackend, chunks=10000)\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.sel(x=slice(0, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.sel(x=slice(0, 10)).compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr.load()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: advanced/backends/backends.md
================================================
# Reading data using backends

## Introduction

You can [read different types of files](https://docs.xarray.dev/en/stable/user-guide/io.html) in `xr.open_dataset` by specifying the engine to be used:

```python
import xarray as xr

xr.open_dataset("my_file.grib", engine="cfgrib")
```

Navigating Xarray backends can be confusing,
so we recommend checking out [this flow chart](https://docs.xarray.dev/en/stable/user-guide/io.html)
to help you figure out which engine you need and how to use it.

You can see what backends are currently available in your working environment
with `xarray.backends.list_engines()`.

## Why use the Xarray backend API to write your own backend?

- Your users don't need to learn a new interface; they can use `xr.open_dataset` with the `engine` kwarg.
- With little extra effort you can have lazy loading with Dask. Simply implement a function for reading blocks and Xarray will manage lazy loading with Dask for you.
- It's easy to implement: using the backend API (introduced in v0.18.0), you don't need to integrate any code in Xarray.

## More Information

See the [documentation](https://docs.xarray.dev/en/stable/internals/how-to-add-new-backend.html) for more details on adding and registering a new backend.

Follow the tutorials on creating a new backend for binary files.

```{tableofcontents}

```

### Links to internal backends

- [netcdf4](https://pypi.org/project/netCDF4/) - netCDF4
- [scipy](https://scipy.org/) - netCDF3
- [zarr](https://pypi.org/project/zarr/) - Zarr
- [pydap](https://pydap.github.io/pydap/) - Data Access Protocol (DAP/DODS/OPeNDAP)
- [h5netcdf](https://h5netcdf.org/) - hdf5

### Links to external backends (not comprehensive)

- [cfgrib](https://github.com/ecmwf/cfgrib) - GRIB
- [tiledb](https://github.com/TileDB-Inc/TileDB-CF-Py) - TileDB
- [rioxarray](https://corteva.github.io/rioxarray/stable/) - GeoTIFF, JPEG-2000, ESRI-hdr, etc (via GDAL)
- [xarray-sentinel](https://github.com/bopen/xarray-sentinel) - Sentinel-1 SAFE
- ...


================================================
FILE: advanced/indexing/indexing.md
================================================
# Indexing

```{tableofcontents}

```


================================================
FILE: advanced/indexing/why-trees.md
================================================
---
jupytext:
  formats: ipynb,md:myst
  text_representation:
    extension: .md
    format_name: myst
    format_version: 0.13
    jupytext_version: 1.19.1
kernelspec:
  display_name: Python 3 (ipykernel)
  language: python
  name: python3
---

# Tree-Based Indexing

```{seealso}
[NDPointIndex](https://xarray-indexes.readthedocs.io/blocks/ndpoint.html) — use KD-trees and Ball trees with xarray's indexing system for efficient nearest-neighbor lookups on real datasets.
```

Imagine you have measurements at irregular locations and want to find the **nearest** data point to your query location.

**In this notebook you'll learn:**

- Why naive nearest-neighbor search is slow (O(n) comparisons)
- How KD-trees speed this up dramatically (O(log n) comparisons)
- Why KD-trees can give wrong answers for geographic lat/lon data
- When to use a Ball tree instead

+++

## The nearest neighbor problem in 1D

Let's start with a simple 1D example:

**The problem:** What temperature is it at 4.7 km? We need to find the nearest measurement.

```{code-cell} ipython3
---
tags: [hide-input]
---
import numpy as np
import matplotlib.pyplot as plt

# Seven temperature readings taken at irregular positions along a transect
locations = np.array([1, 3, 4, 7, 8, 9, 12])
temperatures = np.array([15, 18, 17, 22, 24, 23, 19])

# Show the stations on a number line, each labelled with its temperature
fig, ax = plt.subplots(figsize=(10, 3))
ax.scatter(locations, np.zeros_like(locations), s=100, c='blue', zorder=5)
for loc, temp in zip(locations, temperatures):
    ax.annotate(f'{temp}°', xy=(loc, 0.15), ha='center', fontsize=10)
# Axis cosmetics applied in one call: limits, label, no y ticks, title
ax.set(
    xlim=(0, 14),
    ylim=(-0.5, 0.8),
    xlabel='Location (km)',
    yticks=[],
    title='Temperature measurements at 7 irregular locations',
)
fig.tight_layout()
plt.show()
```

The naive approach checks the distance to every point:

```{code-cell} ipython3
---
tags: [hide-input]
---
# === Configuration: change this to explore different queries ===
query = 4.7

# Brute-force search: the distance to EVERY point has to be computed
fig, ax = plt.subplots(figsize=(10, 4))

# Number line with the data points (blue dots) and the query (red cross)
ax.scatter(locations, np.zeros_like(locations), s=100, c='blue', zorder=5)
ax.scatter(query, 0, s=150, c='red', marker='x', zorder=10, lw=3)
ax.axhline(0, color='black', lw=0.5, zorder=1)

# One horizontal "ruler" per data point, each on its own row so they stay readable
for i, loc in enumerate(locations):
    row = (i + 1) * 0.12
    # The ruler itself, spanning from the query to the data point
    ax.plot([query, loc], [row, row], 'gray', alpha=0.7, lw=2)
    # Short vertical caps at both ends of the ruler
    for x_end in (query, loc):
        ax.plot([x_end, x_end], [row - 0.03, row + 0.03], 'gray', alpha=0.7, lw=1)
    # Distance label centred just above the ruler
    ax.annotate(f'{abs(loc - query):.1f} km', ((query + loc)/2, row + 0.04),
                ha='center', fontsize=8, color='gray')

# Axis cosmetics applied in one call: limits, label, no y ticks, title
ax.set(
    xlim=(0, 14),
    ylim=(-0.2, 1.1),
    xlabel='Location (km)',
    yticks=[],
    title=f'Naive search: compute distance to ALL {len(locations)} points (query={query})',
)
fig.tight_layout()
plt.show()

print(f"Query: {query} km")
print(f"Nearest point: {locations[np.argmin(np.abs(locations - query))]} km (distance = {np.min(np.abs(locations - query)):.1f} km)")
print(f"Comparisons needed: {len(locations)}")
```

With 7 points this is fine, but with millions of points this becomes slow.

**The solution:** Pre-compute a tree structure that partitions the space. In 1D, this is essentially a binary search tree - each split divides the remaining points in half:

```{code-cell} ipython3
---
tags: [hide-input]
---
from scipy.spatial import KDTree
from matplotlib.patches import Rectangle

# This cell reuses `query` from the configuration line of the previous cell and
# `locations` from earlier in the notebook; change `query` there to explore a
# different search.

# Build the tree (this is the pre-computation step)
tree = KDTree(locations.reshape(-1, 1))

# Query the tree first to get the result
dist, idx = tree.query([[query]])
nearest = locations[idx[0]]

# Map from value to node name for finding the result node.
# NOTE: the diagram below is hand-drawn for exactly these seven values; a
# nearest value that is not a key of this mapping raises KeyError.
value_to_node = {1: 'LL', 3: 'L1', 4: 'LR', 7: 'root', 8: 'RL', 9: 'R1', 12: 'RR'}
found_node = value_to_node[nearest]

# Determine the search path based on query value. Each branch records the nodes
# visited, the interval left after every step, and the text of the comparison
# made at that level. The "go right" labels use ≥ because the branches test
# `query < split`, so a query equal to the split value goes right.
if query < 7:
    first_decision = f'{query} < 7? → go left'
    if query < 3:
        path_nodes = ['root', 'L1', 'LL']
        regions = [(0, 14), (0, 7), (0, 3)]
        second_decision = f'{query} < 3? → go left'
    else:
        path_nodes = ['root', 'L1', 'LR']
        regions = [(0, 14), (0, 7), (3, 7)]
        second_decision = f'{query} ≥ 3? → go right'
else:
    first_decision = f'{query} ≥ 7? → go right'
    if query < 9:
        path_nodes = ['root', 'R1', 'RL']
        regions = [(0, 14), (7, 14), (7, 9)]
        second_decision = f'{query} < 9? → go left'
    else:
        path_nodes = ['root', 'R1', 'RR']
        regions = [(0, 14), (7, 14), (9, 14)]
        second_decision = f'{query} ≥ 9? → go right'

# Create visualization: tree on left, 3 narrowing steps on right
fig = plt.figure(figsize=(16, 9))

# Left side: Tree diagram with spatial ranges
ax_tree = fig.add_subplot(1, 2, 1)
ax_tree.set_xlim(0, 16)
ax_tree.set_ylim(-0.5, 5.5)
ax_tree.axis('off')
ax_tree.set_title('KD-tree structure\n(each node shows the spatial range it covers)', fontsize=12, fontweight='bold')

# Tree node positions - now includes spatial range for each node
nodes = {
    'root': {'pos': (8, 4.5), 'value': 7, 'color': 'steelblue', 'label': 'split=7', 'range': '[0, 14]'},
    'L1': {'pos': (4, 2.6), 'value': 3, 'color': 'coral', 'label': 'split=3', 'range': '[0, 7)'},
    'R1': {'pos': (12, 2.6), 'value': 9, 'color': 'seagreen', 'label': 'split=9', 'range': '[7, 14]'},
    'LL': {'pos': (2, 0.8), 'value': 1, 'color': 'gray', 'label': '1', 'range': '[0, 3)'},
    'LR': {'pos': (6, 0.8), 'value': 4, 'color': 'gray', 'label': '4', 'range': '[3, 7)'},
    'RL': {'pos': (10, 0.8), 'value': 8, 'color': 'gray', 'label': '8', 'range': '[7, 9)'},
    'RR': {'pos': (14, 0.8), 'value': 12, 'color': 'gray', 'label': '12', 'range': '[9, 14]'},
}

# Draw edges
edges = [('root', 'L1'), ('root', 'R1'), ('L1', 'LL'), ('L1', 'LR'), ('R1', 'RL'), ('R1', 'RR')]
for parent, child in edges:
    px, py = nodes[parent]['pos']
    cx, cy = nodes[child]['pos']
    ax_tree.plot([px, cx], [py, cy], 'k-', lw=2, zorder=1)

# Draw nodes with spatial range labels
for name, node in nodes.items():
    x, y = node['pos']
    # Internal (split) nodes are drawn larger than leaf nodes
    is_split = 'split' in node['label']
    size = 2200 if is_split else 1500
    ax_tree.scatter(x, y, s=size, c=node['color'], zorder=5, edgecolors='black', linewidths=2)
    ax_tree.annotate(node['label'], (x, y), ha='center', va='center',
                     fontsize=11 if is_split else 10, fontweight='bold', color='white')
    # Add range label below each node
    ax_tree.annotate(node['range'], (x, y - 0.55), ha='center', va='top',
                     fontsize=9, color='black', style='italic',
                     bbox=dict(boxstyle='round,pad=0.2', facecolor='white', edgecolor='gray', alpha=0.8))

# Highlight the path taken (thick translucent red over the black edges)
for i in range(len(path_nodes) - 1):
    px, py = nodes[path_nodes[i]]['pos']
    cx, cy = nodes[path_nodes[i+1]]['pos']
    ax_tree.plot([px, cx], [py, cy], 'r-', lw=5, alpha=0.4, zorder=2)

# Add query annotation (arrow points at the root node, where the search starts)
ax_tree.annotate(f'query={query}', (8, 4.5), xytext=(11, 5.2),
                 fontsize=11, color='red', fontweight='bold',
                 arrowprops=dict(arrowstyle='->', color='red', lw=2))

# Mark the found node. This is the node holding the KDTree result, which is
# looked up by value and is not necessarily the last node of `path_nodes`.
found_x, found_y = nodes[found_node]['pos']
ax_tree.annotate(f'found {nearest}!', (found_x + 1.2, found_y + 0.3), fontsize=11, ha='left', color='red', fontweight='bold')

# Right side: 3 subplots showing narrowing search space.
# Each entry is (title, (start, end) of the active interval, colour, decision text).
steps = [
    ("Step 1: Start with all points", regions[0], 'steelblue', first_decision),
    ("Step 2: After first split", regions[1], 'coral', second_decision),
    (f"Step 3: Found nearest = {nearest}", regions[2], 'gold', None),
]

for i, (title, (region_start, region_end), color, annotation) in enumerate(steps):
    # Cells 2, 4, 6 of a 3x2 grid, i.e. the right-hand column
    ax = fig.add_subplot(3, 2, 2*(i+1))

    # Draw all data points
    for loc in locations:
        # Inclusive at both ends, so a point sitting exactly on a region edge
        # (a split value) is drawn as part of the region.
        # NOTE(review): the node labels in the tree use half-open ranges such
        # as [0, 7); the inclusive test is kept so edge points stay highlighted.
        in_region = region_start <= loc <= region_end
        ax.scatter(loc, 0, s=100 if in_region else 60,
                   c='blue' if in_region else 'lightgray',
                   zorder=5, edgecolors='black' if in_region else 'gray', linewidths=1)
        if in_region:
            ax.annotate(f'{loc}', (loc, -0.25), ha='center', fontsize=9, fontweight='bold')

    # Draw query point
    ax.scatter(query, 0, s=150, c='red', marker='x', zorder=10, lw=3)

    # Highlight the active region
    rect = Rectangle((region_start, -0.15), region_end - region_start, 0.3,
                      fill=True, facecolor=color, alpha=0.2, edgecolor=color, lw=2, zorder=2)
    ax.add_patch(rect)

    # Draw split lines: the root split in step 1, the second-level split in step 2
    if i == 0:
        ax.axvline(7, color='steelblue', lw=2, ls='--', alpha=0.8)
        ax.annotate('split=7', (7, 0.25), ha='center', fontsize=9, color='steelblue', fontweight='bold')
    elif i == 1:
        if query < 7:
            ax.axvline(3, color='coral', lw=2, ls='--', alpha=0.8)
            ax.annotate('split=3', (3, 0.25), ha='center', fontsize=9, color='coral', fontweight='bold')
        else:
            ax.axvline(9, color='seagreen', lw=2, ls='--', alpha=0.8)
            ax.annotate('split=9', (9, 0.25), ha='center', fontsize=9, color='seagreen', fontweight='bold')

    # Add decision annotation (the last step has none)
    if annotation:
        ax.annotate(annotation, (0.98, 0.95), xycoords='axes fraction', ha='right', va='top',
                    fontsize=10, color='darkgreen', fontweight='bold',
                    bbox=dict(boxstyle='round', facecolor='lightyellow', edgecolor='green', alpha=0.8))

    ax.set_xlim(-0.5, 14.5)
    ax.set_ylim(-0.4, 0.45)
    ax.set_title(title, fontsize=11, fontweight='bold')
    ax.set_yticks([])
    # Only the bottom panel carries the x-axis label
    if i == 2:
        ax.set_xlabel('Location (km)', fontsize=10)

plt.tight_layout()
plt.show()

print(f"Nearest point: {nearest} km")
print(f"Comparisons needed: ~{len(path_nodes)} (log₂({len(locations)}) ≈ 3)")
```

## Extending to 2D

The same idea works in higher dimensions. Now our measurements are scattered across a 2D area:

```{code-cell} ipython3
---
tags: [hide-input]
---
# 2D example: temperature measurements scattered across an area
from matplotlib.patches import Rectangle

np.random.seed(42)
points_2d = np.random.rand(20, 2) * 10  # 20 points in a 10x10 area

# === Configuration ===
query_2d = np.array([6.5, 4.0])  # Change this to query a different location

# Build tree - using leafsize=2 to demonstrate meaningful subdivision
# (default leafsize=10 would barely split with only 20 points!)
LEAFSIZE = 2
tree_2d = KDTree(points_2d, leafsize=LEAFSIZE)
dist, idx = tree_2d.query([query_2d])
nearest_2d = points_2d[idx[0]]

# With leafsize=2, we get ~4 levels of splits (log2(20/2) ≈ 3-4)
# Let's show the first 2 splits conceptually, then the final leaf comparison

# Approximate the splits (KD-tree alternates x, y, x, y...)
x_split = np.median(points_2d[:, 0])  # ~4.0

# Determine which half based on query x
if query_2d[0] >= x_split:
    # Right half
    half_points = points_2d[points_2d[:, 0] >= x_split]
    x_decision = f"x={query_2d[0]} > {x_split:.1f}? → go right"
    x_region = (x_split, 0, 10, 10)  # (x_min, y_min, x_max, y_max)
else:
    # Left half
    half_points = points_2d[points_2d[:, 0] < x_split]
    x_decision = f"x={query_2d[0]} < {x_split:.1f}? → go left"
    x_region = (0, 0, x_split, 10)

y_split = np.median(half_points[:, 1])

# Determine which quadrant based on query y
if query_2d[1] >= y_split:
    # Upper region
    y_decision = f"y={query_2d[1]} > {y_split:.1f}? → go up"
    if query_2d[0] >= x_split:
        final_region = (x_split, y_split, 10, 10)  # top-right
    else:
        final_region = (0, y_split, x_split, 10)  # top-left
else:
    # Lower region
    y_decision = f"y={query_2d[1]} < {y_split:.1f}? → go down"
    if query_2d[0] >= x_split:
        final_region = (x_split, 0, 10, y_split)  # bottom-right
    else:
        final_region = (0, 0, x_split, y_split)  # bottom-left

# Define regions for visualization
regions = [
    (0, 0, 10, 10),      # Step 1: all points
    x_region,             # Step 2: half based on x
    final_region,         # Step 3: quadrant based on y
]

# Get actual points in final region (these are the leaf candidates)
x_min, y_min, x_max, y_max = final_region
final_candidates = [pt for pt in points_2d
                    if x_min <= pt[0] <= x_max and y_min <= pt[1] <= y_max]

# Create figure
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

step_titles = [
    "Step 1: Start with all 20 points",
    f"Step 2: Split on x ≈ {x_split:.1f}",
    f"Step 3: Split on y ≈ {y_split:.1f}",
    f"Step 4: Compare {len(final_candidates)} candidates in leaf"
]
step_colors = ['steelblue', 'coral', 'gold', 'limegreen']
decisions = [x_decision, y_decision, None, None]

for i, ax in enumerate(axes):
    x_min, y_min, x_max, y_max = regions[min(i, 2)]

    # Get points in current region
    points_in_region = [(pt, x_min <= pt[0] <= x_max and y_min <= pt[1] <= y_max) for pt in points_2d]

    # Draw all points
    for pt, in_region in points_in_region:
        ax.scatter(pt[0], pt[1], s=80 if in_region else 40,
                   c='blue' if in_region else 'lightgray',
                   edgecolors='black' if in_region else 'gray',
                   zorder=5, linewidths=1)

    # Draw query point
    ax.scatter(*query_2d, s=150, c='red', marker='x', zorder=10, lw=3)

    # Draw the active region
    rect = Rectangle((x_min, y_min), x_max - x_min, y_max - y_min,
                      fill=True, facecolor=step_colors[i], alpha=0.15,
                      edgecolor=step_colors[i], lw=2, zorder=2)
    ax.add_patch(rect)

    # Draw split lines
    if i >= 1:
        ax.axvline(x_split, color='steelblue', lw=2, ls='--', alpha=0.8)
        ax.annotate(f'x={x_split:.1f}', (x_split, 9.7), ha='center', fontsize=9,
                    color='steelblue', fontweight='bold')
    if i >= 2:
        # Only draw y split line in the relevant half
        if query_2d[0] >= x_split:
            ax.axhline(y_split, xmin=x_split/10, xmax=1, color='coral', lw=2, ls='--', alpha=0.8)
        else:
            ax.axhline(y_split, xmin=0, xmax=x_split/10, color='coral', lw=2, ls='--', alpha=0.8)
        ax.annotate(f'y={y_split:.1f}', (9.7, y_split + 0.2),
                    ha='right', va='bottom', fontsize=9, color='co

Download .txt

gitextract_fn454f6u/

├── .binder/
│   └── environment.yml
├── .devcontainer/
│   ├── Dockerfile
│   ├── devcontainer.json
│   ├── scipy2023/
│   │   ├── devcontainer.json
│   │   ├── jupyter_lab_config.py
│   │   └── tasks.json
│   ├── scipy2024/
│   │   ├── devcontainer.json
│   │   ├── jupyter_lab_config.py
│   │   └── tasks.json
│   └── scipy2025/
│       ├── Dockerfile
│       └── devcontainer.json
├── .gitattributes
├── .github/
│   ├── actions/
│   │   └── setup-pixi/
│   │       └── action.yml
│   ├── dependabot.yml
│   └── workflows/
│       ├── main.yaml
│       ├── nocache.yaml
│       ├── pull_request.yaml
│       ├── qaqc.yaml
│       └── surge_preview.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierrc.toml
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── _config.yml
├── _static/
│   └── style.css
├── _toc.yml
├── advanced/
│   ├── accessors/
│   │   ├── 01_accessor_examples.ipynb
│   │   └── accessors.md
│   ├── apply_ufunc/
│   │   ├── apply_ufunc.md
│   │   ├── automatic-vectorizing-numpy.ipynb
│   │   ├── complex-output-numpy.ipynb
│   │   ├── core-dimensions.ipynb
│   │   ├── dask_apply_ufunc.ipynb
│   │   ├── example-interp.ipynb
│   │   ├── numba-vectorization.ipynb
│   │   └── simple_numpy_apply_ufunc.ipynb
│   ├── backends/
│   │   ├── 1.Backend_without_Lazy_Loading.ipynb
│   │   ├── 2.Backend_with_Lazy_Loading.ipynb
│   │   └── backends.md
│   ├── indexing/
│   │   ├── indexing.md
│   │   └── why-trees.md
│   ├── map_blocks/
│   │   ├── map_blocks.md
│   │   └── simple_map_blocks.ipynb
│   └── parallel-intro.md
├── fundamentals/
│   ├── 01.1_creating_data_structures.ipynb
│   ├── 01.1_io.ipynb
│   ├── 01_data_structures.md
│   ├── 01_datastructures.ipynb
│   ├── 01_datatree_hierarchical_data.ipynb
│   ├── 02.1_indexing_Basic.ipynb
│   ├── 02.2_manipulating_dimensions.ipynb
│   ├── 02.3_aligning_data_objects.ipynb
│   ├── 02_labeled_data.md
│   ├── 03.1_computation_with_xarray.ipynb
│   ├── 03.2_groupby_with_xarray.ipynb
│   ├── 03.3_windowed.ipynb
│   ├── 03.4_weighted.ipynb
│   ├── 03_computation.md
│   ├── 04.0_plotting.md
│   ├── 04.1_basic_plotting.ipynb
│   ├── 04.2_faceting.ipynb
│   ├── 04.3_geographic_plotting.ipynb
│   ├── 05_intro_to_dask.ipynb
│   └── README.md
├── intermediate/
│   ├── BiologyDataset.ipynb
│   ├── computation/
│   │   ├── 01-high-level-computation-patterns.ipynb
│   │   ├── hierarchical_computation.ipynb
│   │   └── index.md
│   ├── data_cleaning/
│   │   ├── 05.1_intro.md
│   │   ├── 05.2_examples.md
│   │   ├── 05.3_ice_velocity.ipynb
│   │   ├── 05.4_contributing.md
│   │   ├── 05.5_scipy_talk.md
│   │   └── 05_data_cleaning.md
│   ├── datastructures-intermediate.ipynb
│   ├── hvplot.ipynb
│   ├── indexing/
│   │   ├── advanced-indexing.ipynb
│   │   ├── boolean-masking-indexing.ipynb
│   │   └── indexing.md
│   ├── intro-to-zarr.ipynb
│   ├── remote_data/
│   │   ├── cmip6-cloud.ipynb
│   │   ├── index.md
│   │   └── remote-data.ipynb
│   ├── storage_formats.ipynb
│   ├── xarray_and_dask.ipynb
│   └── xarray_ecosystem.ipynb
├── intro.md
├── overview/
│   ├── fundamental-path/
│   │   ├── README.md
│   │   └── index.ipynb
│   ├── get-started.md
│   ├── intermediate-path/
│   │   ├── README.md
│   │   └── index.ipynb
│   ├── learning-paths.md
│   └── xarray-in-45-min.ipynb
├── pyproject.toml
├── reference/
│   ├── glossary.md
│   ├── references.bib
│   └── resources.md
└── workshops/
    ├── oceanhackweek2020/
    │   └── README.md
    ├── online-tutorial-series/
    │   ├── 01_xarray_fundamentals.ipynb
    │   ├── 02_indexing.ipynb
    │   ├── 03_computation.ipynb
    │   └── README.md
    ├── scipy2023/
    │   ├── README.md
    │   └── index.ipynb
    ├── scipy2024/
    │   └── index.ipynb
    ├── scipy2025/
    │   └── index.ipynb
    └── thinking-like-xarray/
        └── README.md

Download .json

Condensed preview — 110 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (777K chars).

[
  {
    "path": ".binder/environment.yml",
    "chars": 1129,
    "preview": "name: default\nchannels:\n  - conda-forge\n  - nodefaults\ndependencies:\n  - jupyter-book >=1.0.4.post1,<2\n  - pre-commit >="
  },
  {
    "path": ".devcontainer/Dockerfile",
    "chars": 461,
    "preview": "FROM mcr.microsoft.com/devcontainers/base:noble\n\nARG PIXI_VERSION=v0.49.0\n\nRUN curl -L -o /usr/local/bin/pixi -fsSL --co"
  },
  {
    "path": ".devcontainer/devcontainer.json",
    "chars": 1231,
    "preview": "// https://pixi.sh/latest/integration/editor/vscode/#devcontainer-extension\n{\n  \"name\": \"xarray-tutorial\",\n  \"build\": {\n"
  },
  {
    "path": ".devcontainer/scipy2023/devcontainer.json",
    "chars": 671,
    "preview": "{\n  \"image\": \"quay.io/pangeo/pangeo-notebook:2023.07.05\",\n  \"postCreateCommand\": {\n    \"jupyterlab\": \"mkdir /home/jovyan"
  },
  {
    "path": ".devcontainer/scipy2023/jupyter_lab_config.py",
    "chars": 123,
    "preview": "c = get_config()  # noqa\nc.LabApp.default_url = '/lab/tree/workshops/scipy2023/index.ipynb'\nc.ServerApp.allow_origin = '"
  },
  {
    "path": ".devcontainer/scipy2023/tasks.json",
    "chars": 301,
    "preview": "{\n  \"version\": \"2.0.0\",\n  \"tasks\": [\n    {\n      \"label\": \"jupyterlab\",\n      \"type\": \"shell\",\n      \"command\": \"/srv/co"
  },
  {
    "path": ".devcontainer/scipy2024/devcontainer.json",
    "chars": 673,
    "preview": "{\n  \"image\": \"quay.io/pangeo/pangeo-notebook:2024.07.08\",\n  \"postCreateCommand\": {\n    \"jupyterlab\": \"mkdir /home/jovyan"
  },
  {
    "path": ".devcontainer/scipy2024/jupyter_lab_config.py",
    "chars": 123,
    "preview": "c = get_config()  # noqa\nc.LabApp.default_url = '/lab/tree/workshops/scipy2024/index.ipynb'\nc.ServerApp.allow_origin = '"
  },
  {
    "path": ".devcontainer/scipy2024/tasks.json",
    "chars": 301,
    "preview": "{\n  \"version\": \"2.0.0\",\n  \"tasks\": [\n    {\n      \"label\": \"jupyterlab\",\n      \"type\": \"shell\",\n      \"command\": \"/srv/co"
  },
  {
    "path": ".devcontainer/scipy2025/Dockerfile",
    "chars": 461,
    "preview": "FROM mcr.microsoft.com/devcontainers/base:noble\n\nARG PIXI_VERSION=v0.49.0\n\nRUN curl -L -o /usr/local/bin/pixi -fsSL --co"
  },
  {
    "path": ".devcontainer/scipy2025/devcontainer.json",
    "chars": 1245,
    "preview": "// https://pixi.sh/latest/integration/editor/vscode/#devcontainer-extension\n{\n  \"name\": \"scipy2025-xarray-tutorial\",\n  \""
  },
  {
    "path": ".gitattributes",
    "chars": 122,
    "preview": "# SCM syntax highlighting & preventing 3-way merges\npixi.lock merge=binary linguist-language=YAML linguist-generated=tru"
  },
  {
    "path": ".github/actions/setup-pixi/action.yml",
    "chars": 266,
    "preview": "name: \"Setup Pixi\"\ndescription: \"Create Python environment for GitHub Action Job\"\n\nruns:\n  using: \"composite\"\n  steps:\n "
  },
  {
    "path": ".github/dependabot.yml",
    "chars": 175,
    "preview": "# Regularly update Docker tags and Actions steps\nversion: 2\nupdates:\n  - package-ecosystem: \"github-actions\"\n    directo"
  },
  {
    "path": ".github/workflows/main.yaml",
    "chars": 1737,
    "preview": "name: Deploy Website to GitHub Pages\n\non:\n  push:\n    branches: main\n    paths-ignore:\n      - \".devcontainer/**\"\n\n# Set"
  },
  {
    "path": ".github/workflows/nocache.yaml",
    "chars": 1012,
    "preview": "name: Rebuild Entire Jupyter Book on all Platforms\n\non:\n  workflow_dispatch:\n\n# Allow one concurrent deployment\nconcurre"
  },
  {
    "path": ".github/workflows/pull_request.yaml",
    "chars": 1220,
    "preview": "name: Pull Request Build\n\non:\n  pull_request:\n    types: [opened, synchronize, reopened, closed]\n    paths-ignore:\n     "
  },
  {
    "path": ".github/workflows/qaqc.yaml",
    "chars": 1406,
    "preview": "name: QualityContol\n\non:\n  workflow_dispatch:\n  pull_request:\n    branches:\n      - main\n    paths-ignore:\n      - \".dev"
  },
  {
    "path": ".github/workflows/surge_preview.yml",
    "chars": 1268,
    "preview": "name: Pull Request Preview\n\non:\n  workflow_run:\n    workflows: [\"Pull Request Build\"]\n    types:\n      - completed\n\nperm"
  },
  {
    "path": ".gitignore",
    "chars": 1464,
    "preview": "# project/repo specific\nconf.py\nadvanced/backends/*.bin\nscipy-tutorial/dask-report-large-chunk.html\nmydask.png\ndask-repo"
  },
  {
    "path": ".pre-commit-config.yaml",
    "chars": 1136,
    "preview": "ci:\n  autoupdate_schedule: monthly\n\nrepos:\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v6.0.0\n    "
  },
  {
    "path": ".prettierrc.toml",
    "chars": 45,
    "preview": "tabWidth = 2\nsemi = false\nsingleQuote = true\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "chars": 3745,
    "preview": "# Contributing Guide\n\nThis tutorial repository is a great opportunity to start contributing to Xarray.\n\n- Report bugs, r"
  },
  {
    "path": "LICENSE",
    "chars": 11359,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "README.md",
    "chars": 3550,
    "preview": "# Xarray Tutorial\n\n[![Deploy Website to GitHub Pages](https://github.com/xarray-contrib/xarray-tutorial/actions/workflow"
  },
  {
    "path": "_config.yml",
    "chars": 4188,
    "preview": "# Learn more at https://jupyterbook.org/customize/config.html\ntitle: \"\"\nauthor: The Xarray Community\ncopyright: \"2025\"\nl"
  },
  {
    "path": "_static/style.css",
    "chars": 473,
    "preview": ".bd-header-announcement {\n  background-color: var(--pst-color-accent);\n}\n\n/* workaround Pydata Sphinx theme using light "
  },
  {
    "path": "_toc.yml",
    "chars": 4818,
    "preview": "# Learn more at https://jupyterbook.org/customize/toc.html\nroot: intro\nformat: jb-book\nparts:\n  - caption: Overview\n    "
  },
  {
    "path": "advanced/accessors/01_accessor_examples.ipynb",
    "chars": 16454,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Creating custom accessors\"\n   ]\n "
  },
  {
    "path": "advanced/accessors/accessors.md",
    "chars": 26,
    "preview": "```{tableofcontents}\n\n```\n"
  },
  {
    "path": "advanced/apply_ufunc/apply_ufunc.md",
    "chars": 41,
    "preview": "# apply_ufunc\n\n```{tableofcontents}\n\n```\n"
  },
  {
    "path": "advanced/apply_ufunc/automatic-vectorizing-numpy.ipynb",
    "chars": 11932,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"source\": [\n    \"(v"
  },
  {
    "path": "advanced/apply_ufunc/complex-output-numpy.ipynb",
    "chars": 11099,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"source\": [\n    \"(c"
  },
  {
    "path": "advanced/apply_ufunc/core-dimensions.ipynb",
    "chars": 9464,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"source\": [\n    \"# "
  },
  {
    "path": "advanced/apply_ufunc/dask_apply_ufunc.ipynb",
    "chars": 27736,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"source\": [\n    \"# "
  },
  {
    "path": "advanced/apply_ufunc/example-interp.ipynb",
    "chars": 27977,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"source\": [\n    \"# np.interp : An"
  },
  {
    "path": "advanced/apply_ufunc/numba-vectorization.ipynb",
    "chars": 7696,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"source\": [\n    \"# "
  },
  {
    "path": "advanced/apply_ufunc/simple_numpy_apply_ufunc.ipynb",
    "chars": 10569,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"source\": [\n    \"(g"
  },
  {
    "path": "advanced/backends/1.Backend_without_Lazy_Loading.ipynb",
    "chars": 4044,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Binary data without lazy loading\\"
  },
  {
    "path": "advanced/backends/2.Backend_with_Lazy_Loading.ipynb",
    "chars": 5905,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Binary data with lazy loading\\n\","
  },
  {
    "path": "advanced/backends/backends.md",
    "chars": 2005,
    "preview": "# Reading data using backends\n\n## Introduction\n\nYou can [read different types of files](https://docs.xarray.dev/en/stabl"
  },
  {
    "path": "advanced/indexing/indexing.md",
    "chars": 38,
    "preview": "# Indexing\n\n```{tableofcontents}\n\n```\n"
  },
  {
    "path": "advanced/indexing/why-trees.md",
    "chars": 25219,
    "preview": "---\njupytext:\n  formats: ipynb,md:myst\n  text_representation:\n    extension: .md\n    format_name: myst\n    format_versio"
  },
  {
    "path": "advanced/map_blocks/map_blocks.md",
    "chars": 40,
    "preview": "# map_blocks\n\n```{tableofcontents}\n\n```\n"
  },
  {
    "path": "advanced/map_blocks/simple_map_blocks.ipynb",
    "chars": 7860,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"source\": [\n    \"# "
  },
  {
    "path": "advanced/parallel-intro.md",
    "chars": 1721,
    "preview": "# Parallelizing custom functions\n\nAlmost all of xarray’s built-in operations work on Dask arrays.\n\nSometimes analysis ca"
  },
  {
    "path": "fundamentals/01.1_creating_data_structures.ipynb",
    "chars": 15094,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Creating Data Struc"
  },
  {
    "path": "fundamentals/01.1_io.ipynb",
    "chars": 8248,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Reading and writing"
  },
  {
    "path": "fundamentals/01_data_structures.md",
    "chars": 3575,
    "preview": "# Data Structures\n\nMulti-dimensional (a.k.a. N-dimensional, ND) arrays (sometimes called “tensors”)\nare an essential par"
  },
  {
    "path": "fundamentals/01_datastructures.ipynb",
    "chars": 13254,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Xarray's Data structures\\n\",\n    "
  },
  {
    "path": "fundamentals/01_datatree_hierarchical_data.ipynb",
    "chars": 5224,
    "preview": "{\n \"cells\": [\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# `xarray.Dat"
  },
  {
    "path": "fundamentals/02.1_indexing_Basic.ipynb",
    "chars": 20781,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Indexing and Selecting Data\\n\",\n "
  },
  {
    "path": "fundamentals/02.2_manipulating_dimensions.ipynb",
    "chars": 2648,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Manipulating Dimensions (Data Res"
  },
  {
    "path": "fundamentals/02.3_aligning_data_objects.ipynb",
    "chars": 16609,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Computing with Multiple Objects\\n"
  },
  {
    "path": "fundamentals/02_labeled_data.md",
    "chars": 42,
    "preview": "# Labeled data\n\n```{tableofcontents}\n\n```\n"
  },
  {
    "path": "fundamentals/03.1_computation_with_xarray.ipynb",
    "chars": 7919,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"(fundamentals/basic-computation)=\\n"
  },
  {
    "path": "fundamentals/03.2_groupby_with_xarray.ipynb",
    "chars": 17396,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Grouped Computations\\n\",\n    \"\\n\""
  },
  {
    "path": "fundamentals/03.3_windowed.ipynb",
    "chars": 11480,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Windowed Computatio"
  },
  {
    "path": "fundamentals/03.4_weighted.ipynb",
    "chars": 4412,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {\n    \"tags\": []\n   },\n   \"source\": [\n    \"# "
  },
  {
    "path": "fundamentals/03_computation.md",
    "chars": 41,
    "preview": "# Computation\n\n```{tableofcontents}\n\n```\n"
  },
  {
    "path": "fundamentals/04.0_plotting.md",
    "chars": 56,
    "preview": "# Plotting and Visualization\n\n```{tableofcontents}\n\n```\n"
  },
  {
    "path": "fundamentals/04.1_basic_plotting.ipynb",
    "chars": 10488,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {\n    \"toc\": true\n   },\n   \"source\": [\n    \"# Basic Visuali"
  },
  {
    "path": "fundamentals/04.2_faceting.ipynb",
    "chars": 6701,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Faceting\\n\",\n    \"\\"
  },
  {
    "path": "fundamentals/04.3_geographic_plotting.ipynb",
    "chars": 3341,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Geography with Cart"
  },
  {
    "path": "fundamentals/05_intro_to_dask.ipynb",
    "chars": 16573,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Introduction to Dask\\n\",\n    \"\\n\""
  },
  {
    "path": "fundamentals/README.md",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "intermediate/BiologyDataset.ipynb",
    "chars": 2839,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Datasets in other f"
  },
  {
    "path": "intermediate/computation/01-high-level-computation-patterns.ipynb",
    "chars": 43352,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {\n    \"slideshow\": {\n     \"slide_type\": \"slid"
  },
  {
    "path": "intermediate/computation/hierarchical_computation.ipynb",
    "chars": 7896,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Hierarchical comput"
  },
  {
    "path": "intermediate/computation/index.md",
    "chars": 42,
    "preview": "# Computations\n\n```{tableofcontents}\n\n```\n"
  },
  {
    "path": "intermediate/data_cleaning/05.1_intro.md",
    "chars": 5929,
    "preview": "# Data Tidying\n\nArray data that are represented by Xarray objects are often multivariate, multi-dimensional, and very co"
  },
  {
    "path": "intermediate/data_cleaning/05.2_examples.md",
    "chars": 1339,
    "preview": "# Examples\n\nThis page contains examples of 'tidying' datasets. If you have an example you'd like to submit, or an exampl"
  },
  {
    "path": "intermediate/data_cleaning/05.3_ice_velocity.ipynb",
    "chars": 13817,
    "preview": "{\n \"cells\": [\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    "
  },
  {
    "path": "intermediate/data_cleaning/05.4_contributing.md",
    "chars": 696,
    "preview": "# Contributing\n\nThis project is an evolving community effort. **We want to hear from you!**. Many workflows involve some"
  },
  {
    "path": "intermediate/data_cleaning/05.5_scipy_talk.md",
    "chars": 528,
    "preview": "# Presentations\n\n## SciPy 2023\n\nThis project was initially presented at the 2023 SciPy conference in Austin, TX. You can"
  },
  {
    "path": "intermediate/data_cleaning/05_data_cleaning.md",
    "chars": 42,
    "preview": "# Data Tidying\n\n```{tableofcontents}\n\n```\n"
  },
  {
    "path": "intermediate/datastructures-intermediate.ipynb",
    "chars": 17269,
    "preview": "{\n \"cells\": [\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Intermediat"
  },
  {
    "path": "intermediate/hvplot.ipynb",
    "chars": 3953,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Interactive plots u"
  },
  {
    "path": "intermediate/indexing/advanced-indexing.ipynb",
    "chars": 14443,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Advanced Indexing\\n\",\n    \"\\n\",\n "
  },
  {
    "path": "intermediate/indexing/boolean-masking-indexing.ipynb",
    "chars": 14127,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Boolean Indexing & Masking\\n\",\n  "
  },
  {
    "path": "intermediate/indexing/indexing.md",
    "chars": 38,
    "preview": "# Indexing\n\n```{tableofcontents}\n\n```\n"
  },
  {
    "path": "intermediate/intro-to-zarr.ipynb",
    "chars": 20527,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Introduction to Zar"
  },
  {
    "path": "intermediate/remote_data/cmip6-cloud.ipynb",
    "chars": 8043,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"(cmip6-cloud)=\\n\",\n  "
  },
  {
    "path": "intermediate/remote_data/index.md",
    "chars": 41,
    "preview": "# Remote Data\n\n```{tableofcontents}\n\n```\n"
  },
  {
    "path": "intermediate/remote_data/remote-data.ipynb",
    "chars": 17198,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Access Patterns to "
  },
  {
    "path": "intermediate/storage_formats.ipynb",
    "chars": 10708,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Hierarchical storage formats\\n\",\n"
  },
  {
    "path": "intermediate/xarray_and_dask.ipynb",
    "chars": 16791,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Parallel computing with Dask\\n\",\n"
  },
  {
    "path": "intermediate/xarray_ecosystem.ipynb",
    "chars": 20015,
    "preview": "{\n \"cells\": [\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# A Tour of X"
  },
  {
    "path": "intro.md",
    "chars": 1691,
    "preview": "# Welcome to the Xarray Tutorial!\n\n**[`Xarray`](https://xarray.dev) is an open source project and Python package that ma"
  },
  {
    "path": "overview/fundamental-path/README.md",
    "chars": 1510,
    "preview": "(fundamental-path)=\n\n# Fundamental Path\n\nThis syllabus comes from Scipy 2022 tutorial workshop.\nThis path covers fundame"
  },
  {
    "path": "overview/fundamental-path/index.ipynb",
    "chars": 2796,
    "preview": "{\n \"cells\": [\n  {\n   \"attachments\": {},\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    "
  },
  {
    "path": "overview/get-started.md",
    "chars": 4044,
    "preview": "<img src=\"https://docs.xarray.dev/en/stable/_static/Xarray_Logo_RGB_Final.svg\" align=\"right\" width=\"30%\">\n\n(get-started)"
  },
  {
    "path": "overview/intermediate-path/README.md",
    "chars": 1190,
    "preview": "# Intermediate Path\n\nThis learning path was presented at SciPy 2023. It covers intermediate\nand more advanced topics and"
  },
  {
    "path": "overview/intermediate-path/index.ipynb",
    "chars": 2621,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"<img src=\\\"../../imag"
  },
  {
    "path": "overview/learning-paths.md",
    "chars": 327,
    "preview": "# Learning Paths\n\nLearning Paths are guided tours through the material on the site.\nThese paths were originally built fo"
  },
  {
    "path": "overview/xarray-in-45-min.ipynb",
    "chars": 35954,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"<img src=\\\"https://docs.xarray.dev/"
  },
  {
    "path": "pyproject.toml",
    "chars": 3159,
    "preview": "[project]\nname = \"xarray-tutorial\"\ndescription = \"Xarray Tutorial Website\"\nreadme = \"README.md\"\nlicense.file = \"LICENSE\""
  },
  {
    "path": "reference/glossary.md",
    "chars": 1016,
    "preview": "# Glossary\n\nFor Xarray data structure terminology see https://docs.xarray.dev/en/stable/user-guide/terminology.html\n\n```"
  },
  {
    "path": "reference/references.bib",
    "chars": 645,
    "preview": "@article{hoyerhamman2017,\n  title     = {xarray: {N-D} labeled arrays and datasets in {Python}},\n  author    = {Hoyer, S"
  },
  {
    "path": "reference/resources.md",
    "chars": 2101,
    "preview": "# Keep Exploring!\n\nTo help you go deeper, we've also create a list of notebooks that\ndemonstrate real-world applications"
  },
  {
    "path": "workshops/oceanhackweek2020/README.md",
    "chars": 503,
    "preview": "# Oceanhackweek 2020\n\nPresented August 2020 at [OceanHackWeek](https://oceanhackweek.github.io) by Deepak Cheerian\n\nThis"
  },
  {
    "path": "workshops/online-tutorial-series/01_xarray_fundamentals.ipynb",
    "chars": 12156,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Xarray Fundamentals\\n\"\n   ]\n  },\n"
  },
  {
    "path": "workshops/online-tutorial-series/02_indexing.ipynb",
    "chars": 7308,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Indexing and Selecting Data\\n\"\n  "
  },
  {
    "path": "workshops/online-tutorial-series/03_computation.ipynb",
    "chars": 7540,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Computation\\n\"\n   ]\n  },\n  {\n   \""
  },
  {
    "path": "workshops/online-tutorial-series/README.md",
    "chars": 537,
    "preview": "# Xarray Online Tutorial 2020\n\nPresented October 6 2020 by:\n\n- Anderson Banihirwe (NCAR)\n- Deepak Cherian (NCAR)\n- Marti"
  },
  {
    "path": "workshops/scipy2023/README.md",
    "chars": 3120,
    "preview": "# SciPy 2023\n\n## Xarray: Friendly, Interactive, and Scalable Scientific Data Analysis\n\nOrganized by:\n\n- Deepak Cherian ("
  },
  {
    "path": "workshops/scipy2023/index.ipynb",
    "chars": 3016,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"<img src=\\\"../../imag"
  },
  {
    "path": "workshops/scipy2024/index.ipynb",
    "chars": 5493,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# SciPy 2024\\n\",\n    "
  },
  {
    "path": "workshops/scipy2025/index.ipynb",
    "chars": 3105,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"0\",\n   \"metadata\": {},\n   \"source\": [\n    \"# SciPy 2025\\n\",\n    "
  },
  {
    "path": "workshops/thinking-like-xarray/README.md",
    "chars": 443,
    "preview": "# Thinking like Xarray 2022\n\nPresented March 2022 for the [NCAR Python Seminar Series](https://ncar.github.io/esds/posts"
  }
]

About this extraction

This page contains the full source code of the xarray-contrib/xarray-tutorial GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 110 files (682.0 KB), approximately 202.9k tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo