Repository: ToucanToco/fastexcel
Branch: main
Commit: 98bf33293c85
Files: 99
Total size: 46.9 MB

Directory structure:
gitextract_ze2pys5u/

├── .clippy.toml
├── .github/
│   ├── dependabot.yml
│   └── workflows/
│       ├── CI.yml
│       ├── docs.yml
│       └── release.yml
├── .gitignore
├── .pre-commit-config.yaml
├── Cargo.toml
├── LICENSE
├── Makefile
├── README.md
├── doc-templates/
│   └── module.html.jinja2
├── pyproject.toml
├── python/
│   ├── fastexcel/
│   │   ├── __init__.py
│   │   ├── _fastexcel.pyi
│   │   └── py.typed
│   └── tests/
│       ├── __init__.py
│       ├── benchmarks/
│       │   ├── README.md
│       │   ├── fixtures/
│       │   │   ├── formulas.xlsx
│       │   │   ├── plain_data.xls
│       │   │   └── plain_data.xlsx
│       │   ├── memory.py
│       │   ├── readers.py
│       │   └── speed.py
│       ├── conftest.py
│       ├── test_alias_generation.py
│       ├── test_column_selection.py
│       ├── test_defined_names.py
│       ├── test_dtypes.py
│       ├── test_durations.py
│       ├── test_eagerness.py
│       ├── test_empty.py
│       ├── test_errors.py
│       ├── test_fastexcel.py
│       ├── test_pycapsule.py
│       ├── test_sheet_visibility.py
│       ├── test_shifted_data.py
│       ├── test_tables.py
│       ├── test_whitespace.py
│       └── utils.py
├── scripts/
│   └── update_versions.py
├── src/
│   ├── data/
│   │   ├── cell_extractors.rs
│   │   ├── mod.rs
│   │   ├── python.rs
│   │   └── rust.rs
│   ├── error.rs
│   ├── lib.rs
│   ├── types/
│   │   ├── dtype/
│   │   │   ├── mod.rs
│   │   │   └── python.rs
│   │   ├── excelreader/
│   │   │   ├── mod.rs
│   │   │   └── python.rs
│   │   ├── excelsheet/
│   │   │   ├── column_info/
│   │   │   │   ├── mod.rs
│   │   │   │   └── python.rs
│   │   │   ├── mod.rs
│   │   │   ├── polars.rs
│   │   │   ├── python.rs
│   │   │   └── table.rs
│   │   ├── exceltable/
│   │   │   ├── mod.rs
│   │   │   └── python.rs
│   │   ├── idx_or_name/
│   │   │   ├── mod.rs
│   │   │   └── python.rs
│   │   └── mod.rs
│   └── utils/
│       ├── mod.rs
│       └── schema.rs
├── test.py
└── tests/
    ├── column_selection.rs
    ├── fastexcel.rs
    ├── fixtures/
    │   ├── dates.ods
    │   ├── decimal-numbers.xlsx
    │   ├── div0.xlsx
    │   ├── empty.ods
    │   ├── empty.xlsx
    │   ├── fixture-changing-header-location.xlsx
    │   ├── fixture-invalid-cell-value-num.xlsx
    │   ├── fixture-invalid-cell-value.xlsx
    │   ├── fixture-multi-dtypes-columns.xlsx
    │   ├── fixture-multi-sheet.xlsx
    │   ├── fixture-sheets-different-visibilities.xlsx
    │   ├── fixture-single-sheet-duplicated-columns.xlsx
    │   ├── fixture-single-sheet-with-types.xlsx
    │   ├── fixture-single-sheet.xlsx
    │   ├── fixture-type-errors.xlsx
    │   ├── infer-dtypes-fallback.xlsx
    │   ├── no-header.xlsx
    │   ├── null-bytes-in-columns-names.xls
    │   ├── null-column.xlsx
    │   ├── sheet-and-table-with-offset.xlsx
    │   ├── sheet-and-table-with-whitespace.xlsx
    │   ├── sheet-null-strings-empty.xlsx
    │   ├── sheet-null-strings.xlsx
    │   ├── sheet-with-defined-names.xlsx
    │   ├── sheet-with-na.xlsx
    │   ├── sheet-with-tables.xlsx
    │   └── single-sheet-skip-rows-durations.xlsx
    ├── sheet_visibility.rs
    ├── shifted_data.rs
    ├── tables.rs
    ├── utils/
    │   └── mod.rs
    └── whitespace.rs

================================================
FILE CONTENTS
================================================

================================================
FILE: .clippy.toml
================================================
disallowed-macros = [
  { path = "std::assert_ne", reason = "use `pretty_assertions::assert_ne` instead" },
  { path = "std::assert_eq", reason = "use `pretty_assertions::assert_eq` instead" },
  { path = "std::assert_matches", reason = "use `pretty_assertions::assert_matches` instead" },
]


================================================
FILE: .github/dependabot.yml
================================================
version: 2
updates:
  # python
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "daily"
    labels:
      - "dependencies"
      - ":snake: python :snake:"
  # rust
  - package-ecosystem: "cargo"
    directory: "/"
    schedule:
      interval: "daily"
    groups:
      prod-deps:
        dependency-type: "production"
      dev-deps:
        dependency-type: "development"
    labels:
      - "dependencies"
      - ":crab: rust :crab:"
  # actions
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "daily"


================================================
FILE: .github/workflows/CI.yml
================================================
name: CI

on:
  push:
    branches:
      - main
  pull_request:
    types: [opened, synchronize, reopened]

env:
  MIN_PYTHON_VERSION: "3.10"

defaults:
  run:
    # Prevents windows runners from running on powershell
    shell: bash

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "${{ env.MIN_PYTHON_VERSION }}"
      # rustfmt and clippy are both requested through `components`, so no
      # separate `rustup component add` step is required afterwards.
      - name: Set up rust toolchain
        uses: dtolnay/rust-toolchain@stable
        with:
          components: rustfmt, clippy

      - name: install uv
        uses: astral-sh/setup-uv@v7

      - name: Install dependencies and lint
        run: |
          make install
          make lint

  check-docs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"
      - name: Set up rust toolchain
        uses: dtolnay/rust-toolchain@stable
      - name: install uv
        uses: astral-sh/setup-uv@v7
      - name: Check documentation
        run: |
          make install
          make doc

  test:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14", "3.14t"]
        os:
          - "ubuntu-latest"
          - "ubuntu-24.04-arm"
          - "macos-14"
          - "windows-latest"
          # windows-11-arm excluded: pyarrow is not available for Windows ARM64
          # https://github.com/apache/arrow/issues/47195
    steps:
      - uses: actions/checkout@v6
      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
      - name: Set up rust toolchain
        uses: dtolnay/rust-toolchain@stable

      - name: install uv
        uses: astral-sh/setup-uv@v7

      - name: Install dependencies and test
        run: |
          make install
          make test

      - name: Test with pandas<3
        run: |
          uv pip install "pandas<3"
          make test-python

  check-wheel-build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # Only testing the build on the smallest supported Python version (abi3 wheels)
        # and on the free-threaded 3.14t interpreter
        python-version: ["3.10", "3.14t"]
        os: ["ubuntu-latest", "macos-14", "windows-latest"]
        architecture: [x86-64, aarch64]
        exclude:
          # Cross-compiling x86_64 → aarch64 on Windows doesn't work; use windows-11-arm instead
          - os: windows-latest
            architecture: aarch64
        include:
          # Windows ARM64: build natively on windows-11-arm (3.11 is the minimum available)
          - os: windows-11-arm
            python-version: "3.11"
            architecture: aarch64
          # TODO: re-enable once setup-python supports windows-11-arm + python 3.14t
          # (setup-python is currently broken with that combination)
          # - os: windows-11-arm
          #   python-version: "3.14t"
          #   architecture: aarch64
    steps:
      - uses: actions/checkout@v6
      - uses: dtolnay/rust-toolchain@stable
      - name: Set Rust target
        id: target
        run: |
          TARGET=${{
            (matrix.os == 'macos-14' && (matrix.architecture == 'aarch64' && 'aarch64-apple-darwin' || 'x86_64-apple-darwin'))
            || (matrix.os == 'ubuntu-latest' && (matrix.architecture == 'aarch64' && 'aarch64-unknown-linux-gnu' || 'x86_64-unknown-linux-gnu'))
            || (matrix.os == 'windows-latest' && 'x86_64-pc-windows-msvc')
            || (matrix.os == 'windows-11-arm' && 'aarch64-pc-windows-msvc')
          }}
          echo "target=$TARGET" >> $GITHUB_OUTPUT
      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
      - name: build (fast)
        uses: PyO3/maturin-action@v1
        with:
          manylinux: auto
          command: build
          args: "-o dist -i python${{ matrix.python-version }}"
          target: ${{ steps.target.outputs.target }}

      - name: Upload wheels
        uses: actions/upload-artifact@v7
        with:
          name: "wheels-${{ matrix.os }}-python-${{ matrix.python-version }}-${{ matrix.architecture }}"
          path: dist

  check-wheel-build-musllinux:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.14t"]
        architecture: [x86-64, aarch64]
    steps:
      - uses: actions/checkout@v6
      - uses: dtolnay/rust-toolchain@stable
      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
      - name: build (fast)
        uses: PyO3/maturin-action@v1
        with:
          manylinux: musllinux_1_2
          command: build
          args: "-o dist -i python${{ matrix.python-version }}"
          target: ${{ matrix.architecture == 'aarch64' && 'aarch64-unknown-linux-musl' || 'x86_64-unknown-linux-musl' }}
      - name: Upload wheels
        uses: actions/upload-artifact@v7
        with:
          name: "wheels-linux-musl-python-${{ matrix.python-version }}-${{ matrix.architecture }}"
          path: dist

  check-sdist-build:
    runs-on: "ubuntu-latest"
    steps:
      - uses: actions/checkout@v6
      - uses: dtolnay/rust-toolchain@stable
      - name: build sdist
        uses: PyO3/maturin-action@v1
        with:
          manylinux: auto
          command: sdist
          args: "-o dist"
      - name: upload sdist
        uses: actions/upload-artifact@v7
        with:
          name: sdist
          path: dist


================================================
FILE: .github/workflows/docs.yml
================================================
name: Docs

on:
  push:
    branches:
      - main
    tags:
      - 'v*'
  workflow_dispatch:
    inputs:
      version_tag:
        description: 'Tag to build docs for (e.g. v0.18.0). Checks out the tag before building.'
        required: true
      mark_as_stable:
        description: 'Mark this version as the stable default (updates root redirect)'
        type: boolean
        default: false

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0

      - name: Checkout tag (workflow_dispatch)
        if: github.event_name == 'workflow_dispatch'
        env:
          VERSION_TAG: ${{ github.event.inputs.version_tag }}
        run: git checkout "refs/tags/$VERSION_TAG"

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"

      - name: Set up rust toolchain
        uses: dtolnay/rust-toolchain@stable

      - name: install uv
        uses: astral-sh/setup-uv@v7

      - name: Determine version
        id: version
        env:
          INPUT_VERSION_TAG: ${{ github.event.inputs.version_tag }}
          INPUT_MARK_AS_STABLE: ${{ github.event.inputs.mark_as_stable }}
        run: |
          if [[ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]]; then
            echo "version=$INPUT_VERSION_TAG" >> "$GITHUB_OUTPUT"
            echo "is_stable=$INPUT_MARK_AS_STABLE" >> "$GITHUB_OUTPUT"
          elif [[ "${GITHUB_REF}" == refs/tags/v* ]]; then
            echo "version=${GITHUB_REF#refs/tags/}" >> "$GITHUB_OUTPUT"
            echo "is_stable=true" >> "$GITHUB_OUTPUT"
          else
            echo "version=latest" >> "$GITHUB_OUTPUT"
            echo "is_stable=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Build docs
        env:
          VERSION: ${{ steps.version.outputs.version }}
        run: |
          make install
          make doc-versioned

      - name: Deploy to gh-pages
        env:
          VERSION: ${{ steps.version.outputs.version }}
          IS_STABLE: ${{ steps.version.outputs.is_stable }}
        run: |
          git config user.name github-actions
          git config user.email github-actions@github.com

          # Stash built docs
          cp -r "docs/$VERSION" /tmp/docs-build

          # Switch to the existing gh-pages branch and bring it up to date with main
          git checkout gh-pages
          git merge -m 'Merge main' origin/main

          # Place versioned docs
          rm -rf "docs/$VERSION"
          mv /tmp/docs-build "docs/$VERSION"

          # Update versions.json and root redirect
          STABLE_FLAG=""
          if [[ "$IS_STABLE" == "true" ]]; then
            STABLE_FLAG="--stable"
          fi
          ./scripts/update_versions.py \
            --version "$VERSION" \
            --docs-dir docs \
            $STABLE_FLAG

          git add -f docs
          git commit -m "Update docs ($VERSION)" --allow-empty
          git push origin gh-pages


================================================
FILE: .github/workflows/release.yml
================================================
name: Release

on:
  push:
    # Sequence of patterns matched against refs/tags
    tags:
    - 'v*' # Push events to tags matching v*, e.g. v1.0, v20.15.10

jobs:
  linux:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.14t"]
        architecture: [x86-64, aarch64]
    steps:
    - uses: actions/checkout@v6
    - uses: dtolnay/rust-toolchain@stable
    - uses: actions/setup-python@v6
      with:
        python-version: ${{ matrix.python-version }}
    - name: build (release)
      uses: PyO3/maturin-action@v1
      with:
        manylinux: auto
        command: build
        args: "--release -o dist -i python${{ matrix.python-version }}"
        target: ${{ matrix.architecture == 'aarch64' && 'aarch64-unknown-linux-gnu' || null }}
    - name: Upload wheels
      uses: actions/upload-artifact@v7
      with:
        name: "wheels-linux-python-${{ matrix.python-version }}-${{ matrix.architecture }}"
        path: dist

  linux-musl:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.14t"]
        architecture: [x86-64, aarch64]
    steps:
    - uses: actions/checkout@v6
    - uses: dtolnay/rust-toolchain@stable
    - uses: actions/setup-python@v6
      with:
        python-version: ${{ matrix.python-version }}
    - name: build (release)
      uses: PyO3/maturin-action@v1
      with:
        manylinux: musllinux_1_2
        command: build
        args: "--release -o dist -i python${{ matrix.python-version }}"
        target: ${{ matrix.architecture == 'aarch64' && 'aarch64-unknown-linux-musl' || 'x86_64-unknown-linux-musl' }}
    - name: Upload wheels
      uses: actions/upload-artifact@v7
      with:
        name: "wheels-linux-musl-python-${{ matrix.python-version }}-${{ matrix.architecture }}"
        path: dist

  macos:
    runs-on: macos-14
    strategy:
      matrix:
        python-version: ["3.10", "3.14t"]
        architecture: [x86-64, aarch64]
    steps:
    - uses: actions/checkout@v6
    - uses: dtolnay/rust-toolchain@stable
    - uses: actions/setup-python@v6
      with:
        python-version: ${{ matrix.python-version }}
    - name: build (release)
      uses: PyO3/maturin-action@v1
      with:
        command: build
        args: "--release -o dist -i python${{ matrix.python-version }}"
        target: ${{ matrix.architecture == 'aarch64' && 'aarch64-apple-darwin' || 'x86_64-apple-darwin' }}
    - name: Upload wheels
      uses: actions/upload-artifact@v7
      with:
        name: "wheels-macos-python-${{ matrix.python-version }}-${{ matrix.architecture }}"
        path: dist

  windows:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        python-version: ["3.10", "3.14t"]
        os: [windows-latest]
        architecture: [x86-64]
        include:
          # Windows ARM64: build natively on windows-11-arm (3.11 is the minimum available)
          - os: windows-11-arm
            python-version: "3.11"
            architecture: aarch64
          # TODO: re-enable once setup-python supports windows-11-arm + python 3.14t
          # (setup-python is currently broken with that combination)
          # - os: windows-11-arm
          #   python-version: "3.14t"
          #   architecture: aarch64
    steps:
    - uses: actions/checkout@v6
    - uses: dtolnay/rust-toolchain@stable
    - uses: actions/setup-python@v6
      with:
        python-version: ${{ matrix.python-version }}
    - name: build (release)
      uses: PyO3/maturin-action@v1
      with:
        command: build
        args: "--release -o dist -i python${{ matrix.python-version }}"
        target: ${{ matrix.architecture == 'aarch64' && 'aarch64-pc-windows-msvc' || 'x86_64-pc-windows-msvc' }}
    - name: Upload wheels
      uses: actions/upload-artifact@v7
      with:
        name: "wheels-windows-python-${{ matrix.python-version }}-${{ matrix.architecture }}"
        path: dist

  sdist:
    runs-on: "ubuntu-latest"
    steps:
      - uses: actions/checkout@v6
      - uses: dtolnay/rust-toolchain@stable
      - uses: actions/setup-python@v6
        with:
          python-version: "3.10"
      - name: build (sdist)
        uses: PyO3/maturin-action@v1
        with:
          manylinux: auto
          command: sdist
          args: "-o dist"
      - name: Upload sdist
        uses: actions/upload-artifact@v7
        with:
          name: sdist
          path: dist


  # NOTE: Cannot use a matrix here, as we only want a single release
  release:
    name: Release
    runs-on: ubuntu-latest
    needs: [linux, linux-musl, macos, windows, sdist]
    permissions:
      id-token: write     # Required for OIDC token exchange with crates.io
      contents: write     # Required to be able to create a GitHub release
    steps:
    - uses: actions/checkout@v6
    - uses: dtolnay/rust-toolchain@stable
    - uses: rust-lang/crates-io-auth-action@v1
      id: auth

    - name: Download Linux wheels
      uses: actions/download-artifact@v8
      with:
        pattern: "wheels-linux-*"
        merge-multiple: true
        path: wheels-linux

    - name: Download MacOS wheels
      uses: actions/download-artifact@v8
      with:
        pattern: "wheels-macos-*"
        merge-multiple: true
        path: wheels-macos

    - name: Download Windows wheels
      uses: actions/download-artifact@v8
      with:
        pattern: "wheels-windows-*"
        merge-multiple: true
        path: wheels-windows

    - name: Download sdist
      uses: actions/download-artifact@v8
      with:
        name: "sdist"
        path: sdist

    - name: Publish to PyPI
      uses: PyO3/maturin-action@v1
      env:
        MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
      with:
        command: upload
        args: "--skip-existing wheels-linux/*.whl wheels-macos/*.whl wheels-windows/*.whl sdist/*.tar.gz"

    - name: Publish to crates.io
      run: cargo publish
      env:
        CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}

    - name: Release
      uses: softprops/action-gh-release@v3
      with:
        generate_release_notes: true
        files: |
          wheels-linux/*.whl
          wheels-macos/*.whl
          wheels-windows/*.whl
          sdist/*.tar.gz


================================================
FILE: .gitignore
================================================
/target

bigfile.*
__pycache__
*.pyc
*.so
*.dat
.DS_Store

.python-version
pyrightconfig.json
.venv
docs
.vscode
.idea
.benchmarks
notebooks
/python/tests/fixtures/~$*.xlsx
.zed
dist


================================================
FILE: .pre-commit-config.yaml
================================================
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.2.0
    hooks:
    -   id: trailing-whitespace
    -   id: end-of-file-fixer
# Rust hooks; fetched over HTTPS like the other hook repositories
-   repo: https://github.com/doublify/pre-commit-rust
    rev: v1.0
    hooks:
    -   id: cargo-check
-   repo: local
    hooks:
    -   id: lint
        name: Lint
        entry: make lint
        types_or: [python, rust]
        language: system
        pass_filenames: false
    -   id: format
        name: Format
        entry: make format
        types_or: [python, rust]
        language: system
        pass_filenames: false


================================================
FILE: Cargo.toml
================================================
[package]
name = "fastexcel"
version = "0.20.2"
description = "A fast excel reader for Rust and Python"
rust-version = "1.85.0"
edition = "2024"
license = "MIT"
homepage = "https://github.com/ToucanToco/fastexcel"
repository = "https://github.com/ToucanToco/fastexcel.git"
readme = "README.md"
include = [
    "/pyproject.toml",
    "/README.md",
    "/LICENSE",
    "/Makefile",
    "/src",
    "/python/fastexcel",
    "!__pycache__",
    "!*.pyc",
    "!*.so",
]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
name = "fastexcel"
crate-type = ["cdylib", "rlib"]

[dependencies]
arrow-array = { version = "^58", features = ["ffi"], optional = true }
arrow-pyarrow = { version = "^58", optional = true }
arrow-schema = { version = "^58", optional = true }
calamine = { version = "^0.35.0", features = ["chrono"] }
chrono = { version = "^0.4.40", default-features = false }
log = "^0.4"
polars-core = { version = ">=0.53", features = [
    "dtype-date",
    "dtype-datetime",
    "dtype-duration",
], optional = true }
pyo3 = { version = "^0.28", features = ["abi3-py310"], optional = true }
pyo3-arrow = { version = "^0.17", default-features = false, optional = true }
pyo3-log = { version = "^0.13.3", optional = true }

[dev-dependencies]
anyhow = "1.0.102"
pretty_assertions = { version = "^1.4.1", features = ["unstable"] }
rstest = { version = "^0.26.1", default-features = false }

# NOTE: This is a hack to bypass pyo3 limitations when testing:
# https://pyo3.rs/v0.22.3/faq.html#i-cant-run-cargo-test-or-i-cant-build-in-a-cargo-workspace-im-having-linker-issues-like-symbol-not-found-or-undefined-reference-to-_pyexc_systemerror
[features]
default = []
__arrow = ["dep:arrow-schema", "dep:arrow-array"]
python = ["__arrow", "dep:pyo3", "dep:pyo3-log", "dep:pyo3-arrow"]
extension-module = ["pyo3/extension-module"]
polars = ["dep:polars-core"]
pyarrow = ["dep:arrow-pyarrow", "python"]
# Private features for internal usage, should not be used directly as they may
# change without notice
__pyo3-tests = [
    # feature for tests only. This makes Python::with_gil auto-initialize Python
    # interpreters, which allows us to instantiate Python objects in tests
    # (see https://pyo3.rs/v0.22.3/features#auto-initialize)
    "pyo3/auto-initialize",
    "pyarrow",
]
__rust-tests-standalone = []
__rust-tests-polars = ["polars"]
# Private feature for maturin usage, should not be used directly
__maturin = ["extension-module", "pyarrow"]


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2024 ToucanToco

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: Makefile
================================================
.DEFAULT_GOAL := all
sources = python/fastexcel python/tests

export CARGO_TERM_COLOR=$(shell (test -t 0 && echo always) || echo auto)

# Prerequisite of the install targets: abort with a non-zero status when uv is
# missing, so that dependent targets stop here instead of failing later with a
# less helpful "command not found" error.
.PHONY: .uv  ## Check that uv is installed
.uv:
	@uv -V || (echo 'Please install uv: https://docs.astral.sh/uv/getting-started/installation/' && exit 1)

.PHONY: install  ## Install the package & dependencies with debug build
install: .uv
	uv sync --frozen --group all
	uv run maturin develop --uv -E pyarrow,pandas,polars

.PHONY: install-prod  ## Install the package & dependencies with release build
install-prod: .uv
	uv sync --frozen --group all
	uv run maturin develop --uv --release -E pyarrow,pandas,polars

.PHONY: setup-dev  ## First-time setup: install + pre-commit hooks
setup-dev: install
	uv run pre-commit install --install-hooks

.PHONY: rebuild-lockfiles  ## Rebuild lockfiles from scratch, updating all dependencies
rebuild-lockfiles: .uv
	uv lock --upgrade
	cargo update

.PHONY: build-dev  ## Build the development version of the package
build-dev:
	uv run maturin build

.PHONY: build-wheel  ## Build production wheel and install it
build-wheel:
	@rm -rf target/wheels/
	uv run maturin build --release
	@wheel=$$(ls target/wheels/*.whl); uv pip install --force-reinstall "$$wheel[pandas,polars]"

.PHONY: lint-python  ## Lint python source files
lint-python:
	uv run ruff check $(sources)
	uv run ruff format --check $(sources)
	uv run mypy $(sources)

.PHONY: lint-rust  ## Lint rust source files
lint-rust:
	cargo fmt --all -- --check
	# Rust
	cargo clippy --tests -- -D warnings
	# Python-related code
	cargo clippy --features __maturin,__pyo3-tests --tests -- -D warnings
	# Rust+polars
	cargo clippy --features polars --tests -- -D warnings

.PHONY: lint  ## Lint rust and python source files
lint: lint-python lint-rust

.PHONY: format-python  ## Auto-format python source files
format-python:
	uv run ruff check --fix $(sources)
	uv run ruff format $(sources)

.PHONY: format-rust  ## Auto-format rust source files
format-rust:
	cargo fmt --all
	cargo clippy --all-features --tests --fix --lib -p fastexcel --allow-dirty --allow-staged

.PHONY: format  ## Auto-format python and rust source files
format: format-rust format-python

.PHONY: test-python  ## Run python tests
test-python: install
	uv run pytest

.PHONY: test-rust-pyo3  ## Run PyO3 rust tests
test-rust-pyo3:
	# --lib to skip integration tests
	cargo test --no-default-features --features __pyo3-tests --lib

.PHONY: test-rust-standalone  ## Run standalone rust tests
test-rust-standalone:
	cargo test --no-default-features --features __rust-tests-standalone

.PHONY: test-rust-polars  ## Run polars rust tests
test-rust-polars:
	cargo test --no-default-features --features __rust-tests-polars

.PHONY: test-rust  ## Run rust tests
test-rust: test-rust-pyo3 test-rust-standalone test-rust-polars

.PHONY: test  ## Run all tests
test: test-rust test-python

.PHONY: doc-serve  ## Serve documentation with live reload
doc-serve: build-dev
	uv run pdoc --template-directory doc-templates python/fastexcel

.PHONY: doc  ## Build documentation
doc: build-dev
	uv run pdoc --template-directory doc-templates -o docs/latest python/fastexcel
	uv run scripts/update_versions.py --version latest --docs-dir docs
	cargo doc --no-deps --lib -p fastexcel --features polars

.PHONY: doc-versioned  ## Build versioned documentation (CI usage: VERSION=v0.19.0 [STABLE=1] make doc-versioned)
doc-versioned: build-dev
	@test -n "$(VERSION)" || (echo "ERROR: VERSION is not set. Usage: VERSION=v0.19.0 [STABLE=1] make doc-versioned" && exit 1)
	uv run pdoc --template-directory doc-templates -o docs/$(VERSION) python/fastexcel
	uv run scripts/update_versions.py --version $(VERSION) --docs-dir docs $(if $(filter 1,$(STABLE)),--stable,)

.PHONY: all  ## Run the standard set of checks performed in CI
all: format build-dev lint test

.PHONY: benchmarks  ## Run benchmarks
benchmarks: build-wheel
	uv run pytest ./python/tests/benchmarks/speed.py

# The find-based removals use `-exec ... {} +` rather than backtick command
# substitution, so paths containing whitespace are handled correctly and `rm`
# is never invoked without operands when nothing matches.
.PHONY: clean  ## Clear local caches and build artifacts
clean:
	find . -name __pycache__ -prune -exec rm -rf {} +
	find . -type f -name '*.py[co]' -exec rm -f {} +
	find . -type f -name '*~' -exec rm -f {} +
	find . -type f -name '.*~' -exec rm -f {} +
	rm -rf .cache
	rm -rf htmlcov
	rm -rf .pytest_cache
	rm -rf *.egg-info
	rm -f .coverage
	rm -f .coverage.*
	rm -rf build
	rm -rf perf.data*
	rm -rf python/fastexcel/*.so

.PHONY: help  ## Display this message
help:
	@grep -E \
		'^.PHONY: .*?## .*$$' $(MAKEFILE_LIST) | \
		sort | \
		awk 'BEGIN {FS = ".PHONY: |## "}; {printf "\033[36m%-19s\033[0m %s\n", $$2, $$3}'


================================================
FILE: README.md
================================================
# `fastexcel`

A fast excel file reader for Python and Rust.

Docs:
 * [Python](https://fastexcel.toucantoco.dev/).
 * [Rust](https://docs.rs/fastexcel).

## Stability

The Python library is considered production-ready. The API is mostly stable, and we avoid breaking changes as much as
possible. v1.0.0 will be released once the [milestone](https://github.com/ToucanToco/fastexcel/milestone/2) is reached.

> ⚠️ The free-threaded build is still considered experimental

The Rust crate is still experimental, and breaking changes are to be expected.

## Installation

```bash
# Lightweight installation (no PyArrow dependency)
pip install fastexcel

# With Polars support only (no PyArrow needed)
pip install fastexcel[polars]

# With Pandas support (includes PyArrow)
pip install fastexcel[pandas]

# With PyArrow support
pip install fastexcel[pyarrow]

# With all integrations
pip install fastexcel[pandas,polars]
```

## Quick Start

### Modern usage (recommended)

FastExcel supports the [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) for zero-copy data exchange with any Arrow-compatible library, such as Polars, without requiring pyarrow as a dependency.

```python
import fastexcel

# Load an Excel file
reader = fastexcel.read_excel("data.xlsx")
sheet = reader.load_sheet(0)  # Load first sheet

# Use with Polars (zero-copy, no pyarrow needed)
import polars as pl
df = pl.DataFrame(sheet)  # Direct PyCapsule interface
print(df)

# Or use the to_polars() method (also via PyCapsule)
df = sheet.to_polars()
print(df)

# Or access the raw Arrow data via PyCapsule interface
schema = sheet.__arrow_c_schema__()
array_data = sheet.__arrow_c_array__()
```

### Traditional usage (with pandas/pyarrow)

```python
import fastexcel

reader = fastexcel.read_excel("data.xlsx")
sheet = reader.load_sheet(0)

# Convert to pandas (requires `pandas` extra)
df = sheet.to_pandas()

# Or get a pyarrow RecordBatch directly (requires the `pyarrow` extra)
record_batch = sheet.to_arrow()
```

### Working with tables

```python
import fastexcel
import polars as pl

reader = fastexcel.read_excel("data.xlsx")

# List available tables
tables = reader.table_names()
print(f"Available tables: {tables}")

# Load a specific table
table = reader.load_table("MyTable")
df = pl.DataFrame(table)  # Zero-copy via PyCapsule, no pyarrow needed
```

## Key Features

- **Zero-copy data exchange** via [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html)
- **Flexible dependencies** - use with Polars (no PyArrow needed) or Pandas (includes PyArrow)
- **Seamless Polars integration** - `pl.DataFrame(sheet)` and `sheet.to_polars()` work without PyArrow via PyCapsule interface
- **High performance** - written in Rust with [calamine](https://github.com/tafia/calamine) and [Apache Arrow](https://arrow.apache.org/)
- **Memory efficient** - lazy loading and optional eager evaluation
- **Type safety** - automatic type inference with manual override options

## Contributing & Development

### Prerequisites

You'll need:
1. **[Rust](https://rustup.rs/)** - Rust stable or nightly
2. **[uv](https://docs.astral.sh/uv/getting-started/installation/)** - Fast Python package manager (will install Python 3.10+ automatically)
3. **[git](https://git-scm.com/)** - For version control
4. **[make](https://www.gnu.org/software/make/)** - For running development commands

**Python Version Management:**
uv handles Python installation automatically. To use a specific Python version:
```bash
uv python install 3.13  # Install Python 3.13
uv python pin 3.13      # Pin project to Python 3.13
```

### Quick Start

```bash
# Clone the repository (or from your fork)
git clone https://github.com/ToucanToco/fastexcel.git
cd fastexcel

# First-time setup: install dependencies, build debug version, and setup pre-commit hooks
make setup-dev
```

Verify your installation by running:

```bash
make
```

This runs a full development cycle: formatting, building, linting, and testing.

### Development Commands

Run `make help` to see all available commands, or use these common ones:

```bash
make all          # full dev cycle: format, build, lint, test
make install      # install with debug build (daily development)
make install-prod # install with release build (benchmarking)
make test         # to run the tests
make lint         # to run the linter
make format       # to format python and rust code
make doc-serve    # to serve the documentation locally
```

### Useful Resources

* [`python/fastexcel/_fastexcel.pyi`](./python/fastexcel/_fastexcel.pyi) - Python API types
* [`python/tests/`](./python/tests) - Comprehensive usage examples

## Benchmarking

For benchmarking, use `make benchmarks` which automatically builds an optimised wheel.
This is required for profiling, as dev mode builds are much slower.

### Speed benchmarks
```bash
make benchmarks
```

### Memory profiling
```bash
mprof run -T 0.01 python python/tests/benchmarks/memory.py python/tests/benchmarks/fixtures/plain_data.xls
```

## Creating a release

1. Create a PR containing a commit that only updates the version in `Cargo.toml`.
2. Once it is approved, squash and merge it into main.
3. Tag the squashed commit, and push it.
4. The `release` GitHub action will take care of the rest.

## Dev tips

* Use `cargo check` to verify that your Rust code compiles; there is no need to go through `maturin` every time
* `cargo clippy` = 💖
* Be careful with Arrow constructors: they tend to allocate a lot
* [`mprof`](https://github.com/pythonprofilers/memory_profiler) and `time` go a long way for perf checks;
  there is no need to go fancy right from the start


================================================
FILE: doc-templates/module.html.jinja2
================================================
{% extends "default/module.html.jinja2" %}
{# Adds a documentation version switcher below the navigation title. #}
{% block nav_title %}
    {{ super() }}
    <div id="version-switcher" style="padding: 0.5rem 1.5rem 0.5rem 0;">
        <label for="version-select" style="font-size: 0.85rem; font-weight: bold;">Version</label>
        <select id="version-select"
                style="display: block; width: 100%; margin-top: 0.25rem; padding: 0.25rem 0.4rem;
                       font-size: 0.85rem; border-radius: 4px; border: 1px solid var(--accent2);
                       background: var(--bg); color: var(--text);">
            <option>loading...</option>
        </select>
    </div>
    <script>
    // Populates the version <select> from `<base>/versions.json` and navigates to the same page
    // of the chosen version. The switcher is hidden when the URL contains no version segment.
    (function() {
        var pathname = window.location.pathname;
        var parts = pathname.replace(/\/+$/, '').split('/');
        // Find the version segment: first path part that looks like a version or "latest"
        var versionIdx = -1;
        for (var i = 1; i < parts.length; i++) {
            if (parts[i] === 'latest' || /^v\d/.test(parts[i])) {
                versionIdx = i;
                break;
            }
        }
        if (versionIdx === -1) {
            var switcher = document.getElementById('version-switcher');
            if (switcher) switcher.style.display = 'none';
            return;
        }
        var currentVersion = parts[versionIdx];

        // Build base URL (everything before the version segment). It is rebuilt from the path
        // segments rather than searched for with `indexOf`, because the version text may also
        // appear inside an earlier segment (e.g. `/latest-docs/latest/`).
        var baseUrl = parts.slice(0, versionIdx).join('/') + '/';
        // Get the page path after the version segment
        var pagePath = pathname.substring(baseUrl.length + currentVersion.length);

        fetch(baseUrl + 'versions.json')
            .then(function(r) {
                // A non-2xx response has no usable version list: fall through to `.catch`
                if (!r.ok) throw new Error('Could not load versions.json: ' + r.status);
                return r.json();
            })
            .then(function(versions) {
                var select = document.getElementById('version-select');
                select.innerHTML = '';
                versions.forEach(function(v) {
                    var opt = document.createElement('option');
                    opt.value = baseUrl + v.path + pagePath + window.location.search + window.location.hash;
                    opt.textContent = v.label;
                    if (v.path === currentVersion) opt.selected = true;
                    select.appendChild(opt);
                });
                select.onchange = function() {
                    if (this.value) window.location.href = this.value;
                };
            })
            .catch(function() {
                // Fall back to a single entry showing the version of the current page
                var select = document.getElementById('version-select');
                var opt = document.createElement('option');
                select.innerHTML = '';
                opt.textContent = currentVersion || 'unknown';
                select.appendChild(opt);
            });
    })();
    </script>
{% endblock %}


================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["maturin>=1.7.0,<2.0"]
build-backend = "maturin"

[project]
name = "fastexcel"
description = "A fast excel file reader for Python, written in Rust"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.10"
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Rust",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Programming Language :: Python :: 3.14",
    "Programming Language :: Python :: Implementation :: CPython",
    "Programming Language :: Python :: Free Threading :: 1 - Unstable"
]
dependencies = ["typing-extensions>=4.0.0; python_version<'3.10'"]
dynamic = ["version"]

[project.optional-dependencies]
pyarrow = ["pyarrow>=8.0.0"]
pandas = ["pandas>=1.4.4", "pyarrow>=8.0.0"]
polars = ["polars>=1"]

[dependency-groups]
dev = ["maturin>=1.7.0,<2.0"]
testing = [
    { include-group = "dev" },
    "pytest>=7.1.3",
    "pytest-benchmark>=4.0.0,<6",
    "pytest-mock>=3.1",
    "pyarrow>=8.0.0",
    "pandas>=1.4.4",
    "polars>=0.16.14",
    "openpyxl>=3.1.2,<4",
    "xlrd>=2.0.1,<3",
]
linting = [
    { include-group = "dev" },
    "mypy>=2,<3",
    "pre-commit>=2.20.0,<5",
    "ruff>=0.15",
]
docs = [{ include-group = "dev" }, "pdoc"]
all = [
    { include-group = "testing" },
    { include-group = "linting" },
    { include-group = "docs" },
]

[project.urls]
"Source Code" = "https://github.com/ToucanToco/fastexcel"
# Points at the issue tracker rather than the repository root
Issues = "https://github.com/ToucanToco/fastexcel/issues"

[tool.maturin]
python-source = "python"
module-name = "fastexcel._fastexcel"
features = ["__maturin"]

[tool.mypy]
python_version = "3.10"
follow_imports = "silent"
ignore_missing_imports = true
# A few custom options
show_error_codes = true
warn_no_return = true
warn_unused_configs = true
warn_unused_ignores = true

[tool.pytest.ini_options]
testpaths = "python/tests"
log_cli = true
log_cli_level = "INFO"

[tool.ruff]
line-length = 100
target-version = "py310"

[tool.ruff.lint]
# Enable pycodestyle (`E`), Pyflakes (`F`), isort (`I`), flake8-quotes (`Q`),
# flake8-future-annotations (`FA102`) and pyupgrade (`UP`) rules.
select = ["E", "F", "I", "Q", "FA102", "UP"]

[tool.uv]
# this ensures that `uv run` doesn't actually build the package; a `make`
# command is needed to build
package = false
required-version = '>=0.8.4'


================================================
FILE: python/fastexcel/__init__.py
================================================
from __future__ import annotations

import typing
from collections.abc import Callable
from typing import TYPE_CHECKING, Literal, TypeAlias

if TYPE_CHECKING:
    import pandas as pd
    import polars as pl
    import pyarrow as pa

from os.path import expanduser
from pathlib import Path

try:
    import importlib.util

    # `find_spec` returns `None` for a missing top-level module instead of raising, so the
    # result has to be checked explicitly; otherwise the flag would always end up `True`.
    _PYARROW_AVAILABLE = importlib.util.find_spec("pyarrow") is not None
except (ImportError, ValueError):
    # `ValueError` is raised when `pyarrow` sits in `sys.modules` without a usable `__spec__`
    _PYARROW_AVAILABLE = False

from ._fastexcel import (
    ArrowError,
    CalamineCellError,
    CalamineError,
    CannotRetrieveCellDataError,
    CellError,
    CellErrors,
    ColumnInfo,
    ColumnInfoNoDtype,
    ColumnNotFoundError,
    DefinedName,
    FastExcelError,
    InvalidParametersError,
    SheetNotFoundError,
    UnsupportedColumnTypeCombinationError,
    __version__,
    _ExcelReader,
    _ExcelSheet,
    _ExcelTable,
)
from ._fastexcel import read_excel as _read_excel

# Names of the dtypes accepted by the `dtypes` parameters of the loading functions
DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
# Per-column dtypes, keyed by column name or column index
DTypeMap: TypeAlias = "dict[str | int, DType]"
# NOTE(review): presumably describes where a column's name comes from -- confirm against the stubs
ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"]
# NOTE(review): presumably describes how a column's dtype was determined -- confirm against the stubs
DTypeFrom: TypeAlias = Literal[
    "provided_for_all", "provided_by_index", "provided_by_name", "guessed"
]
# Possible values of `ExcelSheet.visible`
SheetVisible: TypeAlias = Literal["visible", "hidden", "veryhidden"]


class ExcelSheet:
    """A single sheet of an Excel file.

    Every attribute and method delegates to the wrapped native `_ExcelSheet` object.
    """

    def __init__(self, sheet: _ExcelSheet) -> None:
        self._sheet = sheet

    @property
    def name(self) -> str:
        """Name of the sheet"""
        return self._sheet.name

    @property
    def width(self) -> int:
        """Width of the sheet"""
        return self._sheet.width

    @property
    def height(self) -> int:
        """Height of the sheet, once `skip_rows` and `nrows` have been applied"""
        return self._sheet.height

    @property
    def total_height(self) -> int:
        """Total height of the sheet"""
        return self._sheet.total_height

    @property
    def selected_columns(self) -> list[ColumnInfo]:
        """Columns selected for this sheet"""
        return self._sheet.selected_columns

    def available_columns(self) -> list[ColumnInfo]:
        """Columns available in this sheet"""
        return self._sheet.available_columns()

    @property
    def specified_dtypes(self) -> DTypeMap | None:
        """Dtypes that were specified for this sheet"""
        return self._sheet.specified_dtypes

    @property
    def visible(self) -> SheetVisible:
        """Visibility of the sheet"""
        return self._sheet.visible

    def to_arrow(self) -> pa.RecordBatch:
        """Returns the sheet as a pyarrow `RecordBatch`.

        Requires the `pyarrow` extra to be installed.

        :raises ImportError: if the module-level `_PYARROW_AVAILABLE` flag is false.
        """
        if _PYARROW_AVAILABLE:
            return self._sheet.to_arrow()
        raise ImportError(
            "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'"
        )

    def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]:
        """Returns the sheet as a pyarrow `RecordBatch`, along with error information.

        The error information holds the positions of the values that could not be parsed as the
        specified type and were therefore converted to None. It is `None` when there is no such
        value.

        Requires the `pyarrow` extra to be installed.

        :raises ImportError: if the module-level `_PYARROW_AVAILABLE` flag is false.
        """
        if not _PYARROW_AVAILABLE:
            raise ImportError(
                "pyarrow is required for to_arrow_with_errors(). "
                "Install with: pip install 'fastexcel[pyarrow]'"
            )
        batch, errors = self._sheet.to_arrow_with_errors()
        # An empty error list is reported as `None`
        return (batch, errors if errors.errors else None)

    def to_pandas(self) -> pd.DataFrame:
        """Returns the sheet as a Pandas `DataFrame`.

        Requires the `pandas` extra to be installed.
        """
        # The pandas PyCapsule interface requires `__dataframe__` or `__arrow_c_stream__`, neither
        # of which is implemented here, so the conversion goes through pyarrow for now.
        # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html)
        record_batch = self.to_arrow()
        return record_batch.to_pandas()

    def to_polars(self) -> pl.DataFrame:
        """Returns the sheet as a Polars `DataFrame`.

        The data is handed over through the Arrow PyCapsule Interface (zero-copy).
        Requires the `polars` extra to be installed.
        """
        # Imported lazily so that polars remains an optional dependency
        from polars import DataFrame

        return DataFrame(self)

    def __arrow_c_schema__(self) -> object:
        """Exports the schema as an `ArrowSchema` `PyCapsule`.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export

        Thanks to the Arrow PyCapsule Interface, data can be exchanged with Arrow-compatible
        libraries without copies and without depending on PyArrow.
        """
        return self._sheet.__arrow_c_schema__()

    def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
        """Exports the schema and the data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.

        `requested_schema` is optional and allows for a potential schema conversion.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export

        Thanks to the Arrow PyCapsule Interface, data can be exchanged with Arrow-compatible
        libraries without copies and without depending on PyArrow.
        """
        return self._sheet.__arrow_c_array__(requested_schema)

    def __repr__(self) -> str:
        return repr(self._sheet)


class ExcelTable:
    """A single table of an Excel file.

    Every attribute and method delegates to the wrapped native `_ExcelTable` object.
    """

    def __init__(self, table: _ExcelTable) -> None:
        self._table = table

    @property
    def name(self) -> str:
        """Name of the table"""
        return self._table.name

    @property
    def sheet_name(self) -> str:
        """Name of the sheet the table belongs to"""
        return self._table.sheet_name

    @property
    def width(self) -> int:
        """Width of the table"""
        return self._table.width

    @property
    def height(self) -> int:
        """Height of the table"""
        return self._table.height

    @property
    def total_height(self) -> int:
        """Total height of the table"""
        return self._table.total_height

    @property
    def offset(self) -> int:
        """Offset of the table before its data starts"""
        return self._table.offset

    @property
    def selected_columns(self) -> list[ColumnInfo]:
        """Columns selected for this table"""
        return self._table.selected_columns

    def available_columns(self) -> list[ColumnInfo]:
        """Columns available in this table"""
        return self._table.available_columns()

    @property
    def specified_dtypes(self) -> DTypeMap | None:
        """Dtypes that were specified for this table"""
        return self._table.specified_dtypes

    def to_arrow(self) -> pa.RecordBatch:
        """Returns the table as a pyarrow `RecordBatch`.

        Requires the `pyarrow` extra to be installed.

        :raises ImportError: if the module-level `_PYARROW_AVAILABLE` flag is false.
        """
        if _PYARROW_AVAILABLE:
            return self._table.to_arrow()
        raise ImportError(
            "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'"
        )

    def to_pandas(self) -> pd.DataFrame:
        """Returns the table as a Pandas `DataFrame`.

        Requires the `pandas` extra to be installed.
        """
        # The pandas PyCapsule interface requires `__dataframe__` or `__arrow_c_stream__`, neither
        # of which is implemented here, so the conversion goes through pyarrow for now.
        # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html)
        record_batch = self.to_arrow()
        return record_batch.to_pandas()

    def to_polars(self) -> pl.DataFrame:
        """Returns the table as a Polars `DataFrame`.

        The data is handed over through the Arrow PyCapsule Interface (zero-copy).
        Requires the `polars` extra to be installed.
        """
        # Imported lazily so that polars remains an optional dependency
        from polars import DataFrame

        return DataFrame(self)

    def __arrow_c_schema__(self) -> object:
        """Exports the schema as an `ArrowSchema` `PyCapsule`.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export

        Thanks to the Arrow PyCapsule Interface, data can be exchanged with Arrow-compatible
        libraries without copies and without depending on PyArrow.
        """
        return self._table.__arrow_c_schema__()

    def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
        """Exports the schema and the data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.

        `requested_schema` is optional and allows for a potential schema conversion.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export

        Thanks to the Arrow PyCapsule Interface, data can be exchanged with Arrow-compatible
        libraries without copies and without depending on PyArrow.
        """
        return self._table.__arrow_c_array__(requested_schema)


class ExcelReader:
    """A class representing an open Excel file and allowing to read its sheets"""

    def __init__(self, reader: _ExcelReader) -> None:
        # Native reader object that the methods of this class delegate to
        self._reader = reader

    @property
    def sheet_names(self) -> list[str]:
        """The list of sheet names, as reported by the underlying reader"""
        return self._reader.sheet_names

    @typing.overload
    def load_sheet(
        self,
        idx_or_name: int | str,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: Literal[False] = ...,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> ExcelSheet: ...

    @typing.overload
    def load_sheet(
        self,
        idx_or_name: int | str,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: Literal[True] = ...,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> pa.RecordBatch: ...

    def load_sheet(
        self,
        idx_or_name: int | str,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: bool = False,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> ExcelSheet | pa.RecordBatch:
        """Loads a sheet by index or name.

        :param idx_or_name: The index (starting at 0) or the name of the sheet to load.
        :param header_row: The index of the row containing the column labels, default index is 0.
                           If `None`, the sheet does not have any column labels.
                           Any rows before the `header_row` will be automatically skipped.
        :param column_names: Overrides headers found in the document.
                             If `column_names` is used, `header_row` will be ignored.
        :param n_rows: Specifies how many rows should be loaded.
                       If `None`, all rows are loaded
        :param skip_rows: Specifies which rows should be skipped after the `header_row`.
                          Any rows before the `header_row` are automatically skipped.
                          It means row indices are relative to data rows, not the sheet!
                          Can be one of:
                          - `int`: Skip this many rows after the header row
                          - `list[int]`: Skip specific row indices (0-based relative to data rows)
                          - `Callable[[int], bool]`: Function that receives row index (0-based
                          relative to data rows) and returns True to skip the row
                          - `None`: If `header_row` is None, skips empty rows at beginning
        :param schema_sample_rows: Specifies how many rows should be used to determine
                                   the dtype of a column. Cannot be 0. A specific dtype can be
                                   enforced for some or all columns through the `dtypes` parameter.
                                   If `None`, all rows will be used.
        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
                               will try to coerce different dtypes in a column to the same one,
                               whereas `strict` will raise an error in case a column contains
                               several dtypes. Note that this only applies to columns whose dtype
                               is guessed, i.e. not specified via `dtypes`.
        :param use_columns: Specifies the columns to use. Can either be:
                            - `None` to select all columns
                            - A list of strings and ints, the column names and/or indices
                              (starting at 0)
                            - A string, a comma separated list of Excel column letters and column
                              ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
                              `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges
                              (e.g. `"B:"` to select all columns from B onwards) and from-beginning
                              ranges (e.g. `":C"` to select columns from A to C). These can be
                              combined for "except" patterns (e.g. `":C,E:"` to select everything
                              except column D)
                            - A callable, a function that takes a column and returns a boolean
                              indicating whether the column should be used
        :param dtypes: An optional dtype (for all columns)
                       or dict of dtypes with keys as column indices or names.
        :param eager: Specifies whether the sheet should be loaded eagerly.
                      `False` (default) will load the sheet lazily using the `PyCapsule` interface,
                      whereas `True` will load it eagerly via `pyarrow`.

                      Eager loading requires the `pyarrow` extra to be installed.
        :param skip_whitespace_tail_rows: Skip rows at the end of the sheet
                                          containing only whitespace and null values.
        :param whitespace_as_null: Consider cells containing only whitespace as null values.
        """
        sheet_or_rb = self._reader.load_sheet(
            idx_or_name=idx_or_name,
            header_row=header_row,
            column_names=column_names,
            skip_rows=skip_rows,
            n_rows=n_rows,
            schema_sample_rows=schema_sample_rows,
            dtype_coercion=dtype_coercion,
            use_columns=use_columns,
            dtypes=dtypes,
            eager=eager,
            skip_whitespace_tail_rows=skip_whitespace_tail_rows,
            whitespace_as_null=whitespace_as_null,
        )
        return sheet_or_rb if eager else ExcelSheet(sheet_or_rb)

    def table_names(self, sheet_name: str | None = None) -> list[str]:
        """The list of table names.

        Will return an empty list if no tables are found.

        :param sheet_name: If given, only the tables of this sheet are listed, which is also
                           faster.
        """
        return self._reader.table_names(sheet_name)

    def defined_names(self) -> list[DefinedName]:
        """The list of defined names (named ranges) in the workbook.

        Returns a list of `DefinedName` objects with `name` and `formula` attributes.
        The formula is a string representation of the range or expression.

        Will return an empty list if no defined names are found.
        """
        return self._reader.defined_names()

    @typing.overload
    def load_table(
        self,
        name: str,
        *,
        header_row: int | None = None,
        column_names: list[str] | None = None,
        skip_rows: int | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: Literal[False] = ...,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> ExcelTable: ...

    @typing.overload
    def load_table(
        self,
        name: str,
        *,
        header_row: int | None = None,
        column_names: list[str] | None = None,
        skip_rows: int | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: Literal[True] = ...,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> pa.RecordBatch: ...

    def load_table(
        self,
        name: str,
        *,
        header_row: int | None = None,
        column_names: list[str] | None = None,
        skip_rows: int | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: bool = False,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> ExcelTable | pa.RecordBatch:
        """Loads a table by name.

        :param name: The name of the table to load.
        :param header_row: The index of the row containing the column labels.
                           If `None`, the table's column names will be used.
                           Any rows before the `header_row` will be automatically skipped.
        :param column_names: Overrides headers found in the document.
                             If `column_names` is used, `header_row` will be ignored.
        :param n_rows: Specifies how many rows should be loaded.
                       If `None`, all rows are loaded
        :param skip_rows: Specifies how many rows should be skipped after the `header_row`.
                          Any rows before the `header_row` are automatically skipped.
                          If `header_row` is `None`, it skips the number of rows from the
                          start of the sheet.
        :param schema_sample_rows: Specifies how many rows should be used to determine
                                   the dtype of a column. Cannot be 0. A specific dtype can be
                                   enforced for some or all columns through the `dtypes` parameter.
                                   If `None`, all rows will be used.
        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
                               will try to coerce different dtypes in a column to the same one,
                               whereas `strict` will raise an error in case a column contains
                               several dtypes. Note that this only applies to columns whose dtype
                               is guessed, i.e. not specified via `dtypes`.
        :param use_columns: Specifies the columns to use. Can either be:
                            - `None` to select all columns
                            - A list of strings and ints, the column names and/or indices
                              (starting at 0)
                            - A string, a comma separated list of Excel column letters and column
                              ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
                              `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges
                              (e.g. `"B:"` to select all columns from B onwards) and from-beginning
                              ranges (e.g. `":C"` to select columns from A to C). These can be
                              combined for "except" patterns (e.g. `":C,E:"` to select everything
                              except column D)
                            - A callable, a function that takes a column and returns a boolean
                              indicating whether the column should be used
        :param dtypes: An optional dtype (for all columns)
                       or dict of dtypes with keys as column indices or names.
        :param eager: Specifies whether the table should be loaded eagerly.
                      `False` (default) will load the table lazily using the `PyCapsule` interface,
                      whereas `True` will load it eagerly via `pyarrow`.

                      Eager loading requires the `pyarrow` extra to be installed.
        :param skip_whitespace_tail_rows: Skip rows at the end of the table
                                          containing only whitespace and null values.
        :param whitespace_as_null: Consider cells containing only whitespace as null values.
        """
        if eager:
            return self._reader.load_table(
                name=name,
                header_row=header_row,
                column_names=column_names,
                skip_rows=skip_rows,
                n_rows=n_rows,
                schema_sample_rows=schema_sample_rows,
                dtype_coercion=dtype_coercion,
                use_columns=use_columns,
                dtypes=dtypes,
                eager=True,
                skip_whitespace_tail_rows=skip_whitespace_tail_rows,
                whitespace_as_null=whitespace_as_null,
            )
        else:
            return ExcelTable(
                self._reader.load_table(
                    name=name,
                    header_row=header_row,
                    column_names=column_names,
                    skip_rows=skip_rows,
                    n_rows=n_rows,
                    schema_sample_rows=schema_sample_rows,
                    dtype_coercion=dtype_coercion,
                    use_columns=use_columns,
                    dtypes=dtypes,
                    eager=False,
                    skip_whitespace_tail_rows=skip_whitespace_tail_rows,
                    whitespace_as_null=whitespace_as_null,
                )
            )

    def load_sheet_eager(
        self,
        idx_or_name: int | str,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str] | list[int] | str | None = None,
        dtypes: DType | DTypeMap | None = None,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> pa.RecordBatch:
        """Loads a sheet eagerly by index or name.

        For xlsx files, this will be faster and more memory-efficient, as it will use
        `worksheet_range_ref` under the hood, which returns borrowed types.

        Refer to `load_sheet` for parameter documentation

        Requires the `pyarrow` extra to be installed.

        :param skip_whitespace_tail_rows: Skip rows at the end of the sheet
                                          containing only whitespace and null values.
        :param whitespace_as_null: Consider cells containing only whitespace as null values.
        """
        # Both whitespace options default to False, which is also the default of the
        # underlying reader, so existing callers keep their exact behaviour.
        return self._reader.load_sheet(
            idx_or_name=idx_or_name,
            header_row=header_row,
            column_names=column_names,
            skip_rows=skip_rows,
            n_rows=n_rows,
            schema_sample_rows=schema_sample_rows,
            dtype_coercion=dtype_coercion,
            use_columns=use_columns,
            dtypes=dtypes,
            eager=True,
            skip_whitespace_tail_rows=skip_whitespace_tail_rows,
            whitespace_as_null=whitespace_as_null,
        )

    def load_sheet_by_name(
        self,
        name: str,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
    ) -> ExcelSheet:
        """Loads a sheet by name.

        Thin wrapper that forwards every argument unchanged to `load_sheet`, with `name`
        as the sheet identifier.

        Refer to `load_sheet` for parameter documentation
        """
        # `skip_rows` is passed through untouched, so it is annotated with the same
        # union as the other sheet loaders (an int, a list of ints or a callable).
        return self.load_sheet(
            name,
            header_row=header_row,
            column_names=column_names,
            skip_rows=skip_rows,
            n_rows=n_rows,
            schema_sample_rows=schema_sample_rows,
            dtype_coercion=dtype_coercion,
            use_columns=use_columns,
            dtypes=dtypes,
        )

    def load_sheet_by_idx(
        self,
        idx: int,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
    ) -> ExcelSheet:
        """Loads a sheet by index.

        Thin wrapper that forwards every argument unchanged to `load_sheet`, with `idx`
        as the sheet identifier.

        Refer to `load_sheet` for parameter documentation
        """
        # `skip_rows` is passed through untouched, so it is annotated with the same
        # union as the other sheet loaders (an int, a list of ints or a callable).
        return self.load_sheet(
            idx,
            header_row=header_row,
            column_names=column_names,
            skip_rows=skip_rows,
            n_rows=n_rows,
            schema_sample_rows=schema_sample_rows,
            dtype_coercion=dtype_coercion,
            use_columns=use_columns,
            dtypes=dtypes,
        )

    def __repr__(self) -> str:
        return self._reader.__repr__()


def read_excel(source: Path | str | bytes) -> ExcelReader:
    """Opens and loads an excel file.

    String and `Path` sources are passed through `expanduser` before being opened;
    bytes are handed to the underlying reader as-is.

    :param source: The path to a file or its content as bytes
    """
    is_path_like = isinstance(source, (str, Path))
    location = expanduser(source) if is_path_like else source
    return ExcelReader(_read_excel(location))


# Public API of the package: the names exported by `from fastexcel import *`,
# grouped by theme.
__all__ = (
    # version
    "__version__",
    # main entrypoint
    "read_excel",
    # Python types
    "DType",
    "DTypeMap",
    # Excel reader
    "ExcelReader",
    # Excel sheet
    "ExcelSheet",
    # Excel table
    "ExcelTable",
    # Column metadata
    "DTypeFrom",
    "ColumnNameFrom",
    "ColumnInfo",
    # Defined names
    "DefinedName",
    # Parse error information
    "CellError",
    "CellErrors",
    # Exceptions
    "FastExcelError",
    "CannotRetrieveCellDataError",
    "CalamineCellError",
    "CalamineError",
    "SheetNotFoundError",
    "ColumnNotFoundError",
    "ArrowError",
    "InvalidParametersError",
    "UnsupportedColumnTypeCombinationError",
)


================================================
FILE: python/fastexcel/_fastexcel.pyi
================================================
from __future__ import annotations

import typing
from collections.abc import Callable
from typing import TYPE_CHECKING, Literal

if TYPE_CHECKING:
    import pyarrow as pa

# Names of the dtypes a column can have
DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
# Per-column dtypes, keyed by column name or column index
DTypeMap = dict[str | int, DType]
# Where a column's name comes from
ColumnNameFrom = Literal["provided", "looked_up", "generated"]
# Where a column's dtype comes from
DTypeFrom = Literal["provided_for_all", "provided_by_index", "provided_by_name", "guessed"]
# Possible visibility states of a sheet
SheetVisible = Literal["visible", "hidden", "veryhidden"]

class ColumnInfoNoDtype:
    """Column metadata without any dtype information.

    This is the argument type of the callables accepted by the `use_columns` parameter.
    """

    def __init__(
        self,
        *,
        name: str,
        index: int,
        absolute_index: int,
        column_name_from: ColumnNameFrom,
    ) -> None: ...
    @property
    def name(self) -> str: ...
    @property
    def index(self) -> int: ...
    # NOTE(review): presumably the position among all columns of the sheet, whereas
    # `index` is relative to the selection - confirm against the Rust implementation.
    @property
    def absolute_index(self) -> int: ...
    @property
    def column_name_from(self) -> ColumnNameFrom: ...

class ColumnInfo:
    """Column metadata, including the column's dtype and where that dtype comes from.

    Instances are returned by the `selected_columns` and `available_columns` members of
    sheets and tables.
    """

    def __init__(
        self,
        *,
        name: str,
        index: int,
        absolute_index: int,
        column_name_from: ColumnNameFrom,
        dtype: DType,
        dtype_from: DTypeFrom,
    ) -> None: ...
    @property
    def name(self) -> str: ...
    @property
    def index(self) -> int: ...
    # NOTE(review): presumably the position among all columns of the sheet, whereas
    # `index` is relative to the selection - confirm against the Rust implementation.
    @property
    def absolute_index(self) -> int: ...
    @property
    def dtype(self) -> DType: ...
    @property
    def column_name_from(self) -> ColumnNameFrom: ...
    @property
    def dtype_from(self) -> DTypeFrom: ...

class DefinedName:
    """A defined name of a workbook, as returned by `_ExcelReader.defined_names`.

    Pairs the name with its formula, both exposed as strings.
    """

    def __init__(
        self,
        *,
        name: str,
        formula: str,
    ) -> None: ...
    @property
    def name(self) -> str: ...
    @property
    def formula(self) -> str: ...

class CellError:
    """Information about a single cell error, collected in `CellErrors`."""

    # NOTE(review): the order of the two ints (row, column) is assumed - confirm.
    @property
    def position(self) -> tuple[int, int]: ...
    @property
    def row_offset(self) -> int: ...
    # NOTE(review): presumably `position` adjusted by `row_offset` - confirm.
    @property
    def offset_position(self) -> tuple[int, int]: ...
    @property
    def detail(self) -> str: ...
    def __repr__(self) -> str: ...

class CellErrors:
    """Collection of `CellError`s, returned alongside the data by `to_arrow_with_errors`."""

    @property
    def errors(self) -> list[CellError]: ...
    def __repr__(self) -> str: ...

class _ExcelSheet:
    """A loaded sheet: exposes its metadata, its columns and Arrow export methods"""

    @property
    def name(self) -> str:
        """The name of the sheet"""
    @property
    def width(self) -> int:
        """The sheet's width"""
    @property
    def height(self) -> int:
        """The sheet's height"""
    @property
    def total_height(self) -> int:
        """The sheet's total height"""
    @property
    def offset(self) -> int:
        """The sheet's offset before data starts"""
    @property
    def selected_columns(self) -> list[ColumnInfo]:
        """The sheet's selected columns"""
    def available_columns(self) -> list[ColumnInfo]:
        """The columns available for the given sheet"""
    @property
    def specified_dtypes(self) -> DTypeMap | None:
        """The dtypes specified for the sheet"""
    @property
    def visible(self) -> SheetVisible:
        """The visibility of the sheet"""
    def to_arrow(self) -> pa.RecordBatch:
        """Converts the sheet to a pyarrow `RecordBatch`

        Requires the `pyarrow` extra to be installed.
        """
    def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors]:
        """Converts the sheet to a pyarrow `RecordBatch` with error information.

        Stores the positions of any values that cannot be parsed as the specified type and were
        therefore converted to None.

        Requires the `pyarrow` extra to be installed.
        """
    def __arrow_c_schema__(self) -> object:
        """Export the schema as an `ArrowSchema` `PyCapsule`.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export

        The Arrow PyCapsule Interface enables zero-copy data exchange with
        Arrow-compatible libraries without requiring PyArrow as a dependency.
        """
    def __arrow_c_array__(self, requested_schema: object = None) -> tuple[object, object]:
        """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.

        The optional `requested_schema` parameter allows for potential schema conversion.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export

        The Arrow PyCapsule Interface enables zero-copy data exchange with
        Arrow-compatible libraries without requiring PyArrow as a dependency.
        """

class _ExcelTable:
    """A loaded table: exposes its metadata, its columns and Arrow export methods"""

    @property
    def name(self) -> str:
        """The name of the table"""
    @property
    def sheet_name(self) -> str:
        """The name of the sheet this table belongs to"""
    @property
    def width(self) -> int:
        """The table's width"""
    @property
    def height(self) -> int:
        """The table's height"""
    @property
    def total_height(self) -> int:
        """The table's total height"""
    @property
    def offset(self) -> int:
        """The table's offset before data starts"""
    @property
    def selected_columns(self) -> list[ColumnInfo]:
        """The table's selected columns"""
    def available_columns(self) -> list[ColumnInfo]:
        """The columns available for the given table"""
    @property
    def specified_dtypes(self) -> DTypeMap | None:
        """The dtypes specified for the table"""
    def to_arrow(self) -> pa.RecordBatch:
        """Converts the table to a pyarrow `RecordBatch`

        Requires the `pyarrow` extra to be installed.
        """
    def __arrow_c_schema__(self) -> object:
        """Export the schema as an `ArrowSchema` `PyCapsule`.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export

        The Arrow PyCapsule Interface enables zero-copy data exchange with
        Arrow-compatible libraries without requiring PyArrow as a dependency.
        """

    def __arrow_c_array__(self, requested_schema: object = None) -> tuple[object, object]:
        """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.

        The optional `requested_schema` parameter allows for potential schema conversion.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export

        The Arrow PyCapsule Interface enables zero-copy data exchange with
        Arrow-compatible libraries without requiring PyArrow as a dependency.
        """

class _ExcelReader:
    """A class representing an open Excel file and allowing to read its sheets"""

    @typing.overload
    def load_sheet(
        self,
        idx_or_name: str | int,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: Literal[False] = ...,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> _ExcelSheet: ...
    @typing.overload
    def load_sheet(
        self,
        idx_or_name: str | int,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: Literal[True] = ...,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> pa.RecordBatch: ...
    @typing.overload
    def load_sheet(
        self,
        idx_or_name: str | int,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: bool = False,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> pa.RecordBatch: ...
    @typing.overload
    def load_table(
        self,
        name: str,
        *,
        header_row: int | None = None,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: Literal[False] = ...,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> _ExcelTable: ...
    @typing.overload
    def load_table(
        self,
        name: str,
        *,
        header_row: int | None = None,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: Literal[True] = ...,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> pa.RecordBatch: ...
    @property
    def sheet_names(self) -> list[str]: ...
    def table_names(self, sheet_name: str | None = None) -> list[str]: ...
    def defined_names(self) -> list[DefinedName]: ...

def read_excel(source: str | bytes) -> _ExcelReader:
    """Reads an excel file, given as a `str` or as `bytes`, and returns an `_ExcelReader`"""

# Version string exposed by the extension module
__version__: str

# Exceptions
# `FastExcelError` is the common base class: every exception below derives from it,
# so catching it catches all of them.
class FastExcelError(Exception): ...
class UnsupportedColumnTypeCombinationError(FastExcelError): ...
class CannotRetrieveCellDataError(FastExcelError): ...
class CalamineCellError(FastExcelError): ...
class CalamineError(FastExcelError): ...
class SheetNotFoundError(FastExcelError): ...
class ColumnNotFoundError(FastExcelError): ...
class ArrowError(FastExcelError): ...
class InvalidParametersError(FastExcelError): ...


================================================
FILE: python/fastexcel/py.typed
================================================


================================================
FILE: python/tests/__init__.py
================================================


================================================
FILE: python/tests/benchmarks/README.md
================================================
# Benchmarks

These benchmarks were generated using `pytest-benchmark`.

> **_NOTE:_**  formulas.xlsx was found [here](https://foss.heptapod.net/openpyxl/openpyxl/-/issues/494). plain_data.xls and plain_data.xlsx can be found [here](https://public.opendatasoft.com/explore/dataset/covid-19-pandemic-worldwide-data/export/?disjunctive.zone&disjunctive.category).

Using the following command:

```bash
make benchmarks
```

The results are from my local machine. This is not 100% accurate.

## Speed
### 'xls': 2 tests
|Name (time in ms)|Min|Max|Mean|StdDev|Median|IQR|Outliers|OPS|Rounds|Iterations|
|-----------------|---|---|----|------|------|---|-------|---|-------|----------|
|test_fastexcel_xls|27.0991 (1.0)|33.7495 (1.0)|29.5819 (1.0)|1.6429 (1.0)|29.3559 (1.0)|2.7158 (1.0)|10;0|33.8044 (1.0)|29|1|
|test_xlrd|596.5040 (22.01)|628.7964 (18.63)|612.5730 (20.71)|12.9967 (7.91)|615.1620 (20.96)|20.7911 (7.66)|2;0|1.6325 (0.05)|5|1|


### 'xlsx': 4 tests
|Name (time in ms)|Min|Max|Mean|StdDev|Median|IQR|Outliers|OPS|Rounds|Iterations|
|-----------------|---|---|----|------|------|---|--------|---|------|----------|
|test_fastexcel_xlsx|437.5810 (1.0)|470.7615 (1.0)|457.9611 (1.0)|13.7401 (1.0)|457.7006 (1.0)|21.0743 (1.25)|1;0|2.1836 (1.0)|5|1|
|test_fastexcel_with_formulas|3,106.7454 (7.10)|3,150.2050 (6.69)|3,122.5234 (6.82)|16.6031 (1.21)|3,120.9000 (6.82)|16.8614 (1.0)|1;0|0.3203 (0.15)|5|1|
|test_pyxl|4,780.2341 (10.92)|4,998.7753 (10.62)|4,899.6885 (10.70)|110.4665 (8.04)|4,948.7550 (10.81)|211.6149 (12.55)|2;0|0.2041 (0.09)|5|1|
|test_pyxl_with_formulas|25,312.8494 (57.85)|26,621.4687 (56.55)|25,808.5418 (56.36)|545.0540 (39.67)|25,748.0901 (56.26)|852.3171 (50.55)|1;0|0.0387 (0.02)|5|1|


## Memory usage

| fastexcel memory usage | other memory usage |
|-|-|
|![fastexcel xls](memory_profiles/test_xls_fastexcel.png "fastexcel xls") |![xlrd xls](memory_profiles/test_xls_xlrd.png "xlrd xls")|
|![fastexcel xlsx](memory_profiles/test_xlsx_fastexcel.png "fastexcel xlsx") |![pyxl xlsx](memory_profiles/test_xlsx_openpyxl.png "pyxl xlsx")|
|![fastexcel formulas xlsx](memory_profiles/test_xlsx_formulas_fastexcel.png "fastexcel formulas xlsx") |![pyxl formulas xlsx](memory_profiles/test_xlsx_formulas_openpyxl.png "pyxl formulas xlsx")|


================================================
FILE: python/tests/benchmarks/fixtures/formulas.xlsx
================================================
[File too large to display: 46.5 MB]

================================================
FILE: python/tests/benchmarks/memory.py
================================================
import argparse
from enum import Enum

from .readers import fastexcel_read, pyxl_read, xlrd_read


class Engine(str, Enum):
    """Reader engines that the memory benchmark can run.

    Members are also `str` instances, so they compare equal to their plain string value.
    """

    FASTEXCEL = "fastexcel"
    XLRD = "xlrd"
    OPENPYXL = "pyxl"


def get_args() -> argparse.Namespace:
    """Parses the command-line arguments of the memory benchmark.

    :return: A namespace with `engine` (an `Engine` member, `Engine.FASTEXCEL` by default)
             and `file` (the positional path of the workbook to read).
    """
    parser = argparse.ArgumentParser()
    # `type=Engine` converts the raw string to an `Engine` member and makes argparse
    # reject unknown engines, instead of letting a typo silently run nothing.
    parser.add_argument("-e", "--engine", type=Engine, default=Engine.FASTEXCEL)
    parser.add_argument("file")
    return parser.parse_args()


def main():
    """Reads the file given on the command line with the selected engine."""
    args = get_args()
    selected = args.engine

    # Pair each engine with its reader; the first engine equal to the selection wins
    readers_by_engine = (
        (Engine.FASTEXCEL, fastexcel_read),
        (Engine.XLRD, xlrd_read),
        (Engine.OPENPYXL, pyxl_read),
    )
    for candidate, read in readers_by_engine:
        if selected == candidate:
            read(args.file)
            break


if __name__ == "__main__":
    main()


================================================
FILE: python/tests/benchmarks/readers.py
================================================
from fastexcel import read_excel
from openpyxl import load_workbook
from xlrd import open_workbook


def pyxl_read(test_file_path: str):
    """Reads every cell value of every worksheet of a workbook with openpyxl.

    The workbook is opened in read-only mode and the values are iterated only to force
    them to be read; nothing is returned.

    :param test_file_path: Path of the workbook to read
    """
    wb = load_workbook(test_file_path, read_only=True, keep_links=False, data_only=True)
    try:
        for ws in wb:
            for row in ws.values:
                for _ in row:
                    pass
    finally:
        # A read-only workbook keeps its file open until it is closed explicitly
        wb.close()


def xlrd_read(test_file_path: str):
    """Reads every row of every sheet of a workbook with xlrd.

    Values are iterated only to force them to be read; nothing is returned.
    """
    workbook = open_workbook(test_file_path)
    for sheet in workbook.sheets():
        row_count = sheet.nrows
        for row_idx in range(row_count):
            row_values = sheet.row_values(row_idx)
            for _value in row_values:
                pass


def fastexcel_read(test_file_path: str):
    """Loads every sheet of a workbook with fastexcel and converts each one to arrow."""
    workbook = read_excel(test_file_path)
    for name in workbook.sheet_names:
        workbook.load_sheet_by_name(name).to_arrow()


================================================
FILE: python/tests/benchmarks/speed.py
================================================
"""
Compare read performance with fastexcel, xlrd and different openpyxl options
"""

import pytest

from .readers import fastexcel_read, pyxl_read, xlrd_read


@pytest.fixture
def plain_data_xls():
    """Relative path of the plain-data `.xls` benchmark workbook."""
    fixture_path = "./python/tests/benchmarks/fixtures/plain_data.xls"
    return fixture_path


@pytest.fixture
def plain_data_xlsx():
    """Relative path of the plain-data `.xlsx` benchmark workbook."""
    fixture_path = "./python/tests/benchmarks/fixtures/plain_data.xlsx"
    return fixture_path


@pytest.fixture
def formula_xlsx():
    """Relative path of the formulas `.xlsx` benchmark workbook."""
    fixture_path = "./python/tests/benchmarks/fixtures/formulas.xlsx"
    return fixture_path


@pytest.mark.benchmark(group="xlsx")
def test_pyxl(benchmark, plain_data_xlsx):
    """Benchmarks `pyxl_read` on the plain-data xlsx workbook."""
    benchmark(pyxl_read, plain_data_xlsx)


@pytest.mark.benchmark(group="xls")
def test_xlrd(benchmark, plain_data_xls):
    """Benchmarks `xlrd_read` on the plain-data xls workbook."""
    benchmark(xlrd_read, plain_data_xls)


@pytest.mark.benchmark(group="xls")
def test_fastexcel_xls(benchmark, plain_data_xls):
    """Benchmarks `fastexcel_read` on the plain-data xls workbook."""
    benchmark(fastexcel_read, plain_data_xls)


@pytest.mark.benchmark(group="xlsx")
def test_fastexcel_xlsx(benchmark, plain_data_xlsx):
    """Benchmarks `fastexcel_read` on the plain-data xlsx workbook."""
    benchmark(fastexcel_read, plain_data_xlsx)


@pytest.mark.benchmark(group="xlsx")
def test_pyxl_with_formulas(benchmark, formula_xlsx):
    """Benchmarks `pyxl_read` on the formulas xlsx workbook."""
    benchmark(pyxl_read, formula_xlsx)


@pytest.mark.benchmark(group="xlsx")
def test_fastexcel_with_formulas(benchmark, formula_xlsx):
    """Benchmarks `fastexcel_read` on the formulas xlsx workbook."""
    benchmark(fastexcel_read, formula_xlsx)


================================================
FILE: python/tests/conftest.py
================================================
from __future__ import annotations

from datetime import datetime
from typing import Any

import pytest


@pytest.fixture
def expected_data_sheet_null_strings() -> dict[str, list[Any]]:
    """Expected contents of a ten-row sheet, as a mapping of column name to column values.

    Missing cells are represented by `None`.
    """
    return {
        "FIRST_LABEL": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
        "SECOND_LABEL": ["AA", "BB", "CC", "DD", "EE", "FF", "GG", "HH", "II", "JJ"],
        "DATES_AND_NULLS": [
            None,
            None,
            None,
            datetime(2022, 12, 19, 0, 0),
            datetime(2022, 8, 26, 0, 0),
            datetime(2023, 5, 6, 0, 0),
            datetime(2023, 3, 20, 0, 0),
            datetime(2022, 8, 29, 0, 0),
            None,
            None,
        ],
        "TIMESTAMPS_AND_NULLS": [
            None,
            None,
            datetime(2023, 2, 18, 6, 13, 56, 730000),
            datetime(2022, 9, 20, 20, 0, 7, 50000),
            datetime(2022, 9, 24, 17, 4, 31, 236000),
            None,
            None,
            None,
            datetime(2022, 9, 14, 1, 50, 58, 390000),
            datetime(2022, 10, 21, 17, 20, 12, 223000),
        ],
        # Integer-looking values are expected as floats
        "INTS_AND_NULLS": [
            2076.0,
            2285.0,
            39323.0,
            None,
            None,
            None,
            11953.0,
            None,
            30192.0,
            None,
        ],
        "FLOATS_AND_NULLS": [
            141.02023312814603,
            778.0655928608671,
            None,
            497.60307287584106,
            627.446112513911,
            None,
            None,
            None,
            488.3509486743364,
            None,
        ],
    }


================================================
FILE: python/tests/test_alias_generation.py
================================================
from __future__ import annotations

import fastexcel
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal

from .utils import path_for_fixture


@pytest.mark.parametrize(
    "use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"], [0, "col_1", 2]]
)
def test_alias_generation_with_use_columns(use_columns: list[str] | list[int] | None) -> None:
    """Duplicated column names get aliases, whichever way the columns are selected."""
    fixture_path = path_for_fixture("fixture-single-sheet-duplicated-columns.xlsx")
    reader = fastexcel.read_excel(fixture_path)

    sheet = reader.load_sheet(0, use_columns=use_columns)
    available_names = [column.name for column in sheet.available_columns()]
    assert available_names == ["col", "col_1", "col_2"]

    timestamps = ["2019-02-01 00:01:02", "2014-01-02 06:01:02"]

    expected_pd_df = pd.DataFrame(
        {
            "col": [1.0, 2.0],
            "col_1": [2019.0, 2020.0],
            "col_2": pd.Series([pd.Timestamp(ts) for ts in timestamps]).astype(
                "datetime64[ms]"
            ),
        }
    )
    pd_assert_frame_equal(sheet.to_pandas(), expected_pd_df)

    # The polars frame starts from strings, which are then parsed to millisecond datetimes
    to_datetime_ms = pl.col("col_2").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    expected_pl_df = pl.DataFrame(
        {
            "col": [1.0, 2.0],
            "col_1": [2019.0, 2020.0],
            "col_2": timestamps,
        }
    ).with_columns(to_datetime_ms)
    pl_assert_frame_equal(sheet.to_polars(), expected_pl_df)


================================================
FILE: python/tests/test_column_selection.py
================================================
# ruff: noqa: E501
from __future__ import annotations

import re
from typing import Any

import fastexcel
import numpy as np
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal

from .utils import path_for_fixture


@pytest.fixture
def excel_reader_single_sheet() -> fastexcel.ExcelReader:
    """Reader opened on the `fixture-single-sheet.xlsx` workbook."""
    workbook_path = path_for_fixture("fixture-single-sheet.xlsx")
    return fastexcel.read_excel(workbook_path)


@pytest.fixture
def expected_column_info() -> list[fastexcel.ColumnInfo]:
    """Expected metadata of the `Month` and `Year` columns, in that order."""
    # Both columns share everything but their name and position
    return [
        fastexcel.ColumnInfo(
            name=column_name,
            index=position,
            absolute_index=position,
            column_name_from="looked_up",
            dtype="float",
            dtype_from="guessed",
        )
        for position, column_name in enumerate(("Month", "Year"))
    ]


def test_single_sheet_all_columns(
    excel_reader_single_sheet: fastexcel.ExcelReader,
    expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """Omitting `use_columns` and passing `use_columns=None` both select every column."""
    sheet = excel_reader_single_sheet.load_sheet(0)
    sheet_explicit_arg = excel_reader_single_sheet.load_sheet(0, use_columns=None)

    assert sheet.selected_columns == expected_column_info
    assert sheet.available_columns() == expected_column_info

    expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}
    expected_pd_df = pd.DataFrame(expected)
    expected_pl_df = pl.DataFrame(expected)
    loaded_sheets = (sheet, sheet_explicit_arg)

    for loaded in loaded_sheets:
        pd_assert_frame_equal(loaded.to_pandas(), expected_pd_df)

    for loaded in loaded_sheets:
        pl_assert_frame_equal(loaded.to_polars(), expected_pl_df)


def test_single_sheet_subset_by_str(
    excel_reader_single_sheet: fastexcel.ExcelReader,
    expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """Selecting one column by name yields only that column, for each way to name the sheet."""
    expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}
    column_names = ["Month", "Year"]

    # The explicit annotation is needed for mypy
    sheets: list[str | int] = [0, "January"]
    for sheet_ref in sheets:
        for position, column_name in enumerate(column_names):
            sheet = excel_reader_single_sheet.load_sheet(sheet_ref, use_columns=[column_name])
            assert sheet.selected_columns == [expected_column_info[position]]
            assert sheet.available_columns() == expected_column_info

            expected_subset = {column_name: expected[column_name]}
            pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_subset))
            pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_subset))


def test_single_sheet_subset_by_index(
    excel_reader_single_sheet: fastexcel.ExcelReader,
    expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """Selecting one column by index yields only that column, for each way to name the sheet."""
    expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}
    column_names = ["Month", "Year"]

    sheets: list[str | int] = [0, "January"]
    for sheet_ref in sheets:
        for position, column_name in enumerate(column_names):
            sheet = excel_reader_single_sheet.load_sheet(sheet_ref, use_columns=[position])
            assert sheet.selected_columns == [expected_column_info[position]]
            assert sheet.available_columns() == expected_column_info

            expected_subset = {column_name: expected[column_name]}
            pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_subset))
            pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_subset))


@pytest.fixture
def excel_reader_single_sheet_with_unnamed_columns() -> fastexcel.ExcelReader:
    """Reader opened on the `fixture-multi-sheet.xlsx` workbook."""
    workbook_path = path_for_fixture("fixture-multi-sheet.xlsx")
    return fastexcel.read_excel(workbook_path)


@pytest.fixture
def single_sheet_with_unnamed_columns_expected() -> dict[str, list[Any]]:
    """Expected column values keyed by column name, including generated `__UNNAMED__` names."""
    column_names = ("col1", "__UNNAMED__1", "col3", "__UNNAMED__3", "col5")
    column_values: tuple[list[Any], ...] = (
        [2.0, 3.0],
        [1.5, 2.5],
        ["hello", "world"],
        [-5.0, -6.0],
        ["a", "b"],
    )
    # `zip` keeps the pairs in order, so the dict has the same key order as the columns
    return dict(zip(column_names, column_values))


@pytest.fixture
def sheet_with_unnamed_columns_expected_column_info() -> list[fastexcel.ColumnInfo]:
    """Expected metadata of the five columns of the sheet with unnamed columns."""
    # (name, origin of the name, dtype) for each column, in column order
    column_specs = (
        ("col1", "looked_up", "float"),
        ("__UNNAMED__1", "generated", "float"),
        ("col3", "looked_up", "string"),
        ("__UNNAMED__3", "generated", "float"),
        ("col5", "looked_up", "string"),
    )
    return [
        fastexcel.ColumnInfo(
            name=column_name,
            index=position,
            absolute_index=position,
            column_name_from=name_origin,
            dtype=dtype,
            dtype_from="guessed",
        )
        for position, (column_name, name_origin, dtype) in enumerate(column_specs)
    ]


def test_single_sheet_with_unnamed_columns(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """Name-based and index-based selections of the same columns must behave identically."""
    reader = excel_reader_single_sheet_with_unnamed_columns
    all_columns = sheet_with_unnamed_columns_expected_column_info
    names = ["col1", "col3", "__UNNAMED__3"]
    positions = [0, 2, 3]

    wanted_columns = [all_columns[0], all_columns[2], all_columns[3]]
    wanted_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in names
    }

    def check(sheet: fastexcel.ExcelSheet) -> None:
        # Selection metadata first, then the materialized frames
        assert sheet.selected_columns == wanted_columns
        assert sheet.available_columns() == all_columns

        pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(wanted_data))
        pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(wanted_data))

    check(reader.load_sheet("With unnamed columns", use_columns=names))
    check(reader.load_sheet("With unnamed columns", use_columns=positions))


def test_single_sheet_with_unnamed_columns_and_pagination(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """Pagination via ``n_rows``/``skip_rows`` works for both name and index selection."""
    reader = excel_reader_single_sheet_with_unnamed_columns
    all_columns = sheet_with_unnamed_columns_expected_column_info
    names = ["col1", "col3", "__UNNAMED__3"]
    positions = [0, 2, 3]

    def check(sheet: fastexcel.ExcelSheet, page: dict[str, list[Any]]) -> None:
        assert sheet.available_columns() == all_columns

        pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(page))
        pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(page))

    # first row only
    first_row = {
        name: values[:1]
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in names
    }
    check(reader.load_sheet("With unnamed columns", use_columns=names, n_rows=1), first_row)
    check(reader.load_sheet("With unnamed columns", use_columns=positions, n_rows=1), first_row)

    # second row
    second_row = {
        name: values[1:]
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in names
    }
    check(reader.load_sheet("With unnamed columns", use_columns=names, skip_rows=1), second_row)
    check(
        reader.load_sheet("With unnamed columns", use_columns=positions, skip_rows=1),
        second_row,
    )


def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """With ``column_names``, only integer selectors are accepted and names follow selection.

    String selectors must raise ``InvalidParametersError``; index selectors receive the provided
    names in order, while the remaining columns get generated names.
    """
    reader = excel_reader_single_sheet_with_unnamed_columns
    names = ["col0", "col2", "col3"]
    positions = [0, 2, 3]
    column_names = [f"col{i}" for i in range(3)]
    all_names = ["col0", "__UNNAMED__1", "col1", "col2", "__UNNAMED__4"]
    full_data: dict[str, list[Any]] = {
        "col0": [2.0, 3.0],
        "col1": ["hello", "world"],
        "col2": [-5.0, -6.0],
    }

    def check(sheet: fastexcel.ExcelSheet, data: dict[str, list[Any]]) -> None:
        assert [column.name for column in sheet.available_columns()] == all_names

        pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(data))
        pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(data))

    # skipping the header row only
    with pytest.raises(
        fastexcel.InvalidParametersError,
        match='use_columns can only contain integers when used with columns_names, got "col0"',
    ):
        reader.load_sheet(
            "With unnamed columns",
            use_columns=names,
            skip_rows=1,
            column_names=column_names,
        )

    check(
        reader.load_sheet(
            "With unnamed columns", use_columns=positions, skip_rows=1, column_names=column_names
        ),
        full_data,
    )

    # skipping the header row + first data row
    data_without_first_row = {name: values[1:] for name, values in full_data.items()}
    check(
        reader.load_sheet(
            "With unnamed columns", use_columns=positions, skip_rows=2, column_names=column_names
        ),
        data_without_first_row,
    )


def test_single_sheet_with_unnamed_columns_and_str_range(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """The ``"A,C:E"`` selector keeps the first column and the third through fifth columns."""
    all_columns = sheet_with_unnamed_columns_expected_column_info
    kept_names = ("col1", "col3", "__UNNAMED__3", "col5")
    wanted_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in kept_names
    }

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns="A,C:E"
    )

    # Everything except the second column (B)
    assert sheet.selected_columns == all_columns[:1] + all_columns[2:]
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(wanted_data))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(wanted_data))


def test_single_sheet_with_unnamed_columns_and_open_ended_range(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """The open-ended ``"B:"`` selector keeps columns B, C, D, E (indices 1, 2, 3, 4)."""
    all_columns = sheet_with_unnamed_columns_expected_column_info
    kept_names = ("__UNNAMED__1", "col3", "__UNNAMED__3", "col5")
    wanted_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in kept_names
    }

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns="B:"
    )

    assert sheet.selected_columns == all_columns[1:]
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(wanted_data))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(wanted_data))


def test_single_sheet_with_unnamed_columns_and_open_ended_range_from_start(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """The open-ended ``"A:"`` selector keeps every column of the sheet."""
    all_columns = sheet_with_unnamed_columns_expected_column_info
    wanted_data = single_sheet_with_unnamed_columns_expected

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns="A:"
    )

    # Selected and available columns coincide when the range starts at A
    assert sheet.selected_columns == all_columns
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(wanted_data))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(wanted_data))


def test_single_sheet_with_unnamed_columns_and_mixed_open_ended_range(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """``"A,C:"`` keeps column A plus everything from C onwards (indices 0, 2, 3, 4)."""
    all_columns = sheet_with_unnamed_columns_expected_column_info
    kept_names = ("col1", "col3", "__UNNAMED__3", "col5")
    wanted_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in kept_names
    }
    wanted_columns = all_columns[:1] + all_columns[2:]

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns="A,C:"
    )

    assert sheet.selected_columns == wanted_columns
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(wanted_data))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(wanted_data))


def test_single_sheet_with_unnamed_columns_and_from_beginning_range(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """The ``":C"`` selector keeps columns A, B, C (indices 0, 1, 2)."""
    all_columns = sheet_with_unnamed_columns_expected_column_info
    kept_names = ("col1", "__UNNAMED__1", "col3")
    wanted_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in kept_names
    }

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns=":C"
    )

    assert sheet.selected_columns == all_columns[:3]
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(wanted_data))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(wanted_data))


def test_single_sheet_with_unnamed_columns_and_from_beginning_range_single_column(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """The ``":A"`` selector keeps only column A (index 0)."""
    all_columns = sheet_with_unnamed_columns_expected_column_info
    wanted_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name == "col1"
    }

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns=":A"
    )

    assert sheet.selected_columns == all_columns[:1]
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(wanted_data))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(wanted_data))


def test_single_sheet_with_unnamed_columns_and_complex_mixed_pattern(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """``"A,:B,D,E:"`` mixes single columns and ranges; overlapping parts are deduplicated.

    The pattern expands to A, then A and B (from ``:B``), then D, then E (from ``E:``), which is
    expected to end up as columns 0, 1, 3 and 4.
    """
    all_columns = sheet_with_unnamed_columns_expected_column_info
    kept_names = ("col1", "__UNNAMED__1", "__UNNAMED__3", "col5")
    wanted_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in kept_names
    }
    # A, B, D, E
    wanted_columns = [all_columns[position] for position in (0, 1, 3, 4)]

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns="A,:B,D,E:"
    )

    assert sheet.selected_columns == wanted_columns
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(wanted_data))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(wanted_data))


def test_single_sheet_invalid_column_indices_negative_integer(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """A negative column index is rejected with a detailed ``InvalidParametersError``."""
    # Matched literally (escaped below), including the trailing newline
    expected_message = (
        "invalid parameters: expected list[int] | list[str], got [-2]\n"
        "Context:\n"
        "    0: could not determine selected columns from provided object: [-2]\n"
        "    1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([-2])\n"
    )
    reader = excel_reader_single_sheet_with_unnamed_columns
    with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)):
        reader.load_sheet(0, use_columns=[-2])


def test_single_sheet_invalid_column_indices_empty_list(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """An empty ``use_columns`` list is rejected with a detailed ``InvalidParametersError``."""
    # Matched literally (escaped below), including the trailing newline
    expected_message = (
        "invalid parameters: list of selected columns is empty\n"
        "Context:\n"
        "    0: could not determine selected columns from provided object: []\n"
        "    1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([])\n"
    )
    reader = excel_reader_single_sheet_with_unnamed_columns
    with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)):
        reader.load_sheet(0, use_columns=[])


def test_single_sheet_invalid_column_indices_column_does_not_exist_str(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """Selecting an unknown column name raises ``ColumnNotFoundError``."""
    # Used as a regex: ``.*`` stands for the list of available columns
    expected_message = (
        'column with name "nope" not found\n'
        "Context:\n"
        "    0: available columns are: .*\n"
    )
    reader = excel_reader_single_sheet_with_unnamed_columns
    with pytest.raises(fastexcel.ColumnNotFoundError, match=expected_message):
        reader.load_sheet(0, use_columns=["nope"])


def test_single_sheet_invalid_column_indices_column_does_not_exist_int(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """Selecting an out-of-range column index raises ``ColumnNotFoundError``."""
    # Used as a regex: ``.*`` stands for the list of available columns
    expected_message = (
        "column at index 42 not found\n"
        "Context:\n"
        "    0: available columns are: .*\n"
    )
    reader = excel_reader_single_sheet_with_unnamed_columns
    with pytest.raises(fastexcel.ColumnNotFoundError, match=expected_message):
        reader.load_sheet(0, use_columns=[42])


def test_use_columns_with_column_names() -> None:
    """Provided column names are applied to the index-selected columns.

    The columns that are not selected are expected to show up in ``available_columns()`` with
    generated ``__UNNAMED__<index>`` names.
    """
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))

    sheet = reader.load_sheet(
        0,
        use_columns=[1, 2],
        header_row=None,
        skip_rows=1,
        column_names=["bools_renamed", "dates_renamed"],
    )

    leading_unnamed = fastexcel.ColumnInfo(
        name="__UNNAMED__0",
        index=0,
        absolute_index=0,
        column_name_from="generated",
        dtype="float",
        dtype_from="guessed",
    )
    renamed_bools = fastexcel.ColumnInfo(
        name="bools_renamed",
        index=1,
        absolute_index=1,
        column_name_from="provided",
        dtype="boolean",
        dtype_from="guessed",
    )
    renamed_dates = fastexcel.ColumnInfo(
        name="dates_renamed",
        index=2,
        absolute_index=2,
        column_name_from="provided",
        dtype="datetime",
        dtype_from="guessed",
    )
    trailing_unnamed = fastexcel.ColumnInfo(
        name="__UNNAMED__3",
        index=3,
        absolute_index=3,
        column_name_from="generated",
        dtype="float",
        dtype_from="guessed",
    )
    assert sheet.available_columns() == [
        leading_unnamed,
        renamed_bools,
        renamed_dates,
        trailing_unnamed,
    ]

    # pandas: timestamps are built directly and cast to millisecond resolution
    expected_pd_dates = pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype(
        "datetime64[ms]"
    )
    expected_pd_df = pd.DataFrame(
        {
            "bools_renamed": [True, False, True],
            "dates_renamed": expected_pd_dates,
        }
    )
    pd_assert_frame_equal(sheet.to_pandas(), expected_pd_df)

    # polars: timestamps are parsed from text, then cast to millisecond resolution
    parse_dates = pl.col("dates_renamed").str.strptime(pl.Datetime, "%F %T")
    expected_pl_df = pl.DataFrame(
        {
            "bools_renamed": [True, False, True],
            "dates_renamed": ["2022-03-02 05:43:04"] * 3,
        }
    ).with_columns(parse_dates.dt.cast_time_unit("ms"))
    pl_assert_frame_equal(sheet.to_polars(), expected_pl_df)


def test_use_columns_with_callable() -> None:
    """A ``use_columns`` callable selects exactly the columns for which it returns ``True``."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

    def names_and_dtypes(columns: list[fastexcel.ColumnInfo]) -> list[tuple[str, str]]:
        return [(column.name, column.dtype) for column in columns]

    def name_starts_with_col(col: fastexcel.ColumnInfo) -> bool:
        return col.name.startswith("col")

    def has_odd_index(col: fastexcel.ColumnInfo) -> bool:
        return col.index % 2 == 1

    every_column = [
        ("col1", "float"),
        ("__UNNAMED__1", "float"),
        ("col3", "string"),
        ("__UNNAMED__3", "float"),
        ("col5", "string"),
    ]

    # Without a selector, available and selected columns are the same
    sheet = reader.load_sheet(2)
    assert names_and_dtypes(sheet.available_columns()) == every_column
    assert names_and_dtypes(sheet.selected_columns) == every_column

    sheet = reader.load_sheet(2, use_columns=name_starts_with_col)
    assert names_and_dtypes(sheet.selected_columns) == [
        ("col1", "float"),
        ("col3", "string"),
        ("col5", "string"),
    ]

    sheet = reader.load_sheet(2, use_columns=has_odd_index)
    assert names_and_dtypes(sheet.selected_columns) == [
        ("__UNNAMED__1", "float"),
        ("__UNNAMED__3", "float"),
    ]


def test_use_columns_with_bad_callable() -> None:
    """Callables with a wrong signature or a non-boolean result are rejected."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

    # Wrong arity: the selector takes no argument
    uncallable_error = re.escape("`use_columns` callable could not be called (TypeError: ")
    with pytest.raises(fastexcel.InvalidParametersError, match=uncallable_error):
        reader.load_sheet(
            2,
            use_columns=lambda: True,  # type: ignore
        )

    # Wrong return type: the selector returns an int instead of a bool
    non_boolean_error = "`use_columns` callable should return a boolean"
    with pytest.raises(fastexcel.InvalidParametersError, match=non_boolean_error):
        reader.load_sheet(
            2,
            use_columns=lambda _: 42,  # type: ignore
        )


def test_use_columns_with_eager_loading() -> None:
    """Eager loading honours ``use_columns`` for both column order and column subsetting."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    months = [1.0, 2.0]
    years = [2019.0, 2020.0]

    # default
    batch = reader.load_sheet_eager(0)
    assert batch.schema.names == ["Month", "Year"]
    assert batch["Year"].tolist() == years
    assert batch["Month"].tolist() == months

    # changing order
    batch = reader.load_sheet_eager(0, use_columns=["Year", "Month"])
    assert batch.schema.names == ["Year", "Month"]
    assert batch["Year"].tolist() == years
    assert batch["Month"].tolist() == months

    # subset
    batch = reader.load_sheet_eager(0, use_columns=["Year"])
    assert batch.schema.names == ["Year"]
    assert batch["Year"].tolist() == years
    assert all(field.name != "Month" for field in batch.schema)


@pytest.mark.parametrize("excel_file", ["sheet-null-strings.xlsx", "sheet-null-strings-empty.xlsx"])
def test_use_columns_dtypes_eager_loading(
    excel_file: str, expected_data_sheet_null_strings: dict[str, list[Any]]
) -> None:
    """Eager and lazy loading agree on data and column order for several column selections.

    The selections cover all columns, every other column (odd and even positions), and the same
    three variants over the reversed column order.
    """
    expected_pl_df = pl.DataFrame(expected_data_sheet_null_strings).with_columns(
        pl.col("DATES_AND_NULLS").dt.cast_time_unit("ms"),
        pl.col("TIMESTAMPS_AND_NULLS").dt.cast_time_unit("ms"),
    )
    expected_pd_df = pd.DataFrame(expected_data_sheet_null_strings)
    for temporal_column in ("DATES_AND_NULLS", "TIMESTAMPS_AND_NULLS"):
        expected_pd_df[temporal_column] = expected_pd_df[temporal_column].dt.as_unit("ms")

    forward = list(expected_data_sheet_null_strings.keys())
    backward = forward[::-1]
    selections = [
        forward,
        forward[1::2],
        forward[::2],
        backward,
        backward[1::2],
        backward[::2],
    ]

    for use_columns in selections:
        reader = fastexcel.read_excel(path_for_fixture(excel_file))

        eager_batch = reader.load_sheet_eager(0, use_columns=use_columns)
        pd_df = eager_batch.to_pandas()
        pl_df = pl.from_arrow(data=eager_batch)
        assert isinstance(pl_df, pl.DataFrame)

        lazy_sheet = reader.load_sheet(0, use_columns=use_columns)
        pl_df_lazy = lazy_sheet.to_polars()
        pd_df_lazy = lazy_sheet.to_pandas()

        # Lazy and eager paths must produce the same frames
        pl_assert_frame_equal(pl_df_lazy, pl_df)
        pd_assert_frame_equal(pd_df_lazy, pd_df)

        # ... and both must match the expected data restricted to the selection
        pl_assert_frame_equal(expected_pl_df.select(use_columns), pl_df)
        pd_assert_frame_equal(expected_pd_df[use_columns], pd_df)

        assert pd_df.columns.to_list() == use_columns
        assert pl_df.columns == use_columns


def test_use_columns_with_table() -> None:
    """Name-based ``use_columns`` on a table selects those columns and leaves the rest available."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))

    table = reader.load_table("users", use_columns=["User Id", "FirstName"])

    user_id = fastexcel.ColumnInfo(
        name="User Id",
        column_name_from="provided",
        index=0,
        absolute_index=0,
        dtype="float",
        dtype_from="guessed",
    )
    first_name = fastexcel.ColumnInfo(
        name="FirstName",
        column_name_from="provided",
        index=1,
        absolute_index=1,
        dtype="string",
        dtype_from="guessed",
    )
    unnamed_third = fastexcel.ColumnInfo(
        name="__UNNAMED__2",
        column_name_from="generated",
        index=2,
        absolute_index=2,
        dtype="string",
        dtype_from="guessed",
    )
    unnamed_fourth = fastexcel.ColumnInfo(
        name="__UNNAMED__3",
        column_name_from="generated",
        index=3,
        absolute_index=3,
        dtype="datetime",
        dtype_from="guessed",
    )

    assert table.available_columns() == [user_id, first_name, unnamed_third, unnamed_fourth]
    assert table.selected_columns == [user_id, first_name]

    expected_data = {"User Id": [1.0, 2.0, 5.0], "FirstName": ["Peter", "John", "Hans"]}

    pl_assert_frame_equal(table.to_polars(), pl.DataFrame(expected_data))
    pd_assert_frame_equal(table.to_pandas(), pd.DataFrame(expected_data))


def test_use_columns_with_table_and_provided_columns() -> None:
    """Index-based ``use_columns`` combined with ``column_names`` renames the selected columns."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))

    table = reader.load_table("users", use_columns=[0, 2], column_names=["user_id", "last_name"])

    user_id = fastexcel.ColumnInfo(
        name="user_id",
        column_name_from="provided",
        index=0,
        absolute_index=0,
        dtype="float",
        dtype_from="guessed",
    )
    unnamed_second = fastexcel.ColumnInfo(
        name="__UNNAMED__1",
        column_name_from="generated",
        index=1,
        absolute_index=1,
        dtype="string",
        dtype_from="guessed",
    )
    last_name = fastexcel.ColumnInfo(
        name="last_name",
        column_name_from="provided",
        index=2,
        absolute_index=2,
        dtype="string",
        dtype_from="guessed",
    )
    unnamed_fourth = fastexcel.ColumnInfo(
        name="__UNNAMED__3",
        column_name_from="generated",
        index=3,
        absolute_index=3,
        dtype="datetime",
        dtype_from="guessed",
    )

    assert table.available_columns() == [user_id, unnamed_second, last_name, unnamed_fourth]
    assert table.selected_columns == [user_id, last_name]

    expected_data = {"user_id": [1.0, 2.0, 5.0], "last_name": ["Müller", "Meier", "Fricker"]}

    pl_assert_frame_equal(table.to_polars(), pl.DataFrame(expected_data))
    pd_assert_frame_equal(table.to_pandas(), pd.DataFrame(expected_data))


def test_use_column_range_with_offset_without_table() -> None:
    """The ``"H:I"`` range selects the expected columns on a sheet whose header is on row 9."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
    sheet = reader.load_sheet("without-table", use_columns="H:I", header_row=9)

    expected_data = {
        "Column at H10": [1.0, 2.0, 3.0],
        "Column at I10": [4.0, 5.0, 6.0],
    }

    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_data))
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_data))


def test_use_column_range_with_offset_with_table() -> None:
    """The ``"D:E"`` range selects the expected columns on a sheet whose header is on row 4."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
    sheet = reader.load_sheet("with-table", use_columns="D:E", header_row=4)

    expected_data = {
        "Column at D5": [1.0, 2.0, 3.0, 4.0],
        "Column at E5": [4.0, 5.0, 6.0, 8.0],
    }

    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_data))
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_data))


def test_use_column_names_with_offset_table_by_index_and_name() -> None:
    """A selection mixing a name and an index must resolve correctly on an offset table.

    Integer selectors are treated as absolute column indices, so they can be combined with
    name-based selectors in the same ``use_columns`` list.
    """
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))

    # "Column at D5" has table index 0 and absolute index 3;
    # the integer 4 is the absolute index of column E.
    table = reader.load_table("TableAtD5", use_columns=["Column at D5", 4])  # type:ignore[arg-type]

    column_d = fastexcel.ColumnInfo(
        name="Column at D5",
        column_name_from="provided",
        index=0,
        absolute_index=3,
        dtype="float",
        dtype_from="guessed",
    )
    column_e = fastexcel.ColumnInfo(
        name="Column at E5",
        column_name_from="provided",
        index=1,
        absolute_index=4,
        dtype="float",
        dtype_from="guessed",
    )
    assert table.selected_columns == [column_d, column_e]

    expected_data = {
        "Column at D5": [1.0, 2.0, 3.0, 4.0],
        "Column at E5": [4.0, 5.0, 6.0, 8.0],
    }

    pl_assert_frame_equal(table.to_polars(), pl.DataFrame(expected_data))
    pd_assert_frame_equal(table.to_pandas(), pd.DataFrame(expected_data))


def test_use_column_range_with_offset_with_table_and_specified_dtypes() -> None:
    """Dtypes given by absolute index and by name apply to closed and open-ended ranges alike."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))

    closed = reader.load_table(
        "TableAtD5", use_columns="D:E", dtypes={3: "int", "Column at E5": "string"}
    )
    open_ended = reader.load_table(
        "TableAtD5", use_columns="D:", dtypes={3: "int", "Column at E5": "string"}
    )
    tables = (closed, open_ended)

    column_d = fastexcel.ColumnInfo(
        name="Column at D5",
        column_name_from="provided",
        index=0,
        absolute_index=3,
        dtype="int",
        dtype_from="provided_by_index",
    )
    column_e = fastexcel.ColumnInfo(
        name="Column at E5",
        column_name_from="provided",
        index=1,
        absolute_index=4,
        dtype="string",
        dtype_from="provided_by_name",
    )
    for table in tables:
        assert table.selected_columns == [column_d, column_e]

    expected_data = {
        # Dtype should be int, looked up by index
        "Column at D5": [1, 2, 3, 4],
        # Dtype should be string, looked up by name
        "Column at E5": ["4", "5", "6", "8"],
    }
    expected_pl_df = pl.DataFrame(expected_data)
    expected_pd_df = pd.DataFrame(expected_data)

    for table in tables:
        pl_assert_frame_equal(table.to_polars(), expected_pl_df)

    for table in tables:
        pd_assert_frame_equal(table.to_pandas(), expected_pd_df)


def test_use_column_range_with_offset_with_sheet_and_specified_dtypes() -> None:
    """A sheet loaded through a closed (``H:K``) or an open-ended (``H:``) column range
    must expose the same columns and data when dtypes are given by index and by name.
    """
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))

    closed_range_sheet = reader.load_sheet(
        "without-table",
        use_columns="H:K",
        header_row=9,
        dtypes={7: "int", "Column at I10": "string"},
    )
    open_ended_sheet = reader.load_sheet(
        "without-table",
        use_columns="H:",
        header_row=9,
        dtypes={7: "int", "Column at I10": "string"},
    )
    loaded_sheets = (closed_range_sheet, open_ended_sheet)

    def expected_columns(unnamed_values: Any) -> dict[str, Any]:
        # Only the representation of the empty, unnamed column differs between libraries
        return {
            # Read as int: the dtype is looked up by index
            "Column at H10": [1, 2, 3],
            # Read as string: the dtype is looked up by name
            "Column at I10": ["4", "5", "6"],
            "__UNNAMED__2": unnamed_values,
            "Column at K10": [7.0, 8.0, 9.0],
        }

    expected_data_polars = expected_columns(pl.Series([None, None, None], dtype=pl.String))

    # In pandas 3, string columns use nan instead of None for missing values
    pd_version = tuple(map(int, pd.__version__.split(".")[:2]))
    missing = None if pd_version < (3, 0) else np.nan
    expected_data_pandas = expected_columns([missing, missing, missing])

    expected_column_info = [
        fastexcel.ColumnInfo(
            name="Column at H10",
            index=0,
            absolute_index=7,
            dtype="int",
            dtype_from="provided_by_index",
            column_name_from="looked_up",
        ),
        fastexcel.ColumnInfo(
            name="Column at I10",
            index=1,
            absolute_index=8,
            dtype="string",
            dtype_from="provided_by_name",
            column_name_from="looked_up",
        ),
        fastexcel.ColumnInfo(
            name="__UNNAMED__2",
            index=2,
            absolute_index=9,
            dtype="string",
            dtype_from="guessed",
            column_name_from="generated",
        ),
        fastexcel.ColumnInfo(
            name="Column at K10",
            index=3,
            absolute_index=10,
            dtype="float",
            dtype_from="guessed",
            column_name_from="looked_up",
        ),
    ]
    for sheet in loaded_sheets:
        assert sheet.selected_columns == expected_column_info

    expected_pl_df = pl.DataFrame(expected_data_polars)
    expected_pd_df = pd.DataFrame(expected_data_pandas)

    for sheet in loaded_sheets:
        pl_assert_frame_equal(sheet.to_polars(), expected_pl_df)

    for sheet in loaded_sheets:
        pd_assert_frame_equal(sheet.to_pandas(), expected_pd_df, check_dtype=False)


================================================
FILE: python/tests/test_defined_names.py
================================================
import fastexcel
import pytest

from .utils import path_for_fixture


@pytest.mark.parametrize("path", ("sheet-with-defined-names.xlsx",))
def test_defined_names(path: str) -> None:
    """The reader lists the workbook's defined names together with their formulas."""
    formulas_by_name = (
        ("AddingValues", "SUM(sheet1!$K$5:$K$6)"),
        ("DefinedRange", "sheet1!$A$5:$D$7"),
        ("NamedConstant", "3.4"),
    )
    expected_defined_names = [
        fastexcel.DefinedName(name=name, formula=formula) for name, formula in formulas_by_name
    ]

    reader = fastexcel.read_excel(path_for_fixture(path))

    assert reader.defined_names() == expected_defined_names


================================================
FILE: python/tests/test_dtypes.py
================================================
from __future__ import annotations

import logging
from datetime import date, datetime
from typing import Any, Literal

import fastexcel
import numpy as np
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal

from .utils import get_expected_pandas_dtype, path_for_fixture


@pytest.fixture
def expected_data() -> dict[str, list[Any]]:
    """Expected content of ``fixture-multi-dtypes-columns.xlsx``, keyed by column name."""
    employee_ids = ["123456", "44333", "44333", "87878", "87878"]
    employee_ids += ["US00011", "135967", "IN86868", "IN86868"]
    employee_names = [f"Test{number}" for number in (1, 2, 2, 3, 3, 4, 5, 6, 6)]

    columns: dict[str, list[Any]] = {}
    columns["Employee ID"] = employee_ids
    columns["Employee Name"] = employee_names
    columns["Date"] = [datetime(2023, 7, 21)] * 9
    columns["Details"] = ["Healthcare"] * 7 + ["Something"] * 2
    columns["Asset ID"] = ["84444"] * 7 + ["ABC123"] * 2
    columns["Mixed dates"] = ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3
    columns["Mixed bools"] = ["true"] * 5 + ["false"] * 3 + ["other"]
    return columns


def test_sheet_with_mixed_dtypes(expected_data: dict[str, list[Any]]) -> None:
    """Loading the first sheet with default options yields the expected pandas and polars frames."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    first_sheet = reader.load_sheet(0)

    expected_pd_df = pd.DataFrame(expected_data).astype({"Date": "datetime64[ms]"})
    pd_assert_frame_equal(first_sheet.to_pandas(), expected_pd_df)

    expected_pl_df = pl.DataFrame(
        expected_data, schema_overrides={"Date": pl.Datetime(time_unit="ms")}
    )
    pl_assert_frame_equal(first_sheet.to_polars(), expected_pl_df)


def test_sheet_with_mixed_dtypes_and_sample_rows(expected_data: dict[str, list[Any]]) -> None:
    """Check how ``schema_sample_rows`` drives dtype guessing, with and without ``skip_rows``."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))

    # Five rows are skipped, so sampling only five rows should still guess the right dtypes
    skipped_sheet = reader.load_sheet(0, schema_sample_rows=5, skip_rows=5)
    remaining_rows = {name: column[5:] for name, column in expected_data.items()}

    expected_pd_subset = pd.DataFrame(remaining_rows).astype({"Date": "datetime64[ms]"})
    pd_assert_frame_equal(skipped_sheet.to_pandas(), expected_pd_subset)

    expected_pl_subset = pl.DataFrame(
        remaining_rows, schema_overrides={"Date": pl.Datetime(time_unit="ms")}
    )
    pl_assert_frame_equal(skipped_sheet.to_polars(), expected_pl_subset)

    # Without skipping, the dtypes are guessed on the first five rows only
    sampled_sheet = reader.load_sheet(0, schema_sample_rows=5)
    # String fields should not have been loaded
    expected_data.update(
        {
            "Employee ID": [123456.0, 44333.0, 44333.0, 87878.0, 87878.0, None, 135967.0]
            + [None] * 2,
            "Asset ID": [84444.0] * 7 + [None] * 2,
            "Mixed dates": [datetime(2023, 7, 21)] * 6 + [None] * 3,
            "Mixed bools": [True] * 5 + [False] * 3 + [None],
        }
    )
    datetime_columns = ("Date", "Mixed dates")

    expected_pd_full = pd.DataFrame(expected_data).astype(
        {column: "datetime64[ms]" for column in datetime_columns}
    )
    pd_assert_frame_equal(sampled_sheet.to_pandas(), expected_pd_full)

    expected_pl_full = pl.DataFrame(
        expected_data,
        schema_overrides={column: pl.Datetime(time_unit="ms") for column in datetime_columns},
    )
    pl_assert_frame_equal(sampled_sheet.to_polars(), expected_pl_full)


@pytest.mark.parametrize("dtype_by_index", (True, False))
@pytest.mark.parametrize(
    "dtype,expected_data,expected_pl_dtype",
    [
        ("int", [123456, 44333, 44333, 87878, 87878], pl.Int64),
        ("float", [123456.0, 44333.0, 44333.0, 87878.0, 87878.0], pl.Float64),
        ("string", ["123456", "44333", "44333", "87878", "87878"], pl.Utf8),
        ("boolean", [True] * 5, pl.Boolean),
        (
            "datetime",
            [datetime(2238, 1, 3)] + [datetime(2021, 5, 17)] * 2 + [datetime(2140, 8, 6)] * 2,
            pl.Datetime,
        ),
        (
            "date",
            [date(2238, 1, 3)] + [date(2021, 5, 17)] * 2 + [date(2140, 8, 6)] * 2,
            pl.Date,
        ),
        #  conversion to duration not supported yet
        ("duration", [pd.NaT] * 5, pl.Duration),
    ],
)
def test_sheet_with_mixed_dtypes_specify_dtypes(
    dtype_by_index: bool,
    dtype: fastexcel.DType,
    expected_data: list[Any],
    expected_pl_dtype: pl.DataType,
) -> None:
    """Force the dtype of "Employee ID", by index or by name, and check the first five rows."""
    column = "Employee ID"
    column_key: int | str = 0 if dtype_by_index else column
    dtypes: fastexcel.DTypeMap = {column_key: dtype}

    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    sheet = reader.load_sheet(0, dtypes=dtypes, n_rows=5)
    assert sheet.specified_dtypes == dtypes

    pd_column = sheet.to_pandas()[column]
    assert pd_column.dtype == get_expected_pandas_dtype(dtype)
    assert pd_column.to_list() == expected_data

    # The duration expectations hold NaT for pandas; polars reports nulls instead
    expected_pl_values = [None] * 5 if dtype == "duration" else expected_data
    pl_column = sheet.to_polars()[column]
    assert pl_column.dtype == expected_pl_dtype
    assert pl_column.to_list() == expected_pl_values


@pytest.mark.parametrize(
    "dtypes,expected,fastexcel_dtype,expected_pl_dtype",
    [
        (None, datetime(2023, 7, 21), "datetime", pl.Datetime),
        ({"Date": "datetime"}, datetime(2023, 7, 21), "datetime", pl.Datetime),
        ({"Date": "date"}, date(2023, 7, 21), "date", pl.Date),
        ({"Date": "string"}, "2023-07-21 00:00:00", "string", pl.Utf8),
        ({2: "datetime"}, datetime(2023, 7, 21), "datetime", pl.Datetime),
        ({2: "date"}, date(2023, 7, 21), "date", pl.Date),
        ({2: "string"}, "2023-07-21 00:00:00", "string", pl.Utf8),
    ],
)
def test_sheet_datetime_conversion(
    dtypes: fastexcel.DTypeMap | None,
    expected: Any,
    fastexcel_dtype: str,
    expected_pl_dtype: pl.DataType,
) -> None:
    """The "Date" column can be read as datetime, date or string, selected by name or by index."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    sheet = reader.load_sheet(0, dtypes=dtypes)
    assert sheet.specified_dtypes == dtypes

    # All nine rows of the column hold the same value
    expected_values = [expected] * 9

    pd_dates = sheet.to_pandas()["Date"]
    assert pd_dates.dtype == get_expected_pandas_dtype(fastexcel_dtype)
    assert pd_dates.to_list() == expected_values

    pl_dates = sheet.to_polars()["Date"]
    assert pl_dates.dtype == expected_pl_dtype
    assert pl_dates.to_list() == expected_values


@pytest.mark.parametrize("eager", [True, False])
@pytest.mark.parametrize("dtype_coercion", ["coerce", None])
def test_dtype_coercion_behavior__coerce(
    dtype_coercion: Literal["coerce"] | None, eager: bool
) -> None:
    """With coercion, whether explicit or left to the default, "Mixed dates" is read as strings."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))

    # Only forward the option when it is set, so that the default is exercised too
    if dtype_coercion:
        load_options = {"dtype_coercion": dtype_coercion}
    else:
        load_options = {}
    loaded = reader.load_sheet(0, eager=eager, **load_options)  # type:ignore[call-overload]
    record_batch = loaded if eager else loaded.to_arrow()

    expected_values = ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3

    pd_mixed_dates = record_batch.to_pandas()["Mixed dates"]
    assert pd_mixed_dates.dtype == get_expected_pandas_dtype("string")
    assert pd_mixed_dates.to_list() == expected_values

    pl_df = pl.from_arrow(data=record_batch)
    assert isinstance(pl_df, pl.DataFrame)
    pl_mixed_dates = pl_df["Mixed dates"]
    assert pl_mixed_dates.dtype == pl.Utf8
    assert pl_mixed_dates.to_list() == expected_values


@pytest.mark.parametrize("eager", [True, False])
def test_dtype_coercion_behavior__strict_sampling_eveything(eager: bool) -> None:
    """Strict coercion must raise when the whole sheet is sampled."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    error_class = fastexcel.UnsupportedColumnTypeCombinationError

    with pytest.raises(error_class, match="type coercion is strict"):
        if not eager:
            reader.load_sheet(0, dtype_coercion="strict").to_arrow()
        else:
            reader.load_sheet_eager(0, dtype_coercion="strict")


@pytest.mark.parametrize("eager", [True, False])
def test_dtype_coercion_behavior__strict_sampling_limit(eager: bool) -> None:
    """Strict coercion with only five sampled rows loads the sheet, nulling the other values."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))

    if eager:
        record_batch = reader.load_sheet_eager(0, dtype_coercion="strict", schema_sample_rows=5)
    else:
        lazy_sheet = reader.load_sheet(0, dtype_coercion="strict", schema_sample_rows=5)
        record_batch = lazy_sheet.to_arrow()

    expected_asset_ids = [84444.0] * 7 + [None] * 2

    pd_df = record_batch.to_pandas()
    pd_mixed_dates = pd_df["Mixed dates"]
    assert pd_mixed_dates.dtype == "datetime64[ms]"
    expected_pd_dates = [pd.Timestamp("2023-07-21 00:00:00")] * 6 + [pd.NaT] * 3
    assert pd_mixed_dates.to_list() == expected_pd_dates
    pd_asset_ids = pd_df["Asset ID"]
    assert pd_asset_ids.dtype == "float64"
    assert pd_asset_ids.replace(np.nan, None).to_list() == expected_asset_ids

    pl_df = pl.from_arrow(data=record_batch)
    assert isinstance(pl_df, pl.DataFrame)
    pl_mixed_dates = pl_df["Mixed dates"]
    assert pl_mixed_dates.dtype == pl.Datetime
    assert pl_mixed_dates.to_list() == [datetime(2023, 7, 21)] * 6 + [None] * 3
    pl_asset_ids = pl_df["Asset ID"]
    assert pl_asset_ids.dtype == pl.Float64
    assert pl_asset_ids.to_list() == expected_asset_ids


def test_one_dtype_for_all() -> None:
    """Passing a single dtype applies it to every column of the sheet."""
    column_names = (
        "Employee ID",
        "Employee Name",
        "Date",
        "Details",
        "Asset ID",
        "Mixed dates",
        "Mixed bools",
    )
    # Every column is selected, so relative and absolute indices are the same
    expected_columns = [
        fastexcel.ColumnInfo(
            name=column_name,
            index=position,
            absolute_index=position,
            dtype="string",
            dtype_from="provided_for_all",
            column_name_from="looked_up",
        )
        for position, column_name in enumerate(column_names)
    ]

    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    sheet = reader.load_sheet(0, dtypes="string")

    assert sheet.available_columns() == expected_columns
    assert sheet.to_polars().dtypes == [pl.String] * 7


def test_fallback_infer_dtypes(caplog: pytest.LogCaptureFixture) -> None:
    """it should fallback to string if it can't infer the dtype"""
    reader = fastexcel.read_excel(path_for_fixture("infer-dtypes-fallback.xlsx"))
    sheet = reader.load_sheet(0)

    # Exactly one warning must have been logged to explain the fallback to string
    fallback_warning = (
        "fastexcel.types.dtype",
        logging.WARNING,
        "Could not determine dtype for column 1, falling back to string",
    )
    assert caplog.record_tuples == [fallback_warning]

    id_column = fastexcel.ColumnInfo(
        name="id",
        index=0,
        absolute_index=0,
        dtype="float",
        dtype_from="guessed",
        column_name_from="looked_up",
    )
    label_column = fastexcel.ColumnInfo(
        name="label",
        index=1,
        absolute_index=1,
        dtype="string",
        dtype_from="guessed",
        column_name_from="looked_up",
    )
    assert sheet.available_columns() == [id_column, label_column]
    assert sheet.to_polars().dtypes == [pl.Float64, pl.String]


# Each case forces a dtype on the "Column" column and lists, row by row, the values expected
# in the resulting record batch. Rows that cannot be represented in the dtype are expected to
# come back as None (or NaT for datetime/duration).
@pytest.mark.parametrize(
    ("dtype", "expected_data"),
    [
        (
            "int",
            [None] * 2
            + [-1.0, 0.0, 1.0, 0.0, 1.0, 1.0, -1.0, 0.0, 1.0, None, 1.0, 0.0]
            + [None] * 7
            + [0.0],
        ),
        (
            "float",
            [None] * 2
            + [-1.0, 0.0, 1.0, 0.0, 1.0, 1.1, -1.0, 0.0, 1.0, 1.1, 1.0, 0.0]
            + [None] * 7
            + [0.1],
        ),
        (
            "string",
            [
                None,
                "foo",
                "-1",
                "0",
                "1",
                "0",
                "1",
                "1.1",
                "-1",
                "0",
                "1",
                "1.1",
                "true",
                "false",
                "2023-07-21 00:00:00",
                "2023-07-21 12:20:00",
                # calamine reads a time as datetimes here, which seems wrong
                "1899-12-31 12:20:00",
                "07/21/2023",
                "7/21/2023  12:20:00 PM",
                "July 23rd",
                "12:20:00",
                "0.1",
            ],
        ),
        (
            "boolean",
            [None] * 2
            + [True, False, True, False, True, True]
            + [None] * 4
            + [True, False]
            + [None] * 7
            + [True],
        ),
        (
            "datetime",
            [pd.NaT] * 2
            + [
                pd.Timestamp("1899-12-30 00:00:00"),
                pd.Timestamp("1899-12-31 00:00:00"),
                pd.Timestamp("1900-01-01 00:00:00"),
                pd.Timestamp("1899-12-31 00:00:00"),
                pd.Timestamp("1900-01-01 00:00:00"),
                pd.Timestamp("1900-01-01 02:24:00"),
            ]
            + [pd.NaT] * 6
            + [
                pd.Timestamp("2023-7-21 00:00:00"),
                pd.Timestamp("2023-7-21 12:20:00"),
                # calamine currently adds a date to a time, which is
                # questionable
                pd.Timestamp("1899-12-31 12:20:00"),
            ]
            + [pd.NaT] * 4
            + [
                # calamine converts percentages to datetimes (since it does not
                # distinguish from floats), which seems questionable
                pd.Timestamp("1899-12-31 02:24:00")
            ],
        ),
        (
            "date",
            [None] * 2
            + [
                pd.Timestamp("1899-12-30").date(),
                pd.Timestamp("1899-12-31").date(),
                pd.Timestamp("1900-01-01").date(),
                pd.Timestamp("1899-12-31").date(),
                pd.Timestamp("1900-01-01").date(),
                pd.Timestamp("1900-01-01").date(),
            ]
            + [None] * 6
            + [
                pd.Timestamp("2023-7-21").date(),
                pd.Timestamp("2023-7-21").date(),
                # calamine converts any time to 1899-12-31, which is
                # questionable
                pd.Timestamp("1899-12-31").date(),
            ]
            + [None] * 4
            + [
                # calamine converts percentages to dates (since it does not
                # distinguish from floats), which seems questionable
                pd.Timestamp("1899-12-31").date()
            ],
        ),
        (
            "duration",
            [pd.NaT] * 14
            + [
                # dates/datetimes are converted to durations, which seems
                # questionable
                pd.Timedelta(datetime(2023, 7, 21 + 1) - datetime(1899, 12, 31)),
                pd.Timedelta(datetime(2023, 7, 21 + 1, 12, 20, 0) - datetime(1899, 12, 31)),
                pd.Timedelta(hours=12, minutes=20),
            ]
            + [pd.NaT] * 5,
        ),
    ],
)
def test_to_arrow_with_errors(
    dtype: fastexcel.DType,
    expected_data: list[Any],
) -> None:
    """Check the values and the cell errors returned by ``to_arrow_with_errors``.

    The first sheet of ``fixture-type-errors.xlsx`` is loaded with ``dtype`` forced on the
    "Column" column. The resulting record batch is compared with ``expected_data`` through both
    pandas and polars, and the reported error positions are compared with the positions of the
    missing values in ``expected_data``.
    """
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-type-errors.xlsx"))
    rb, cell_errors = excel_reader.load_sheet(0, dtypes={"Column": dtype}).to_arrow_with_errors()

    pd_df = rb.to_pandas()
    # For string columns in pandas 3, replace pd.NA with None for comparison
    if dtype == "string":
        column_values = pd_df["Column"].replace([np.nan, pd.NA], None).to_list()
    else:
        column_values = pd_df["Column"].replace(np.nan, None).to_list()
    assert column_values == expected_data

    def item_to_polars(item: Any) -> Any:
        # Translate a pandas-flavoured expected value into what polars' to_list() is compared
        # against: timestamps become plain datetimes and missing values become None
        if isinstance(item, pd.Timestamp):
            return item.to_pydatetime()
        if pd.isna(item):
            return None
        return item

    pl_df = pl.from_arrow(rb)
    assert isinstance(pl_df, pl.DataFrame)
    pl_expected_data = list(map(item_to_polars, expected_data))
    assert pl_df["Column"].to_list() == pl_expected_data

    # the only empty cell is (0, 0), so all other cells that were read as None
    # should be errors
    expected_error_positions = [
        (i, 0) for i in range(1, len(expected_data)) if expected_data[i] in {None, pd.NaT}
    ]
    # NOTE(review): nothing is asserted about cell_errors when no error is expected
    if expected_error_positions:
        assert cell_errors is not None
        error_positions = [err.offset_position for err in cell_errors.errors]
        assert error_positions == expected_error_positions


def test_guess_dtypes_with_div0_error() -> None:
    """A column containing a division-by-zero error is still guessed as float, with a null value."""
    reader = fastexcel.read_excel(path_for_fixture("div0.xlsx"))
    sheet = reader.load_sheet(0)

    expected_columns = [
        fastexcel.ColumnInfo(
            name=column_name,
            index=position,
            absolute_index=position,
            dtype="float",
            dtype_from="guessed",
            column_name_from="looked_up",
        )
        for position, column_name in enumerate(("dividend", "divisor", "quotient"))
    ]
    assert sheet.available_columns() == expected_columns

    expected_values = {
        "dividend": [42.0, 43.0, 44.0, 45.0],
        "divisor": [0.0, 1.0, 2.0, 3.0],
        # The first quotient is the division by zero
        "quotient": [None, 43.0, 22.0, 15.0],
    }

    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_values))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_values))


================================================
FILE: python/tests/test_durations.py
================================================
from __future__ import annotations

from datetime import date, datetime, timedelta

import fastexcel
import numpy as np
import pandas as pd
import polars as pl
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.datatypes import DataType as PolarsDataType
from polars.datatypes import Date as PlDate
from polars.datatypes import Datetime as PlDateTime
from polars.datatypes import Duration as PlDuration
from polars.datatypes import Utf8 as PlUtf8
from polars.testing import assert_frame_equal as pl_assert_frame_equal

from .utils import get_expected_pandas_dtype, path_for_fixture


def test_sheet_with_different_time_types() -> None:
    """Dates, date strings, times and datetimes of an ODS sheet map to the expected dtypes."""
    reader = fastexcel.read_excel(path_for_fixture("dates.ods"))
    sheet = reader.load_sheet_by_idx(0)

    pd_df = sheet.to_pandas()
    pl_df = sheet.to_polars()

    ## dtypes
    expected_pd_dtypes = {
        "date": np.dtype("object"),
        "datestr": get_expected_pandas_dtype("string"),
        "time": np.dtype("timedelta64[ms]"),
        "datetime": np.dtype("datetime64[ms]"),
    }
    for column_name, pd_dtype in expected_pd_dtypes.items():
        assert pd_df[column_name].dtype == pd_dtype

    expected_pl_dtypes: dict[str, PolarsDataType] = {
        "date": PlDate(),
        "datestr": PlUtf8(),
        "time": PlDuration(time_unit="ms"),
        "datetime": PlDateTime(time_unit="ms", time_zone=None),
    }
    actual_pl_dtypes = {name: dtype for name, dtype in zip(pl_df.columns, pl_df.dtypes)}
    assert actual_pl_dtypes == expected_pl_dtypes

    ## Contents
    time_as_pd = pd.Series([pd.to_timedelta("01:02:03")]).astype("timedelta64[ms]")
    datetime_as_pd = pd.Series([pd.to_datetime("2023-06-01 02:03:04")]).astype("datetime64[ms]")
    expected_pd = pd.DataFrame(
        {
            "date": [date(2023, 6, 1)],
            "datestr": ["2023-06-01T02:03:04+02:00"],
            "time": time_as_pd,
            "datetime": datetime_as_pd,
        }
    )
    expected_pl = pl.DataFrame(
        {
            "date": [date(2023, 6, 1)],
            "datestr": ["2023-06-01T02:03:04+02:00"],
            "time": [timedelta(hours=1, minutes=2, seconds=3)],
            "datetime": [datetime(2023, 6, 1, 2, 3, 4)],
        },
        schema=expected_pl_dtypes,
    )
    pd_assert_frame_equal(pd_df, expected_pd)
    pl_assert_frame_equal(pl_df, expected_pl)


def test_sheet_with_offset_header_row_and_durations() -> None:
    """Durations are read correctly when the header is located on row 10."""
    column = "Tot. Time Away From System"
    reader = fastexcel.read_excel(path_for_fixture("single-sheet-skip-rows-durations.xlsx"))
    sheet = reader.load_sheet(0, header_row=10)

    pd_df = sheet.to_pandas()
    pl_df = sheet.to_polars()

    pd_durations = pd_df[column]
    assert pd_durations.dtype == np.dtype("timedelta64[ms]")
    assert pd_durations.tolist() == [pd.Timedelta("01:18:43"), pd.Timedelta("07:16:51")]

    pl_durations = pl_df[column]
    assert pl_durations.dtype == pl.Duration(time_unit="ms")
    assert pl_durations.to_list() == [
        timedelta(hours=1, minutes=18, seconds=43),
        timedelta(hours=7, minutes=16, seconds=51),
    ]


================================================
FILE: python/tests/test_eagerness.py
================================================
from datetime import date, datetime, timedelta

import fastexcel
import polars as pl
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from pyarrow import RecordBatch

from .utils import path_for_fixture


def test_load_sheet_eager_single_sheet() -> None:
    """Eager and lazy loading of a single-sheet workbook must give identical frames."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))

    pandas_from_eager = reader.load_sheet_eager(0).to_pandas()
    pandas_from_lazy = reader.load_sheet(0).to_pandas()
    pd_assert_frame_equal(pandas_from_eager, pandas_from_lazy)

    eager_record_batch = reader.load_sheet_eager(0)
    polars_from_eager = pl.from_arrow(data=eager_record_batch)
    assert isinstance(polars_from_eager, pl.DataFrame)
    polars_from_lazy = reader.load_sheet(0).to_polars()
    pl_assert_frame_equal(polars_from_eager, polars_from_lazy)


def test_multiple_sheets_with_unnamed_columns():
    """Eager and lazy loading must agree on a sheet that contains unnamed columns."""
    sheet_name = "With unnamed columns"
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

    pandas_from_eager = reader.load_sheet_eager(sheet_name).to_pandas()
    pandas_from_lazy = reader.load_sheet(sheet_name).to_pandas()
    pd_assert_frame_equal(pandas_from_eager, pandas_from_lazy)

    eager_record_batch = reader.load_sheet_eager(sheet_name)
    polars_from_eager = pl.from_arrow(data=eager_record_batch)
    assert isinstance(polars_from_eager, pl.DataFrame)
    polars_from_lazy = reader.load_sheet(sheet_name).to_polars()
    pl_assert_frame_equal(polars_from_eager, polars_from_lazy)


def test_eager_with_an_ods_file_should_return_a_recordbatch() -> None:
    """Eagerly loading a sheet from an ODS file returns a pyarrow ``RecordBatch``."""
    reader = fastexcel.read_excel(path_for_fixture("dates.ods"))

    loaded = reader.load_sheet_eager(0)
    assert isinstance(loaded, RecordBatch)
    actual_df = pl.from_arrow(loaded)
    assert isinstance(actual_df, pl.DataFrame)

    expected_values = {
        "date": [date(2023, 6, 1)],
        "datestr": ["2023-06-01T02:03:04+02:00"],
        "time": [timedelta(hours=1, minutes=2, seconds=3)],
        "datetime": [datetime(2023, 6, 1, 2, 3, 4)],
    }
    # The temporal columns are compared with a millisecond time unit
    millisecond_casts = [
        pl.col(column_name).dt.cast_time_unit("ms") for column_name in ("datetime", "time")
    ]
    expected_df = pl.DataFrame(expected_values).with_columns(*millisecond_casts)

    pl_assert_frame_equal(actual_df, expected_df)


================================================
FILE: python/tests/test_empty.py
================================================
import fastexcel
import pytest

from .utils import path_for_fixture


@pytest.mark.parametrize("path", ("empty.ods", "empty.xlsx"))
def test_empty(path: str) -> None:
    """An empty workbook loads as an empty frame with both pandas and polars."""
    reader = fastexcel.read_excel(path_for_fixture(path))
    first_sheet = reader.load_sheet_by_idx(0)

    pd_df = first_sheet.to_pandas()
    assert pd_df.empty

    pl_df = first_sheet.to_polars()
    assert pl_df.is_empty()


================================================
FILE: python/tests/test_errors.py
================================================
from __future__ import annotations

import fastexcel
import pytest

from .utils import path_for_fixture


def test_cell_error_repr() -> None:
    """The repr of a cell error shows its positions, its row offset and the failure detail."""
    expected_repr = """CellError(position=(2, 0), offset_position=(1, 0), row_offset=1, detail="Expected int but got 'String(\\"foo\\")'")"""  # noqa: E501

    reader = fastexcel.read_excel(path_for_fixture("fixture-type-errors.xlsx"))
    sheet = reader.load_sheet(0, dtypes={"Column": "int"})
    _, cell_errors = sheet.to_arrow_with_errors()

    assert cell_errors is not None
    first_error = cell_errors.errors[0]
    assert repr(first_error) == expected_repr


def test_read_excel_bad_type() -> None:
    """``read_excel`` rejects a source that is neither a string nor bytes."""
    with pytest.raises(
        fastexcel.InvalidParametersError, match="source must be a string or bytes"
    ):
        fastexcel.read_excel(42)  # type: ignore[arg-type]


def test_does_not_exist() -> None:
    """Opening a path that does not exist raises ``CalamineError`` with its context lines."""
    import re

    expected_message = """calamine error: Cannot detect file format
Context:
    0: Could not open workbook at path_does_not_exist.nope
    1: could not load excel file at path_does_not_exist.nope"""
    # ``match`` is a regular expression: escape the message so that "." only matches a dot
    expected_pattern = re.escape(expected_message)

    with pytest.raises(fastexcel.CalamineError, match=expected_pattern) as exc_info:
        fastexcel.read_excel("path_does_not_exist.nope")

    assert exc_info.value.__doc__ == "Generic calamine error"

    # Should also work with the base error type
    with pytest.raises(fastexcel.FastExcelError, match=expected_pattern):
        fastexcel.read_excel("path_does_not_exist.nope")


def test_sheet_idx_not_found_error() -> None:
    """Loading a sheet by an out-of-range index raises ``SheetNotFoundError``."""
    import re

    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    expected_message = """sheet at index 42 not found
Context:
    0: Sheet index 42 is out of range. File has 1 sheets."""
    # ``match`` is a regular expression: escape the message so that "." only matches a dot
    expected_pattern = re.escape(expected_message)

    with pytest.raises(fastexcel.SheetNotFoundError, match=expected_pattern) as exc_info:
        excel_reader.load_sheet(42)

    assert exc_info.value.__doc__ == "Sheet was not found"

    # Should also work with the base error type
    with pytest.raises(fastexcel.FastExcelError, match=expected_pattern):
        excel_reader.load_sheet(42)


def test_sheet_name_not_found_error() -> None:
    """Loading a sheet by an unknown name raises ``SheetNotFoundError`` listing the sheets."""
    import re

    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    expected_message = """sheet with name "idontexist" not found
Context:
    0: Sheet "idontexist" not found in file. Available sheets: "January"."""
    # ``match`` is a regular expression: escape the message so that "." only matches a dot
    expected_pattern = re.escape(expected_message)

    with pytest.raises(fastexcel.SheetNotFoundError, match=expected_pattern) as exc_info:
        excel_reader.load_sheet("idontexist")

    assert exc_info.value.__doc__ == "Sheet was not found"


@pytest.mark.parametrize(
    "exc_class, expected_docstring",
    [
        (fastexcel.FastExcelError, "The base class for all fastexcel errors"),
        (
            fastexcel.UnsupportedColumnTypeCombinationError,
            "Column contains an unsupported type combination",
        ),
        (fastexcel.CannotRetrieveCellDataError, "Data for a given cell cannot be retrieved"),
        (
            fastexcel.CalamineCellError,
            "calamine returned an error regarding the content of the cell",
        ),
        (fastexcel.CalamineError, "Generic calamine error"),
        (fastexcel.ColumnNotFoundError, "Column was not found"),
        (fastexcel.SheetNotFoundError, "Sheet was not found"),
        (fastexcel.ArrowError, "Generic arrow error"),
        (fastexcel.InvalidParametersError, "Provided parameters are invalid"),
    ],
)
def test_docstrings(exc_class: type[Exception], expected_docstring: str) -> None:
    """Every exported exception class carries its documented docstring."""
    actual_docstring = exc_class.__doc__
    assert actual_docstring == expected_docstring


def test_schema_sample_rows_must_be_nonzero() -> None:
    """``schema_sample_rows=0`` is rejected by both ``load_sheet`` and ``load_table``."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    error_pattern = "schema_sample_rows cannot be 0, as it would prevent dtype inferring"

    # Sheet loading
    with pytest.raises(fastexcel.InvalidParametersError, match=error_pattern):
        reader.load_sheet(0, schema_sample_rows=0)

    # Table loading
    with pytest.raises(fastexcel.InvalidParametersError, match=error_pattern):
        reader.load_table("my-table", schema_sample_rows=0)


================================================
FILE: python/tests/test_fastexcel.py
================================================
from __future__ import annotations

from collections.abc import Callable
from datetime import datetime
from typing import Any

import fastexcel
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal

from .utils import path_for_fixture


def test_single_sheet():
    """A single-sheet workbook yields the same data when loaded by name or by index."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    assert reader.sheet_names == ["January"]
    by_name = reader.load_sheet("January")
    by_idx = reader.load_sheet(0)

    # Metadata
    assert by_name.name == by_idx.name == "January"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 2

    data = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    # pandas conversion, for both ways of loading the sheet
    pd_expected = pd.DataFrame(data)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pd_expected)

    # polars conversion, for both ways of loading the sheet
    pl_expected = pl.DataFrame(data)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), pl_expected)


def test_single_sheet_bytes():
    """A workbook passed as raw bytes loads the same data by name and by index."""
    with open(path_for_fixture("fixture-single-sheet.xlsx"), "rb") as fixture_file:
        workbook_bytes = fixture_file.read()
    reader = fastexcel.read_excel(workbook_bytes)
    assert reader.sheet_names == ["January"]
    by_name = reader.load_sheet("January")
    by_idx = reader.load_sheet(0)

    # Metadata
    assert by_name.name == by_idx.name == "January"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 2

    data = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    # pandas conversion, for both ways of loading the sheet
    pd_expected = pd.DataFrame(data)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pd_expected)

    # polars conversion, for both ways of loading the sheet
    pl_expected = pl.DataFrame(data)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), pl_expected)


def test_single_sheet_with_types():
    """A sheet with float, bool and datetime columns converts to pandas and polars."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    assert reader.sheet_names == ["Sheet1"]

    sheet = reader.load_sheet(0)
    assert sheet.name == "Sheet1"
    assert sheet.height == sheet.total_height == 3
    assert sheet.width == 4

    timestamp = "2022-03-02 05:43:04"

    # pandas: the datetime column is expected with millisecond resolution
    pd_dates = pd.Series([pd.Timestamp(timestamp)] * 3).astype("datetime64[ms]")
    pd_expected = pd.DataFrame(
        {
            "__UNNAMED__0": [0.0, 1.0, 2.0],
            "bools": [True, False, True],
            "dates": pd_dates,
            "floats": [12.35, 42.69, 1234567],
        }
    )
    pd_assert_frame_equal(sheet.to_pandas(), pd_expected)

    # polars: parse the textual timestamps, then cast them to millisecond resolution
    parse_dates = pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    pl_expected = pl.DataFrame(
        {
            "__UNNAMED__0": [0.0, 1.0, 2.0],
            "bools": [True, False, True],
            "dates": [timestamp] * 3,
            "floats": [12.35, 42.69, 1234567],
        }
    ).with_columns(parse_dates)
    pl_assert_frame_equal(sheet.to_polars(), pl_expected)


def test_multiple_sheets():
    """Each sheet of a multi-sheet workbook can be loaded by index or by name."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))
    assert reader.sheet_names == ["January", "February", "With unnamed columns"]

    # Expected content of each sheet, shared by the pandas and polars checks
    january = {"Month": [1.0], "Year": [2019.0]}
    february = {"Month": [2.0, 3.0, 4.0], "Year": [2019.0, 2021.0, 2022.0]}
    with_unnamed = {
        "col1": [2.0, 3.0],
        "__UNNAMED__1": [1.5, 2.5],
        "col3": ["hello", "world"],
        "__UNNAMED__3": [-5.0, -6.0],
        "col5": ["a", "b"],
    }

    # pandas
    pd_assert_frame_equal(reader.load_sheet_by_idx(0).to_pandas(), pd.DataFrame(january))
    pd_assert_frame_equal(reader.load_sheet_by_idx(1).to_pandas(), pd.DataFrame(february))
    pd_assert_frame_equal(
        reader.load_sheet_by_name("With unnamed columns").to_pandas(),
        pd.DataFrame(with_unnamed),
    )

    # polars
    pl_assert_frame_equal(reader.load_sheet_by_idx(0).to_polars(), pl.DataFrame(january))
    pl_assert_frame_equal(reader.load_sheet_by_idx(1).to_polars(), pl.DataFrame(february))
    pl_assert_frame_equal(
        reader.load_sheet_by_name("With unnamed columns").to_polars(),
        pl.DataFrame(with_unnamed),
    )


def test_sheets_with_header_line_diff_from_zero():
    """With ``header_row=1``, column names are taken from the second row of the sheet."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
    assert reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]
    by_name = reader.load_sheet("Sheet1", header_row=1)
    by_idx = reader.load_sheet(0, header_row=1)

    # Metadata
    assert by_name.name == by_idx.name == "Sheet1"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 2

    data = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    pd_expected = pd.DataFrame(data)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pd_expected)

    pl_expected = pl.DataFrame(data)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), pl_expected)


def test_sheets_with_no_header():
    """With ``header_row=None``, every column gets a generated ``__UNNAMED__<n>`` name."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
    assert reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]
    by_name = reader.load_sheet("Sheet2", header_row=None)
    by_idx = reader.load_sheet(1, header_row=None)

    # Metadata
    assert by_name.name == by_idx.name == "Sheet2"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 3

    data = {
        "__UNNAMED__0": [1.0, 2.0],
        "__UNNAMED__1": [3.0, 4.0],
        "__UNNAMED__2": [5.0, 6.0],
    }

    pd_expected = pd.DataFrame(data)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pd_expected)

    pl_expected = pl.DataFrame(data)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), pl_expected)


def test_sheets_with_empty_rows_before_header():
    """``Sheet3`` loads with default options into the expected Month/Year frame."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
    assert reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]
    by_name = reader.load_sheet("Sheet3")
    by_idx = reader.load_sheet(2)

    # Metadata
    assert by_name.name == by_idx.name == "Sheet3"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 2

    data = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    pd_expected = pd.DataFrame(data)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pd_expected)

    pl_expected = pl.DataFrame(data)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), pl_expected)


def test_sheets_with_custom_headers():
    """Provided ``column_names`` are used as column names when ``header_row`` is ``None``."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
    assert reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]
    custom_names = ["foo", "bar", "baz"]
    by_name = reader.load_sheet("Sheet2", header_row=None, column_names=custom_names)
    by_idx = reader.load_sheet(1, header_row=None, column_names=custom_names)

    # Metadata
    assert by_name.name == by_idx.name == "Sheet2"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 3

    data = {"foo": [1.0, 2.0], "bar": [3.0, 4.0], "baz": [5.0, 6.0]}

    pd_expected = pd.DataFrame(data)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pd_expected)

    pl_expected = pl.DataFrame(data)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), pl_expected)


def test_sheets_with_skipping_headers():
    """With fewer ``column_names`` than columns, the remaining columns get generated names."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
    assert reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]
    partial_names = ["Bugs"]
    by_name = reader.load_sheet("Sheet2", header_row=None, column_names=partial_names)
    by_idx = reader.load_sheet(1, header_row=None, column_names=partial_names)

    # Metadata
    assert by_name.name == by_idx.name == "Sheet2"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 3

    data = {
        "Bugs": [1.0, 2.0],
        "__UNNAMED__1": [3.0, 4.0],
        "__UNNAMED__2": [5.0, 6.0],
    }

    pd_expected = pd.DataFrame(data)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pd_expected)

    pl_expected = pl.DataFrame(data)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), pl_expected)


def test_sheet_with_pagination():
    """``skip_rows=1, n_rows=1`` returns only the second data row of the sheet."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    assert reader.sheet_names == ["Sheet1"]

    sheet = reader.load_sheet(0, skip_rows=1, n_rows=1)
    assert sheet.name == "Sheet1"
    # Only one row is loaded, while the sheet still reports its three rows in total
    assert sheet.height == 1
    assert sheet.total_height == 3
    assert sheet.width == 4

    timestamp = "2022-03-02 05:43:04"

    pd_dates = pd.Series([pd.Timestamp(timestamp)]).astype("datetime64[ms]")
    pd_expected = pd.DataFrame(
        {
            "__UNNAMED__0": [1.0],
            "bools": [False],
            "dates": pd_dates,
            "floats": [42.69],
        }
    )
    pd_assert_frame_equal(sheet.to_pandas(), pd_expected)

    parse_dates = pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    pl_expected = pl.DataFrame(
        {
            "__UNNAMED__0": [1.0],
            "bools": [False],
            "dates": [timestamp],
            "floats": [42.69],
        }
    ).with_columns(parse_dates)
    pl_assert_frame_equal(sheet.to_polars(), pl_expected)


def test_sheet_with_skip_rows():
    """``skip_rows=1`` drops the first data row and keeps the remaining two."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    assert reader.sheet_names == ["Sheet1"]

    sheet = reader.load_sheet(0, skip_rows=1)
    assert sheet.name == "Sheet1"
    assert sheet.height == 2
    assert sheet.width == 4

    timestamp = "2022-03-02 05:43:04"

    pd_dates = pd.Series([pd.Timestamp(timestamp)] * 2).astype("datetime64[ms]")
    pd_expected = pd.DataFrame(
        {
            "__UNNAMED__0": [1.0, 2.0],
            "bools": [False, True],
            "dates": pd_dates,
            "floats": [42.69, 1234567],
        }
    )
    pd_assert_frame_equal(sheet.to_pandas(), pd_expected)

    parse_dates = pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    pl_expected = pl.DataFrame(
        {
            "__UNNAMED__0": [1.0, 2.0],
            "bools": [False, True],
            "dates": [timestamp] * 2,
            "floats": [42.69, 1234567],
        }
    ).with_columns(parse_dates)
    pl_assert_frame_equal(sheet.to_polars(), pl_expected)


def test_sheet_with_n_rows():
    """``n_rows=1`` limits the loaded data to the first data row."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    assert reader.sheet_names == ["Sheet1"]

    sheet = reader.load_sheet(0, n_rows=1)
    assert sheet.name == "Sheet1"
    assert sheet.height == 1
    assert sheet.width == 4

    timestamp = "2022-03-02 05:43:04"

    pd_dates = pd.Series([pd.Timestamp(timestamp)]).astype("datetime64[ms]")
    pd_expected = pd.DataFrame(
        {
            "__UNNAMED__0": [0.0],
            "bools": [True],
            "dates": pd_dates,
            "floats": [12.35],
        }
    )
    pd_assert_frame_equal(sheet.to_pandas(), pd_expected)

    parse_dates = pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    pl_expected = pl.DataFrame(
        {
            "__UNNAMED__0": [0.0],
            "bools": [True],
            "dates": [timestamp],
            "floats": [12.35],
        }
    ).with_columns(parse_dates)
    pl_assert_frame_equal(sheet.to_polars(), pl_expected)


def test_sheet_with_pagination_and_without_headers():
    """Pagination combined with ``header_row=None`` and custom ``column_names``."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    assert reader.sheet_names == ["Sheet1"]

    load_options = {
        "n_rows": 1,
        "skip_rows": 1,
        "header_row": None,
        "column_names": ["This", "Is", "Amazing", "Stuff"],
    }
    sheet = reader.load_sheet(0, **load_options)
    assert sheet.name == "Sheet1"
    assert sheet.height == 1
    assert sheet.width == 4

    timestamp = "2022-03-02 05:43:04"

    pd_dates = pd.Series([pd.Timestamp(timestamp)]).astype("datetime64[ms]")
    pd_expected = pd.DataFrame(
        {
            "This": [0.0],
            "Is": [True],
            "Amazing": pd_dates,
            "Stuff": [12.35],
        }
    )
    pd_assert_frame_equal(sheet.to_pandas(), pd_expected)

    parse_dates = pl.col("Amazing").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    pl_expected = pl.DataFrame(
        {
            "This": [0.0],
            "Is": [True],
            "Amazing": [timestamp],
            "Stuff": [12.35],
        }
    ).with_columns(parse_dates)
    pl_assert_frame_equal(sheet.to_polars(), pl_expected)


def test_sheet_with_pagination_out_of_bound():
    """Out-of-bound pagination parameters.

    Skipping more rows than the sheet holds raises ``InvalidParametersError``,
    whereas asking for more rows than available returns the rows that exist.
    """
    import re

    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    assert excel_reader.sheet_names == ["Sheet1"]

    # ``match`` is searched as a regular expression: escape the message so that
    # "." only matches a literal dot instead of any character.
    with pytest.raises(
        fastexcel.InvalidParametersError,
        match=re.escape("Too many rows skipped. Max height is 4"),
    ):
        excel_reader.load_sheet(
            0,
            skip_rows=1000000,
            header_row=None,
            column_names=["This", "Is", "Amazing", "Stuff"],
        )

    sheet = excel_reader.load_sheet(
        0,
        n_rows=1000000,
        skip_rows=1,
        header_row=None,
        column_names=["This", "Is", "Amazing", "Stuff"],
    )
    assert sheet.name == "Sheet1"
    assert sheet.height == 3
    assert sheet.width == 4

    pd_assert_frame_equal(
        sheet.to_pandas(),
        pd.DataFrame(
            {
                "This": [0.0, 1.0, 2.0],
                "Is": [True, False, True],
                "Amazing": pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype(
                    "datetime64[ms]"
                ),
                "Stuff": [12.35, 42.69, 1234567],
            }
        ),
    )

    pl_assert_frame_equal(
        sheet.to_polars(),
        pl.DataFrame(
            {
                "This": [0.0, 1.0, 2.0],
                "Is": [True, False, True],
                "Amazing": ["2022-03-02 05:43:04"] * 3,
                "Stuff": [12.35, 42.69, 1234567],
            }
        ).with_columns(
            pl.col("Amazing").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
        ),
    )


def test_sheet_with_na():
    """Test reading a sheet with #N/A cells. For now, we consider them as null"""
    reader = fastexcel.read_excel(path_for_fixture("sheet-with-na.xlsx"))
    sheet = reader.load_sheet(0)

    assert sheet.name == "Sheet1"
    assert sheet.height == sheet.total_height == 2
    assert sheet.width == 2

    # The first "Amount" cell is expected to come back as a null
    data = {"Title": ["A", "B"], "Amount": [None, 100.0]}
    pd_expected = pd.DataFrame(data)
    pl_expected = pl.DataFrame(data)
    pd_assert_frame_equal(sheet.to_pandas(), pd_expected)
    pl_assert_frame_equal(sheet.to_polars(), pl_expected)


def test_sheet_with_ref():
    """Test reading a sheet with #REF! cells. For now, we consider them as null"""
    reader = fastexcel.read_excel(path_for_fixture("sheet-with-na.xlsx"))
    sheet = reader.load_sheet("Broken refs")

    assert sheet.name == "Broken refs"
    assert sheet.height == sheet.total_height == 2
    assert sheet.width == 1

    # The second "numbers" cell is expected to come back as a null
    data = {"numbers": [1.0, None]}
    pd_expected = pd.DataFrame(data)
    pl_expected = pl.DataFrame(data)
    pd_assert_frame_equal(sheet.to_pandas(), pd_expected)
    pl_assert_frame_equal(sheet.to_polars(), pl_expected)


@pytest.mark.parametrize("excel_file", ["sheet-null-strings.xlsx", "sheet-null-strings-empty.xlsx"])
def test_null_strings(excel_file: str, expected_data_sheet_null_strings: dict[str, list[Any]]):
    """Both fixture files load into the data given by ``expected_data_sheet_null_strings``."""
    reader = fastexcel.read_excel(path_for_fixture(excel_file))
    sheet = reader.load_sheet(0)

    assert sheet.height == sheet.total_height == 10
    assert sheet.width == 6

    # Columns whose expected values are converted to millisecond resolution
    ms_columns = ("DATES_AND_NULLS", "TIMESTAMPS_AND_NULLS")

    pd_expected = pd.DataFrame(expected_data_sheet_null_strings)
    for column in ms_columns:
        pd_expected[column] = pd_expected[column].dt.as_unit("ms")
    pd_assert_frame_equal(sheet.to_pandas(), pd_expected)

    pl_expected = pl.DataFrame(expected_data_sheet_null_strings).with_columns(
        *(pl.col(column).dt.cast_time_unit("ms") for column in ms_columns)
    )
    pl_assert_frame_equal(sheet.to_polars(), pl_expected)


def test_null_values_in_cells() -> None:
    """The ``Date`` column is expected to hold two nulls followed by two datetimes."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-invalid-cell-value.xlsx"))
    sheet = reader.load_sheet(0)

    data = {
        "Title": ["A", "B", "C", "D"],
        "Date": [None, None, datetime(2021, 1, 1), datetime(2021, 5, 5)],
    }

    # polars: compare with the dates cast to millisecond resolution
    pl_expected = pl.DataFrame(data).with_columns(pl.col("Date").dt.cast_time_unit("ms"))
    pl_assert_frame_equal(sheet.to_polars(), pl_expected)

    # pandas: same comparison, with the dates converted to millisecond resolution
    pd_expected = pd.DataFrame(data)
    pd_expected["Date"] = pd_expected["Date"].dt.as_unit("ms")
    pd_assert_frame_equal(sheet.to_pandas(), pd_expected)


def test_invalid_value_num() -> None:
    """The ``Column`` column of the fixture is expected to load as ``[8.0, null]``."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-invalid-cell-value-num.xlsx"))
    sheet = reader.load_sheet(0)

    data = {"Column": [8.0, None]}
    pd_expected = pd.DataFrame(data)
    pl_expected = pl.DataFrame(data)
    pd_assert_frame_equal(sheet.to_pandas(), pd_expected)
    pl_assert_frame_equal(sheet.to_polars(), pl_expected)


def test_null_column_is_nullable() -> None:
    """The ``nullonly`` field of the Arrow schema must be flagged as nullable."""
    reader = fastexcel.read_excel(path_for_fixture("null-column.xlsx"))
    sheet = reader.load_sheet(0)
    null_field = sheet.to_arrow().schema.field("nullonly")
    assert null_field.nullable is True


def test_sheet_with_decimal_numbers() -> None:
    """Decimal numbers load as floats by default and as exact text with a string dtype."""
    fixture_path = path_for_fixture("decimal-numbers.xlsx")

    # Default dtype
    as_floats = fastexcel.read_excel(fixture_path).load_sheet(0)
    expected_floats = pl.DataFrame({"Decimals": [28.14, 29.02]})
    pl_assert_frame_equal(as_floats.to_polars(), expected_floats)

    # First column explicitly requested as strings
    as_strings = fastexcel.read_excel(fixture_path).load_sheet(0, dtypes={0: "string"})
    expected_strings = pl.DataFrame({"Decimals": ["28.14", "29.02"]})
    pl_assert_frame_equal(as_strings.to_polars(), expected_strings)


@pytest.mark.parametrize(
    "header_row, skip_rows, expected",
    [
        (0, None, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),  # default
        (
            None,
            0,
            {
                "__UNNAMED__0": [None, None, "a", "b", "c", "d", "e", "f"],
                "__UNNAMED__1": [None, None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
            },
        ),
        (
            None,
            None,
            {
                "__UNNAMED__0": ["a", "b", "c", "d", "e", "f"],
                "__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
            },
        ),
        (
            0,
            0,
            {
                "__UNNAMED__0": [None, "a", "b", "c", "d", "e", "f"],
                "__UNNAMED__1": [None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
            },
        ),
        (
            0,
            1,
            {
                "__UNNAMED__0": ["a", "b", "c", "d", "e", "f"],
                "__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
            },
        ),
        (
            None,
            2,
            {
                "__UNNAMED__0": ["a", "b", "c", "d", "e", "f"],
                "__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
            },
        ),
        (
            None,
            3,
            {"__UNNAMED__0": ["b", "c", "d", "e", "f"], "__UNNAMED__1": [1.0, 2.0, 3.0, 4.0, 5.0]},
        ),
        (
            1,
            0,
            {
                "__UNNAMED__0": ["a", "b", "c", "d", "e", "f"],
                "__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
            },
        ),
        (2, 0, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),
        (2, None, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),
        (2, 1, {"a": ["c", "d", "e", "f"], "0": [2.0, 3.0, 4.0, 5.0]}),
        (2, [1, 3], {"a": ["b", "d", "f"], "0": [1.0, 3.0, 5.0]}),
        (2, [0], {"a": ["c", "d", "e", "f"], "0": [2.0, 3.0, 4.0, 5.0]}),
        (
            None,
            [2, 4],
            {
                "__UNNAMED__0": [None, None, "b", "d", "e", "f"],
                "__UNNAMED__1": [None, None, 1.0, 3.0, 4.0, 5.0],
            },
        ),
        (2, [], {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),
        (2, [0, 1, 2, 3], {"a": ["f"], "0": [5.0]}),
        (2, lambda x: x % 2 == 0, {"a": ["c", "e"], "0": [2.0, 4.0]}),
        (2, lambda x: x in [0, 4], {"a": ["c", "d", "e"], "0": [2.0, 3.0, 4.0]}),
        (2, lambda x: False, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),
        (2, lambda x: x != 2, {"a": ["d"], "0": [3.0]}),
    ],
)
def test_header_row_and_skip_rows(
    header_row: int | None,
    skip_rows: int | list[int] | Callable[[int], bool] | None,
    expected: dict[str, Any],
) -> None:
    """Check how ``header_row`` and ``skip_rows`` combine when loading ``no-header.xlsx``.

    Args:
        header_row: Header row index forwarded to ``load_sheet``, or ``None``.
        skip_rows: Forwarded to ``load_sheet``. The cases cover ``None``, an int,
            a list of ints and a callable taking an int and returning a bool.
        expected: Column name to values mapping the polars frame must equal.
    """
    pl_assert_frame_equal(
        fastexcel.read_excel(path_for_fixture("no-header.xlsx"))
        .load_sheet(0, header_row=header_row, skip_rows=skip_rows)
        .to_polars(),
        pl.DataFrame(expected),
    )


def test_null_bytes_in_column_names() -> None:
    """Regression test for https://github.com/ToucanToco/fastexcel/issues/343"""
    excel_reader = fastexcel.read_excel(path_for_fixture("null-bytes-in-columns-names.xls"))
    sheet = excel_reader.load_sheet(0)
    frame = sheet.to_polars()
    # (rows, columns) expected for this fixture
    assert frame.shape == (8_763, 11)


================================================
FILE: python/tests/test_pycapsule.py
================================================
"""Tests for the Arrow PyCapsule Interface implementation."""

import fastexcel
import pandas as pd
import polars as pl

from .utils import path_for_fixture


def test_sheet_arrow_c_schema():
    """Test that a sheet's __arrow_c_schema__ returns a PyCapsule."""
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    sheet = excel_reader.load_sheet("January")

    schema_capsule = sheet.__arrow_c_schema__()

    # Check the returned object is a PyCapsule, judged by the repr of its type.
    # The capsule's own name is not inspected here.
    assert "PyCapsule" in str(type(schema_capsule))


def test_sheet_arrow_c_array():
    """A sheet's __arrow_c_array__ must return a pair of PyCapsules."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    sheet = reader.load_sheet("January")

    # Unpacking fails if the result is not a 2-item sequence
    schema_capsule, array_capsule = sheet.__arrow_c_array__()

    # Both items must be PyCapsules
    for capsule in (schema_capsule, array_capsule):
        assert "PyCapsule" in str(type(capsule))


def test_table_arrow_c_schema():
    """A table's __arrow_c_schema__ must return a PyCapsule."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
    first_table_name = reader.table_names()[0]  # Should be 'users'

    table = reader.load_table(first_table_name)
    capsule_type = type(table.__arrow_c_schema__())

    # Check it's a PyCapsule
    assert "PyCapsule" in str(capsule_type)


def test_table_arrow_c_array():
    """A table's __arrow_c_array__ must return a pair of PyCapsules."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
    first_table_name = reader.table_names()[0]  # Should be 'users'

    table = reader.load_table(first_table_name)
    # Unpacking fails if the result is not a 2-item sequence
    schema_capsule, array_capsule = table.__arrow_c_array__()

    # Both items must be PyCapsules
    for capsule in (schema_capsule, array_capsule):
        assert "PyCapsule" in str(type(capsule))


def test_pycapsule_interface_with_requested_schema():
    """__arrow_c_array__ accepts an explicit ``requested_schema`` argument."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    sheet = reader.load_sheet("January")

    # Test with None (current implementation ignores this)
    requested_schema = None
    schema_capsule, array_capsule = sheet.__arrow_c_array__(requested_schema)

    for capsule in (schema_capsule, array_capsule):
        assert "PyCapsule" in str(type(capsule))


def test_integration_with_polars():
    """polars can build a DataFrame directly from a loaded sheet object."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    sheet = reader.load_sheet("January")

    # The sheet is handed to polars as-is, which tests the actual interoperability
    # through the PyCapsule interface
    frame = pl.DataFrame(sheet)

    assert len(frame) == 2
    assert frame.columns == ["Month", "Year"]


def test_to_polars_without_pyarrow():
    """Test that to_polars() works via PyCapsule interface without pyarrow."""
    # Sheet: this should go through the PyCapsule interface, not requiring pyarrow
    sheet_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    sheet_frame = sheet_reader.load_sheet("January").to_polars()

    assert isinstance(sheet_frame, pl.DataFrame)
    assert len(sheet_frame) == 2
    assert sheet_frame.columns == ["Month", "Year"]

    # Table: same check on the first table of another fixture
    table_reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
    first_table_name = table_reader.table_names()[0]
    table_frame = table_reader.load_table(first_table_name).to_polars()
    assert isinstance(table_frame, pl.DataFrame)


def test_to_pandas_still_requires_pyarrow():
    """Test that to_pandas() currently still requires pyarrow.

    Note: pandas PyCapsule interface would require implementing __dataframe__
    or __arrow_c_stream__, which we don't currently do.
    """
    # Sheet: this still requires pyarrow for now
    sheet_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    sheet_frame = sheet_reader.load_sheet("January").to_pandas()

    assert isinstance(sheet_frame, pd.DataFrame)
    assert len(sheet_frame) == 2
    assert list(sheet_frame.columns) == ["Month", "Year"]

    # Table: same check on the first table of another fixture
    table_reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
    first_table_name = table_reader.table_names()[0]
    table_frame = table_reader.load_table(first_table_name).to_pandas()
    assert isinstance(table_frame, pd.DataFrame)


================================================
FILE: python/tests/test_sheet_visibility.py
================================================
import fastexcel

from .utils import path_for_fixture


def test_sheet_visibilities() -> None:
    """Sheets 0, 1 and 2 of the fixture report the three expected ``visible`` values."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-sheets-different-visibilities.xlsx"))

    # Expected ``visible`` value for each sheet, in sheet-index order
    expected_by_index = ["visible", "hidden", "veryhidden"]
    for sheet_idx, expected_visibility in enumerate(expected_by_index):
        assert reader.load_sheet(sheet_idx).visible == expected_visibility


================================================
FILE: python/tests/test_shifted_data.py
================================================
import fastexcel

from .utils import path_for_fixture


def test_sheet_with_offset():
    """Columns of the ``without-table`` sheet report both relative and absolute indices."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
    sheet = reader.load_sheet("without-table")

    # (name, dtype, column_name_from) of each expected column, in order
    column_specs = [
        ("Column at H10", "float", "looked_up"),
        ("Column at I10", "float", "looked_up"),
        ("__UNNAMED__2", "string", "generated"),
        ("Column at K10", "float", "looked_up"),
    ]
    # The expected absolute indices start at 7 and follow the relative ones
    expected_columns = [
        fastexcel.ColumnInfo(
            name=name,
            index=position,
            absolute_index=7 + position,
            dtype=dtype,
            dtype_from="guessed",
            column_name_from=name_from,
        )
        for position, (name, dtype, name_from) in enumerate(column_specs)
    ]

    assert sheet.available_columns() == expected_columns


def test_table_with_offset():
    """Columns of the ``TableAtD5`` table report both relative and absolute indices."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
    table = reader.load_table("TableAtD5")

    # The expected absolute indices start at 3 and follow the relative ones
    expected_columns = [
        fastexcel.ColumnInfo(
            name=name,
            index=position,
            absolute_index=3 + position,
            dtype="float",
            dtype_from="guessed",
            column_name_from="provided",
        )
        for position, name in enumerate(["Column at D5", "Column at E5"])
    ]

    assert table.available_columns() == expected_columns


================================================
FILE: python/tests/test_tables.py
================================================
from datetime import datetime

import fastexcel
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal

from .utils import path_for_fixture


@pytest.mark.parametrize("path", ("sheet-with-tables.xlsx",))
def test_table_names(path: str) -> None:
    """``table_names()`` without argument returns the single ``users`` table."""
    reader = fastexcel.read_excel(path_for_fixture(path))
    found_tables = reader.table_names()

    assert found_tables == ["users"]


@pytest.mark.parametrize("path", ("sheet-with-tables.xlsx",))
def test_table_names_with_sheet_name(path: str) -> None:
    """``table_names(sheet_name)`` only returns the tables of the given sheet."""
    reader = fastexcel.read_excel(path_for_fixture(path))

    # "sheet1" is expected to hold the "users" table, "sheet2" no table at all
    tables_in_sheet1 = reader.table_names("sheet1")
    assert tables_in_sheet1 == ["users"]

    tables_in_sheet2 = reader.table_names("sheet2")
    assert tables_in_sheet2 == []


@pytest.mark.parametrize("path", ("sheet-with-tables.xlsx",))
def test_load_table(path: str) -> None:
    """Load the `users` table and check its metadata plus its polars, pandas and eager outputs."""
    reader = fastexcel.read_excel(path_for_fixture(path))
    table = reader.load_table("users")

    assert table.name == "users"
    assert table.sheet_name == "sheet1"
    assert table.specified_dtypes is None

    # (column name, guessed dtype): in this fixture the position of a column is both its
    # relative and its absolute index
    expected_columns = [
        fastexcel.ColumnInfo(
            name=column_name,
            index=position,
            absolute_index=position,
            dtype=dtype,
            dtype_from="guessed",
            column_name_from="provided",
        )
        for position, (column_name, dtype) in enumerate(
            (
                ("User Id", "float"),
                ("FirstName", "string"),
                ("LastName", "string"),
                ("Date", "datetime"),
            )
        )
    ]
    assert table.available_columns() == expected_columns
    assert table.total_height == 3
    assert table.offset == 0
    assert table.height == 3
    assert table.width == 4

    # Raw values shared by the polars and pandas expectations
    user_ids = [1.0, 2.0, 5.0]
    first_names = ["Peter", "John", "Hans"]
    last_names = ["Müller", "Meier", "Fricker"]
    dates = [datetime(2020, 1, 1), datetime(2024, 5, 4), datetime(2025, 2, 1)]

    polars_expected = pl.DataFrame(
        {
            "User Id": user_ids,
            "FirstName": first_names,
            "LastName": last_names,
            "Date": dates,
        }
    ).with_columns(pl.col("Date").dt.cast_time_unit("ms"))
    pl_assert_frame_equal(table.to_polars(), polars_expected)

    pandas_expected = pd.DataFrame(
        {
            "User Id": user_ids,
            "FirstName": first_names,
            "LastName": last_names,
            "Date": pd.Series(dates).astype("datetime64[ms]"),
        }
    )
    pd_assert_frame_equal(table.to_pandas(), pandas_expected)

    # With eager=True the returned object is converted through Arrow / pandas directly
    eager_table = reader.load_table("users", eager=True)
    eager_polars = pl.from_arrow(eager_table)
    assert isinstance(eager_polars, pl.DataFrame)
    pl_assert_frame_equal(eager_polars, polars_expected)
    pd_assert_frame_equal(eager_table.to_pandas(), pandas_expected)


================================================
FILE: python/tests/test_whitespace.py
================================================
import datetime

import fastexcel
import polars as pl
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal

from .utils import path_for_fixture


def test_skip_tail_whitespace_rows() -> None:
    """Test that skip_whitespace_tail_rows option works correctly.

    The same expectations are checked against both the "Without Table" sheet and the
    "Table_with_whitespace" table of the fixture, with and without the option.
    """
    excel_reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-whitespace.xlsx"))

    # Expected data when NOT skipping whitespace tail rows: all 10 rows are present, and
    # "Column One" holds strings because its last row contains a single space
    expected_with_whitespace = pl.DataFrame(
        {
            "Column One": ["1", "2", "3", None, "5", None, None, None, None, " "],
            "Column Two": ["one", "two", None, "four", "five", None, None, "", None, None],
            "Column Three": [
                datetime.datetime(2025, 11, 19, 14, 34, 2),
                datetime.datetime(2025, 11, 20, 14, 56, 34),
                datetime.datetime(2025, 11, 21, 15, 19, 6),
                None,
                datetime.datetime(2025, 11, 22, 15, 41, 38),
                datetime.datetime(2025, 11, 23, 16, 4, 10),
                None,
                None,
                None,
                None,
            ],
        }
    ).with_columns(pl.col("Column Three").dt.cast_time_unit("ms"))

    # Expected data when skipping whitespace tail rows: the last four rows (only nulls, an
    # empty string or a space) are gone, and "Column One" is now expected to be float
    expected_without_whitespace = pl.DataFrame(
        {
            "Column One": [1.0, 2.0, 3.0, None, 5.0, None],
            "Column Two": ["one", "two", None, "four", "five", None],
            "Column Three": [
                datetime.datetime(2025, 11, 19, 14, 34, 2),
                datetime.datetime(2025, 11, 20, 14, 56, 34),
                datetime.datetime(2025, 11, 21, 15, 19, 6),
                None,
                datetime.datetime(2025, 11, 22, 15, 41, 38),
                datetime.datetime(2025, 11, 23, 16, 4, 10),
            ],
        }
    ).with_columns(pl.col("Column Three").dt.cast_time_unit("ms"))

    # Test sheet without skipping whitespace tail rows
    sheet_with_whitespace = excel_reader.load_sheet("Without Table")
    pl_assert_frame_equal(sheet_with_whitespace.to_polars(), expected_with_whitespace)

    # Test table without skipping whitespace tail rows
    table_with_whitespace = excel_reader.load_table("Table_with_whitespace")
    pl_assert_frame_equal(table_with_whitespace.to_polars(), expected_with_whitespace)

    # Test sheet with skipping whitespace tail rows
    sheet_without_whitespace = excel_reader.load_sheet(
        "Without Table", skip_whitespace_tail_rows=True
    )
    pl_assert_frame_equal(sheet_without_whitespace.to_polars(), expected_without_whitespace)

    # Test table with skipping whitespace tail rows
    table_without_whitespace = excel_reader.load_table(
        "Table_with_whitespace", skip_whitespace_tail_rows=True
    )
    pl_assert_frame_equal(table_without_whitespace.to_polars(), expected_without_whitespace)

    # Also verify pandas compatibility (the expected frame is converted from polars)
    pd_assert_frame_equal(
        sheet_without_whitespace.to_pandas(), expected_without_whitespace.to_pandas()
    )
    pd_assert_frame_equal(
        table_without_whitespace.to_pandas(), expected_without_whitespace.to_pandas()
    )


def test_skip_tail_rows_and_whitespace_as_null_behavior() -> None:
    """Test `whitespace_as_null` alone and combined with `skip_whitespace_tail_rows`.

    The same expectations are checked against both the "Without Table" sheet and the
    "Table_with_whitespace" table of the fixture.
    """
    excel_reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-whitespace.xlsx"))

    # Expected data when converting whitespace to null but not skipping tail rows
    expected_with_whitespace_as_null = pl.DataFrame(
        {
            # All rows should be taken into account but the space in the last row should be
            # considered null
            "Column One": [1.0, 2.0, 3.0, None, 5.0, None, None, None, None, None],
            # All rows should be taken into account but the empty string in 8th row should be
            # considered null
            "Column Two": ["one", "two", None, "four", "five", None, None, None, None, None],
            "Column Three": [
                datetime.datetime(2025, 11, 19, 14, 34, 2),
                datetime.datetime(2025, 11, 20, 14, 56, 34),
                datetime.datetime(2025, 11, 21, 15, 19, 6),
                None,
                datetime.datetime(2025, 11, 22, 15, 41, 38),
                datetime.datetime(2025, 11, 23, 16, 4, 10),
                None,
                None,
                None,
                None,
            ],
        }
    ).with_columns(pl.col("Column Three").dt.cast_time_unit("ms"))

    # Expected data when converting whitespace to null and skipping tail rows: only the
    # first six rows remain
    expected_without_whitespace = pl.DataFrame(
        {
            "Column One": [1.0, 2.0, 3.0, None, 5.0, None],
            "Column Two": ["one", "two", None, "four", "five", None],
            "Column Three": [
                datetime.datetime(2025, 11, 19, 14, 34, 2),
                datetime.datetime(2025, 11, 20, 14, 56, 34),
                datetime.datetime(2025, 11, 21, 15, 19, 6),
                None,
                datetime.datetime(2025, 11, 22, 15, 41, 38),
                datetime.datetime(2025, 11, 23, 16, 4, 10),
            ],
        }
    ).with_columns(pl.col("Column Three").dt.cast_time_unit("ms"))

    # Test sheet with whitespace_as_null but not skipping tail rows
    sheet_with_whitespace_as_null = excel_reader.load_sheet(
        "Without Table", whitespace_as_null=True
    )
    pl_assert_frame_equal(
        sheet_with_whitespace_as_null.to_polars(), expected_with_whitespace_as_null
    )

    # Test table with whitespace_as_null but not skipping tail rows
    table_with_whitespace_as_null = excel_reader.load_table(
        "Table_with_whitespace", whitespace_as_null=True
    )
    pl_assert_frame_equal(
        table_with_whitespace_as_null.to_polars(), expected_with_whitespace_as_null
    )

    # Test sheet with both whitespace_as_null and skip_whitespace_tail_rows
    sheet_without_whitespace = excel_reader.load_sheet(
        "Without Table", whitespace_as_null=True, skip_whitespace_tail_rows=True
    )
    pl_assert_frame_equal(sheet_without_whitespace.to_polars(), expected_without_whitespace)

    # Test table with both whitespace_as_null and skip_whitespace_tail_rows
    table_without_whitespace = excel_reader.load_table(
        "Table_with_whitespace", whitespace_as_null=True, skip_whitespace_tail_rows=True
    )
    pl_assert_frame_equal(table_without_whitespace.to_polars(), expected_without_whitespace)

    # Also verify pandas compatibility (the expected frames are converted from polars)
    pd_assert_frame_equal(
        sheet_without_whitespace.to_pandas(), expected_without_whitespace.to_pandas()
    )
    pd_assert_frame_equal(
        sheet_with_whitespace_as_null.to_pandas(), expected_with_whitespace_as_null.to_pandas()
    )
    pd_assert_frame_equal(
        table_without_whitespace.to_pandas(), expected_without_whitespace.to_pandas()
    )
    pd_assert_frame_equal(
        table_with_whitespace_as_null.to_pandas(), expected_with_whitespace_as_null.to_pandas()
    )


================================================
FILE: python/tests/utils.py
================================================
from __future__ import annotations

from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd


def path_for_fixture(fixture_file: str) -> str:
    """Return the path of `fixture_file` as a string.

    The path is `tests/fixtures/<fixture_file>` under the third-level parent of this file.
    """
    base_dir = Path(__file__).parent.parent.parent
    fixture_path = base_dir.joinpath("tests", "fixtures", fixture_file)
    return str(fixture_path)


def get_expected_pandas_dtype(fastexcel_dtype: str) -> Any:
    """Return the pandas dtype expected for a fastexcel dtype name, given the pandas version.

    String columns are expected as object dtype before pandas 3.0 and as
    `StringDtype(na_value=nan)` from 3.0 on. Date columns are always object dtype.

    Raises:
        ValueError: if `fastexcel_dtype` is not a known fastexcel dtype name.
    """
    # Only the major and minor components matter for the string dtype switch
    major_minor = tuple(int(part) for part in pd.__version__.split(".")[:2])

    # dtypes whose numpy counterpart does not depend on the pandas version
    numpy_names = {
        "int": "int64",
        "float": "float64",
        "boolean": "bool",
        "datetime": "datetime64[ms]",
        "duration": "timedelta64[ms]",
    }
    numpy_name = numpy_names.get(fastexcel_dtype)
    if numpy_name is not None:
        return np.dtype(numpy_name)

    if fastexcel_dtype == "date":
        # Date columns are always object dtype
        return np.dtype("object")

    if fastexcel_dtype != "string":
        raise ValueError(f"Unknown fastexcel dtype: {fastexcel_dtype}")

    if major_minor < (3, 0):
        return np.dtype("object")
    # When converting from Arrow, pandas uses nan as na_value
    return pd.StringDtype(na_value=np.nan)


def assert_pandas_dtypes(df: pd.DataFrame, expected_dtypes: dict[str, str]) -> None:
    """Check that each listed column of `df` has the pandas dtype matching its fastexcel dtype.

    Args:
        df: The pandas DataFrame to check
        expected_dtypes: A dict mapping column names to fastexcel dtype strings

    Raises:
        AssertionError: on the first column whose dtype differs from the expected one.
    """
    for column in expected_dtypes:
        wanted = get_expected_pandas_dtype(expected_dtypes[column])
        found = df[column].dtype
        assert found == wanted, f"Column '{column}': expected dtype {wanted}, got {found}"


================================================
FILE: scripts/update_versions.py
================================================
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.9"
# dependencies = []
# ///
"""Manage docs/versions.json and generate the root docs/index.html redirect."""

from __future__ import annotations

import argparse
import json
import re
from pathlib import Path


def parse_semver(version: str) -> tuple[int, ...]:
    """Return every run of digits found in `version`, in order, e.g. 'v0.19.0' -> (0, 19, 0)."""
    digit_runs = re.findall(r"\d+", version)
    return tuple(map(int, digit_runs))


def sort_versions(versions: list[dict]) -> list[dict]:
    """Sort: stable first, then tags descending by semver, 'latest' last.

    Entries flagged as stable and entries whose path is 'latest' keep their relative input
    order. Tags are ordered by the numeric parts of their path, highest first, so that a
    longer version such as 'v1.2.1' comes before its prefix 'v1.2'. Tags with identical
    numeric parts are ordered by ascending path.

    Args:
        versions: entries holding at least a "path" key and optionally a "stable" flag.

    Returns:
        A new sorted list; the input list is left untouched.
    """
    stable_entries: list[dict] = []
    tagged_entries: list[dict] = []
    latest_entries: list[dict] = []
    for entry in versions:
        path = entry["path"]
        if entry.get("stable"):
            stable_entries.append(entry)
        elif path == "latest":
            latest_entries.append(entry)
        else:
            tagged_entries.append(entry)

    # Two stable passes: first the tie-breaker (ascending path), then the numeric parts in
    # reverse. Negating each tuple element instead would not reverse the ordering between a
    # version and its own prefix (v1.2 would sort before v1.2.1).
    tagged_entries.sort(key=lambda entry: entry["path"])
    tagged_entries.sort(key=lambda entry: parse_semver(entry["path"]), reverse=True)

    return stable_entries + tagged_entries + latest_entries


def update_versions(docs_dir: Path, version: str, *, stable: bool) -> None:
    """Register `version` in `<docs_dir>/versions.json` and regenerate `<docs_dir>/index.html`.

    Any previous entry for `version` is replaced. When `stable` is set, entries previously
    flagged as stable lose the flag and get their path as label. The generated index page
    redirects to the stable entry if there is one, otherwise to `version`.

    Raises:
        ValueError: if `version` is neither 'latest' nor of the form 'v<digits>(.<digits>)*'.
    """
    if re.fullmatch(r"latest|v\d+(\.\d+)*", version) is None:
        raise ValueError(f"Invalid version '{version}': must be 'latest' or match 'v<semver>' (e.g. v0.19.0)")

    versions_path = docs_dir / "versions.json"
    existing = json.loads(versions_path.read_text()) if versions_path.exists() else []

    # Build label
    if version == "latest":
        label = "latest (main)"
    else:
        label = f"{version} (stable)" if stable else version

    # Skip the old entry for this version, and demote the previous stable entries if this
    # one is now stable
    entries = [
        {**entry, "stable": False, "label": entry["path"]}
        if stable and entry.get("stable")
        else entry
        for entry in existing
        if entry["path"] != version
    ]
    entries.append({"label": label, "path": version, "stable": stable})
    entries = sort_versions(entries)

    versions_path.write_text(json.dumps(entries, indent=2) + "\n")

    # Generate root redirect
    current_stable = next((entry for entry in entries if entry.get("stable")), None)
    redirect_path = version if current_stable is None else current_stable["path"]
    redirect_html = f"""\
<!doctype html>
<html>
<head>
    <meta charset="utf-8">
    <meta http-equiv="refresh" content="0; url=./{redirect_path}/fastexcel.html"/>
</head>
<body>
    <p>Redirecting to <a href="./{redirect_path}/fastexcel.html">{redirect_path} documentation</a>...</p>
</body>
</html>
"""
    (docs_dir / "index.html").write_text(redirect_html)


def main() -> None:
    """Parse the command line arguments and update the docs versions accordingly."""
    cli = argparse.ArgumentParser(description="Update docs versions.json")
    cli.add_argument("--version", required=True, help="Version name (e.g. v0.19.0 or latest)")
    cli.add_argument("--stable", action="store_true", help="Mark this version as the stable default")
    cli.add_argument("--docs-dir", default="docs", help="Path to the docs directory")
    options = cli.parse_args()

    docs_dir = Path(options.docs_dir)
    update_versions(docs_dir, options.version, stable=options.stable)


# Only run the command line interface when this file is executed as a script
if __name__ == "__main__":
    main()


================================================
FILE: src/data/cell_extractors.rs
================================================
use calamine::{CellType, DataType};
use chrono::{NaiveDate, NaiveDateTime, TimeDelta};

use crate::types::dtype::excel_float_to_string;

/// Interprets a cell as a boolean.
///
/// A boolean cell is returned as-is. Otherwise, an integer or a float cell is `true` when it is
/// different from zero. Returns `None` when the cell exposes none of these values.
pub(super) fn extract_boolean<DT: CellType + DataType>(cell: &DT) -> Option<bool> {
    cell.get_bool()
        .or_else(|| cell.get_int().map(|int| int != 0))
        .or_else(|| cell.get_float().map(|float| float != 0.0))
}

/// Extracts an integer from a cell by delegating to the cell's `as_i64` conversion.
///
/// Returns `None` when that conversion yields nothing.
pub(super) fn extract_int<DT: CellType + DataType>(cell: &DT) -> Option<i64> {
    cell.as_i64()
}

/// Extracts a float from a cell by delegating to the cell's `as_f64` conversion.
///
/// Returns `None` when that conversion yields nothing.
pub(super) fn extract_float<DT: CellType + DataType>(cell: &DT) -> Option<f64> {
    cell.as_f64()
}

/// Renders a cell as a string, choosing the representation based on the kind of the cell.
///
/// The kinds are checked in this order: string, datetime, ISO datetime, boolean, float. Any
/// other cell falls back to the cell's `as_string` conversion.
pub(super) fn extract_string<DT: CellType + DataType>(cell: &DT) -> Option<String> {
    if cell.is_string() {
        return cell.get_string().map(str::to_string);
    }
    if cell.is_datetime() {
        // `None` when the Excel datetime cannot be represented as a datetime
        let datetime = cell.get_datetime()?.as_datetime()?;
        return Some(datetime.to_string());
    }
    if cell.is_datetime_iso() {
        return cell.get_datetime_iso().map(str::to_string);
    }
    if cell.is_bool() {
        return cell.get_bool().map(|value| value.to_string());
    }
    if cell.is_float() {
        // Floats go through the crate's dedicated formatter
        return cell.get_float().map(excel_float_to_string);
    }
    cell.as_string()
}

/// Extracts a date from a cell by delegating to the cell's `as_date` conversion.
///
/// Returns `None` when that conversion yields nothing.
pub(super) fn extract_date<DT: CellType + DataType>(cell: &DT) -> Option<NaiveDate> {
    cell.as_date()
}

/// January 1st, 1970: the reference date from which day counts are computed.
#[cfg(feature = "python")]
const EPOCH: NaiveDate = NaiveDate::from_ymd_opt(1970, 1, 1).expect("Failed to create EPOCH");

/// Extracts a date from a cell as a signed number of days since [`EPOCH`].
///
/// Returns `None` when the cell holds no date or when the day count does not fit in an `i32`.
#[cfg(feature = "python")]
pub(super) fn extract_date_as_num_days<DT: CellType + DataType>(cell: &DT) -> Option<i32> {
    let days_since_epoch = extract_date(cell)?
        .signed_duration_since(EPOCH)
        .num_days();
    i32::try_from(days_since_epoch).ok()
}

/// Extracts a datetime from a cell by delegating to the cell's `as_datetime` conversion.
///
/// Returns `None` when that conversion yields nothing.
pub(super) fn extract_datetime<DT: CellType + DataType>(cell: &DT) -> Option<NaiveDateTime> {
    cell.as_datetime()
}

/// Extracts a datetime from a cell as a millisecond timestamp, reading the naive datetime as
/// UTC.
///
/// Returns `None` when the cell holds no datetime.
#[cfg(feature = "python")]
pub(super) fn extract_datetime_as_timestamp_ms<DT: CellType + DataType>(cell: &DT) -> Option<i64> {
    let naive = extract_datetime(cell)?;
    Some(naive.and_utc().timestamp_millis())
}

/// Extracts a duration from a cell by delegating to the cell's `as_duration` conversion.
///
/// Returns `None` when that conversion yields nothing.
pub(super) fn extract_duration<DT: CellType + DataType>(cell: &DT) -> Option<TimeDelta> {
    cell.as_duration()
}

/// Extracts a duration from a cell as a whole number of milliseconds.
///
/// Returns `None` when the cell holds no duration.
#[cfg(feature = "python")]
pub(super) fn extract_duration_as_ms<DT: CellType + DataType>(cell: &DT) -> Option<i64> {
    let duration = extract_duration(cell)?;
    Some(duration.num_milliseconds())
}


================================================
FILE: src/data/mod.rs
================================================
mod cell_extractors;
#[cfg(feature = "python")]
mod python;
mod rust;
use chrono::{Duration, NaiveDate, NaiveDateTime};
#[cfg(feature = "python")]
pub(crate) use python::*;

use calamine::{CellType, Data as CalData, DataRef as CalDataRef, DataType, Range};

use crate::{
    data::rust::{
        create_boolean_vec, create_date_vec, create_datetime_vec, create_duration_vec,
        create_float_vec, create_int_vec, create_string_vec,
    },
    error::{FastExcelErrorKind, FastExcelResult},
    types::{
        dtype::{DType, DTypeCoercion, get_dtype_for_column},
        excelsheet::{SkipRows, column_info::ColumnInfo},
    },
};

/// The cells of a sheet or table, backed by a calamine [`Range`].
///
/// Both variants expose the same operations; they only differ in the cell type of the range.
#[derive(Debug)]
pub(crate) enum ExcelSheetData<'r> {
    /// A range of owned cells.
    Owned(Range<CalData>),
    /// A range of cells carrying the `'r` lifetime.
    Ref(Range<CalDataRef<'r>>),
}

impl ExcelSheetData<'_> {
    /// Width of the underlying range.
    pub(crate) fn width(&self) -> usize {
        match self {
            Self::Owned(cells) => cells.width(),
            Self::Ref(cells) => cells.width(),
        }
    }

    /// Height of the underlying range.
    pub(crate) fn height(&self) -> usize {
        match self {
            Self::Owned(cells) => cells.height(),
            Self::Ref(cells) => cells.height(),
        }
    }

    /// String representation of the cell at `pos`, or `None` when the range has no cell at
    /// `pos` or when the cell cannot be converted to a string.
    pub(super) fn get_as_string(&self, pos: (usize, usize)) -> Option<String> {
        match self {
            Self::Owned(cells) => cells.get(pos)?.as_string(),
            Self::Ref(cells) => cells.get(pos)?.as_string(),
        }
    }

    /// Determines the dtype of column `col` by forwarding every argument to
    /// [`get_dtype_for_column`] for the underlying range.
    pub(crate) fn dtype_for_column(
        &self,
        start_row: usize,
        end_row: usize,
        col: usize,
        dtype_coercion: &DTypeCoercion,
        whitespace_as_null: bool,
    ) -> FastExcelResult<DType> {
        match self {
            Self::Owned(cells) => get_dtype_for_column(
                cells,
                start_row,
                end_row,
                col,
                dtype_coercion,
                whitespace_as_null,
            ),
            Self::Ref(cells) => get_dtype_for_column(
                cells,
                start_row,
                end_row,
                col,
                dtype_coercion,
                whitespace_as_null,
            ),
        }
    }

    /// Height of the range once trailing whitespace-only rows are ignored, falling back to the
    /// full height when the free function of the same name returns `None`.
    pub(crate) fn height_without_tail_whitespace(&self) -> usize {
        let (trimmed_height, full_height) = match self {
            Self::Owned(cells) => (height_without_tail_whitespace(cells), cells.height()),
            Self::Ref(cells) => (height_without_tail_whitespace(cells), cells.height()),
        };
        trimmed_height.unwrap_or(full_height)
    }

    /// Start position of the range as `(row, column)`, or `None` when the range reports none.
    pub(crate) fn start(&self) -> Option<(usize, usize)> {
        let (row, col) = match self {
            Self::Owned(cells) => cells.start(),
            Self::Ref(cells) => cells.start(),
        }?;
        Some((row as usize, col as usize))
    }
}

/// Wraps a range of owned cells into [`ExcelSheetData::Owned`].
impl From<Range<CalData>> for ExcelSheetData<'_> {
    fn from(range: Range<CalData>) -> Self {
        Self::Owned(range)
    }
}

/// Wraps a range of `CalDataRef` cells into [`ExcelSheetData::Ref`], keeping their lifetime.
impl<'a> From<Range<CalDataRef<'a>>> for ExcelSheetData<'a> {
    fn from(range: Range<CalDataRef<'a>>) -> Self {
        Self::Ref(range)
    }
}

/// Tells whether a cell should be considered as whitespace.
trait CellIsWhiteSpace {
    /// Returns `true` when the cell holds nothing but whitespace.
    fn is_whitespace(&self) -> bool;
}

impl<T> CellIsWhiteSpace for T
where
    T: DataType,
{
    fn is_whitespace(&self) -> bool {
        if self.is_empty() {
            true
        } else if self.is_string()
            && let Some(s) = self.get_string()
        {
            s.trim().is_empty()
        } else {
            false
        }
    }
}

/// Computes the height of `data` once the whitespace-only rows at its bottom are ignored.
///
/// Returns `Some(0)` for a range without rows and `None` for a range with rows but no
/// columns. Otherwise, each column is scanned from the bottom up to its last non-whitespace
/// cell and the tallest column wins. The first row is never inspected, so the result is at
/// least 1 in that case.
pub(crate) fn height_without_tail_whitespace<CT: CellType + DataType + std::fmt::Debug>(
    data: &Range<CT>,
) -> Option<usize> {
    let (height, width) = (data.height(), data.width());
    if height < 1 {
        return Some(0);
    }
    if width < 1 {
        return None;
    }

    // Height of a single column: one past its lowest non-whitespace row, looking at every row
    // but the first one. Missing cells count as whitespace.
    let column_height = |col_idx: usize| {
        (1..height)
            .rev()
            .find(|&row_idx| {
                !data
                    .get((row_idx, col_idx))
                    .is_none_or(CellIsWhiteSpace::is_whitespace)
            })
            .map_or(1, |row_idx| row_idx + 1)
    };

    (0..width).map(column_height).max()
}

/// A container for a typed vector of values. Used to represent a column of data in an Excel sheet.
/// These should only be used when you need to work on the raw data. Otherwise, you should use a
/// `FastExcelColumn`.
///
/// In every variant but `Null`, a `None` entry stands for a missing value.
#[derive(Debug, Clone, PartialEq)]
pub enum FastExcelSeries {
    /// A series without values. It carries no length information.
    Null,
    /// Boolean values.
    Bool(Vec<Option<bool>>),
    /// String values.
    String(Vec<Option<String>>),
    /// Integer values.
    Int(Vec<Option<i64>>),
    /// Floating point values.
    Float(Vec<Option<f64>>),
    /// Datetime values, without timezone.
    Datetime(Vec<Option<NaiveDateTime>>),
    /// Date values.
    Date(Vec<Option<NaiveDate>>),
    /// Duration values.
    Duration(Vec<Option<Duration>>),
}

impl FastExcelSeries {
    /// The [`DType`] matching the variant of this series.
    pub fn dtype(&self) -> DType {
        match self {
            Self::Null => DType::Null,
            Self::Bool(_) => DType::Bool,
            Self::String(_) => DType::String,
            Self::Int(_) => DType::Int,
            Self::Float(_) => DType::Float,
            Self::Datetime(_) => DType::DateTime,
            Self::Date(_) => DType::Date,
            Self::Duration(_) => DType::Duration,
        }
    }

    /// Whether this series is the `Null` variant.
    pub fn is_null(&self) -> bool {
        matches!(self, Self::Null)
    }
}

// Generates, for the element type `$type` stored in the `FastExcelSeries::$variant` variant:
// - `From` conversions from vectors, arrays and slices of `$type` or `Option<$type>`
// - a `$into_fn` method extracting the underlying vector
macro_rules! impl_series_variant {
    ($type:ty, $variant:ident, $into_fn:ident) => {
        impl From<Vec<Option<$type>>> for FastExcelSeries {
            fn from(vec: Vec<Option<$type>>) -> Self {
                Self::$variant(vec)
            }
        }

        impl<const N: usize> From<[Option<$type>; N]> for FastExcelSeries {
            fn from(arr: [Option<$type>; N]) -> Self {
                // The array is owned: move its elements into the vector rather than cloning them
                Self::$variant(Vec::from(arr))
            }
        }

        impl<const N: usize> From<[$type; N]> for FastExcelSeries {
            fn from(arr: [$type; N]) -> Self {
                Self::$variant(arr.into_iter().map(Some).collect())
            }
        }

        impl From<&[$type]> for FastExcelSeries {
            fn from(arr: &[$type]) -> Self {
                Self::$variant(arr.iter().cloned().map(Some).collect())
            }
        }

        impl From<&[Option<$type>]> for FastExcelSeries {
            fn from(arr: &[Option<$type>]) -> Self {
                Self::$variant(arr.to_vec())
            }
        }

        // Not implementing is_empty here, because we have no len information for null Series
        impl FastExcelSeries {
            /// Consumes the series and returns its values.
            ///
            /// # Errors
            ///
            /// Returns an `InvalidParameters` error when the series is not of the matching
            /// variant.
            pub fn $into_fn(self) -> FastExcelResult<Vec<Option<$type>>> {
                if let Self::$variant(vec) = self {
                    Ok(vec)
                } else {
                    Err(FastExcelErrorKind::InvalidParameters(format!(
                        "{self:?} cannot be converted to {type_name}",
                        type_name = std::any::type_name::<$type>()
                    ))
                    .into())
                }
            }
        }
    };
}

// One invocation per typed variant: element type, variant name, name of the extraction method
impl_series_variant!(bool, Bool, into_bools);
impl_series_variant!(String, String, into_strings);
impl_series_variant!(i64, Int, into_ints);
impl_series_variant!(f64, Float, into_floats);
impl_series_variant!(NaiveDateTime, Datetime, into_datetimes);
impl_series_variant!(NaiveDate, Date, into_dates);
impl_series_variant!(Duration, Duration, into_durations);

// Implemented for arrays only: `From<AsRef<[&str]>>` would result in conflicting impls
/// Builds a string series from an array of optional string slices, copying every present value.
impl<const N: usize> From<[Option<&str>; N]> for FastExcelSeries {
    fn from(arr: [Option<&str>; N]) -> Self {
        let values = arr
            .into_iter()
            .map(|maybe_str| maybe_str.map(str::to_owned))
            .collect();
        Self::String(values)
    }
}

/// Builds a string series without missing values from an array of string slices.
impl<const N: usize> From<[&str; N]> for FastExcelSeries {
    fn from(arr: [&str; N]) -> Self {
        let values = arr
            .into_iter()
            .map(str::to_owned)
            .map(Some)
            .collect();
        Self::String(values)
    }
}

/// A column in a sheet or table. A wrapper around a `FastExcelSeries` and a name.
#[derive(Debug, Clone, PartialEq)]
pub struct FastExcelColumn {
    /// The name of the column.
    pub name: String,
    /// The values of the column.
    pub(crate) data: FastExcelSeries,
    /// The length of the column. Stored separately because a `FastExcelSeries::Null` carries
    /// no length information.
    len: usize,
}

impl FastExcelColumn {
    /// Creates a column from a name and a series.
    ///
    /// `len` is optional for typed series, in which case the length of the series is used.
    ///
    /// # Errors
    ///
    /// Returns an `InvalidColumn` error when `len` is provided but differs from the length of
    /// a typed series, or when `len` is missing for a `FastExcelSeries::Null`.
    pub fn try_new(
        name: String,
        data: FastExcelSeries,
        len: Option<usize>,
    ) -> FastExcelResult<Self> {
        // A null series has no length of its own
        let data_len = match &data {
            FastExcelSeries::Null => None,
            FastExcelSeries::Bool(values) => Some(values.len()),
            FastExcelSeries::String(values) => Some(values.len()),
            FastExcelSeries::Int(values) => Some(values.len()),
            FastExcelSeries::Float(values) => Some(values.len()),
            FastExcelSeries::Datetime(values) => Some(values.len()),
            FastExcelSeries::Date(values) => Some(values.len()),
            FastExcelSeries::Duration(values) => Some(values.len()),
        };
        let len = match (len, data_len) {
            (Some(len), Some(data_len)) if data_len != len => {
                return Err(FastExcelErrorKind::InvalidColumn(format!(
                    "Column '{name}' has length {data_len} but expected {len}"
                ))
                .into());
            }
            // The provided length wins; at this point it agrees with the data if there is any
            (Some(len), _) => len,
            (None, Some(data_len)) => data_len,
            (None, None) => {
                return Err(FastExcelErrorKind::InvalidColumn(
                    "`len` is mandatory for `FastExcelSeries::Null`".to_string(),
                )
                .into());
            }
        };
        Ok(Self { name, data, len })
    }

    /// Create a new null series with the given name and length.
    pub fn new_null<S: Into<String>>(name: S, len: usize) -> Self {
        let name = name.into();
        Self {
            name,
            data: FastExcelSeries::Null,
            len,
        }
    }

    /// Builds the column described by `column_info` from the rows `offset..limit` of `data`.
    ///
    /// The dtype of `column_info` selects the kind of series to create. `whitespace_as_null`
    /// is only forwarded for string columns.
    ///
    /// # Errors
    ///
    /// Returns an `InvalidParameters` error when `limit` is smaller than `offset`.
    pub(crate) fn try_from_column_info<CT: CellType + DataType>(
        column_info: &ColumnInfo,
        data: &Range<CT>,
        offset: usize,
        limit: usize,
        whitespace_as_null: bool,
    ) -> FastExcelResult<Self> {
        let Some(len) = limit.checked_sub(offset) else {
            return Err(FastExcelErrorKind::InvalidParameters(format!(
                "limit is smaller than offset: {limit} is smaller than {offset}"
            ))
            .into());
        };

        let col = column_info.index;
        let series = match column_info.dtype {
            DType::Null => FastExcelSeries::Null,
            DType::Int => FastExcelSeries::Int(create_int_vec(data, col, offset, limit)),
            DType::Float => FastExcelSeries::Float(create_float_vec(data, col, offset, limit)),
            DType::String => FastExcelSeries::String(create_string_vec(
                data,
                col,
                offset,
                limit,
                whitespace_as_null,
            )),
            DType::Bool => FastExcelSeries::Bool(create_boolean_vec(data, col, offset, limit)),
            DType::DateTime => {
                FastExcelSeries::Datetime(create_datetime_vec(data, col, offset, limit))
            }
            DType::Date => FastExcelSeries::Date(create_date_vec(data, col, offset, limit)),
            DType::Duration => {
                FastExcelSeries::Duration(create_duration_vec(data, col, offset, limit))
            }
        };

        Ok(Self {
            name: column_info.name.clone(),
            data: series,
            len,
        })
    }

    /// The length of the column.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Whether the column has a length of zero.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// The name of the column.
    pub fn name(&self) -> &str {
        self.name.as_str()
    }

    /// The values of the column.
    pub fn data(&self) -> &FastExcelSeries {
        &self.data
    }
}

impl From<FastExcelColumn> for FastExcelSeries {
    fn from(column: FastExcelColumn) -> Self {
        column.data
    }
}

/// Enum for lazy row selection - avoids materializing Vec for simple cases
///
/// Built by [`generate_row_selector`].
#[derive(Debug)]
pub(crate) enum RowSelector {
    /// Simple range - no Vec allocation needed
    Range(std::ops::Range<usize>),
    /// Pre-filtered list of specific row indices
    Filtered(Vec<usize>),
}

impl RowSelector {
    pub(crate) fn len(&self) -> usize {
        match self {
            RowSelector::Range(range) => range.len(),
            RowSelector::Filtered(vec) => vec.len(),
        }
    }
}

/// Generate the row selector for the rows `offset..limit`, based on [`SkipRows`].
///
/// The indices of a skip list and the indices handed to a callable are data-relative
/// (0, 1, 2, ... counted from `offset`), whereas the selected rows are absolute indices.
pub(crate) fn generate_row_selector(
    skip_rows: &SkipRows,
    offset: usize,
    limit: usize,
) -> FastExcelResult<RowSelector> {
    let rows = offset..limit;
    let selector = match skip_rows {
        // Simple skip count: the offset has already been adjusted by pagination logic.
        // Empty rows at beginning: calamine handles this at the header level.
        // In both cases the plain range is enough - no Vec allocation!
        SkipRows::Simple(_) | SkipRows::SkipEmptyRowsAtBeginning => RowSelector::Range(rows),
        // Keep the absolute index of every row whose data-relative index is not in the set
        SkipRows::List(skip_set) => RowSelector::Filtered(
            rows.enumerate()
                .filter(|&(data_row_idx, _)| !skip_set.contains(&data_row_idx))
                .map(|(_, absolute_row_idx)| absolute_row_idx)
                .collect(),
        ),
        // Ask the Python callable, row by row, whether the row should be skipped. A failing
        // call is treated as "do not skip".
        #[cfg(feature = "python")]
        SkipRows::Callable(_) => pyo3::Python::attach(|py| {
            RowSelector::Filtered(
                rows.enumerate()
                    .filter(|&(data_row_idx, _)| {
                        !skip_rows.should_skip_row(data_row_idx, py).unwrap_or(false)
                    })
                    .map(|(_, absolute_row_idx)| absolute_row_idx)
                    .collect(),
            )
        }),
    };
    Ok(selector)
}


================================================
FILE: src/data/python.rs
================================================
use std::sync::Arc;
use std::{fmt::Debug, ops::Not};

use arrow_array::{
    Array, ArrayRef, BooleanArray, Date32Array, DurationMillisecondArray, Float64Array, Int64Array,
    NullArray, RecordBatch, StringArray, TimestampMillisecondArray,
};
use arrow_schema::{Field, Schema};
use calamine::{CellType, DataType, Range};

use super::cell_extractors;
use crate::{
    data::{ExcelSheetData, RowSelector, generate_row_selector},
    error::{ErrorContext, FastExcelErrorKind, FastExcelResult},
    types::{
        dtype::DType,
        excelsheet::{CellError, CellErrors, SkipRows, column_info::ColumnInfo},
    },
};

mod with_error_impls {
    use super::*;

    /// Builds a boolean arrow array from column `col` of `data`, over rows `offset..limit`.
    ///
    /// Rows without a cell, or with an empty cell, become nulls. A non-empty cell for which
    /// `cell_extractors::extract_boolean` returns `None` also becomes a null, and a
    /// `CellError` describing it is recorded.
    ///
    /// Returns the array together with the recorded errors, in row order.
    pub(crate) fn create_boolean_array_with_errors<CT: CellType + DataType + Debug>(
        data: &Range<CT>,
        col: usize,
        offset: usize,
        limit: usize,
    ) -> (Arc<dyn Array>, Vec<CellError>) {
        let mut cell_errors = vec![];

        let arr = Arc::new(BooleanArray::from_iter((offset..limit).map(|row| {
            data.get((row, col)).and_then(|cell| {
                if cell.is_empty() {
                    None
                } else if let Some(b) = cell_extractors::extract_boolean(cell) {
                    Some(b)
                } else {
                    cell_errors.push(CellError {
                        position: (row, col),
                        row_offset: offset,
                        // The offending value is quoted on both sides, as in the messages
                        // emitted for the other dtypes
                        detail: format!("Expected boolean but got '{cell:?}'"),
                    });
                    None
                }
            })
        })));

        (arr, cell_errors)
    }

    /// Reads rows `offset..limit` of column `col` from `data` into an `Int64Array`.
    ///
    /// Rows without a cell, or with an empty cell, yield nulls. A non-empty cell that
    /// `cell_extractors::extract_int` rejects also yields a null and is reported through
    /// a `CellError` in the returned vector.
    pub(crate) fn create_int_array_with_errors<CT: CellType + DataType + Debug>(
        data: &Range<CT>,
        col: usize,
        offset: usize,
        limit: usize,
    ) -> (Arc<dyn Array>, Vec<CellError>) {
        let mut cell_errors = Vec::new();

        let values = (offset..limit).map(|row| {
            // Missing and empty cells are plain nulls, not errors
            let cell = data.get((row, col))?;
            if cell.is_empty() {
                return None;
            }

            let value = cell_extractors::extract_int(cell);
            if value.is_none() {
                cell_errors.push(CellError {
                    position: (row, col),
                    row_offset: offset,
                    detail: format!("Expected int but got '{cell:?}'"),
                });
            }
            value
        });
        let arr: Arc<dyn Array> = Arc::new(Int64Array::from_iter(values));

        (arr, cell_errors)
    }

    /// Reads rows `offset..limit` of column `col` from `data` into a `Float64Array`.
    ///
    /// Rows without a cell, or with an empty cell, yield nulls. A non-empty cell that
    /// `cell_extractors::extract_float` rejects also yields a null and is reported through
    /// a `CellError` in the returned vector.
    pub(crate) fn create_float_array_with_errors<CT: CellType + DataType + Debug>(
        data: &Range<CT>,
        col: usize,
        offset: usize,
        limit: usize,
    ) -> (Arc<dyn Array>, Vec<CellError>) {
        let mut cell_errors = Vec::new();

        let values = (offset..limit).map(|row| {
            // Missing and empty cells are plain nulls, not errors
            let cell = data.get((row, col))?;
            if cell.is_empty() {
                return None;
            }

            let value = cell_extractors::extract_float(cell);
            if value.is_none() {
                cell_errors.push(CellError {
                    position: (row, col),
                    row_offset: offset,
                    detail: format!("Expected float but got '{cell:?}'"),
                });
            }
            value
        });
        let arr: Arc<dyn Array> = Arc::new(Float64Array::from_iter(values));

        (arr, cell_errors)
    }

    /// Reads rows `offset..limit` of column `col` from `data` into a `StringArray`.
    ///
    /// Rows without a cell, or with an empty cell, yield nulls. When `whitespace_as_null`
    /// is set, extracted strings that are empty once trimmed yield nulls as well. A
    /// non-empty cell that `cell_extractors::extract_string` rejects yields a null and is
    /// reported through a `CellError` in the returned vector.
    pub(crate) fn create_string_array_with_errors<CT: CellType + DataType + Debug>(
        data: &Range<CT>,
        col: usize,
        offset: usize,
        limit: usize,
        whitespace_as_null: bool,
    ) -> (Arc<dyn Array>, Vec<CellError>) {
        let mut cell_errors = Vec::new();

        let values = (offset..limit).map(|row| {
            // Missing and empty cells are plain nulls, not errors
            let cell = data.get((row, col))?;
            if cell.is_empty() {
                return None;
            }

            match cell_extractors::extract_string(cell) {
                // Whitespace-only strings are only discarded on request
                Some(value) if whitespace_as_null && value.trim().is_empty() => None,
                Some(value) => Some(value),
                None => {
                    cell_errors.push(CellError {
                        position: (row, col),
                        row_offset: offset,
                        detail: format!("Expected string but got '{cell:?}'"),
                    });
                    None
                }
            }
        });
        let arr: Arc<dyn Array> = Arc::new(StringArray::from_iter(values));

        (arr, cell_errors)
    }

    /// Reads rows `offset..limit` of column `col` from `data` into a `Date32Array`, using
    /// the values returned by `cell_extractors::extract_date_as_num_days`.
    ///
    /// Rows without a cell, or with an empty cell, yield nulls. A non-empty cell that the
    /// extractor rejects also yields a null and is reported through a `CellError` in the
    /// returned vector.
    pub(crate) fn create_date_array_with_errors<CT: CellType + DataType + Debug>(
        data: &Range<CT>,
        col: usize,
        offset: usize,
        limit: usize,
    ) -> (Arc<dyn Array>, Vec<CellError>) {
        let mut cell_errors = Vec::new();

        let values = (offset..limit).map(|row| {
            // Missing and empty cells are plain nulls, not errors
            let cell = data.get((row, col))?;
            if cell.is_empty() {
                return None;
            }

            let value = cell_extractors::extract_date_as_num_days(cell);
            if value.is_none() {
                cell_errors.push(CellError {
                    position: (row, col),
                    row_offset: offset,
                    detail: format!("Expected date but got '{:?}'", cell),
                });
            }
            value
        });
        let arr: Arc<dyn Array> = Arc::new(Date32Array::from_iter(values));

        (arr, cell_errors)
    }

    /// Reads rows `offset..limit` of column `col` from `data` into a
    /// `TimestampMillisecondArray`, using the values returned by
    /// `cell_extractors::extract_datetime_as_timestamp_ms`.
    ///
    /// Rows without a cell, or with an empty cell, yield nulls. A non-empty cell that the
    /// extractor rejects also yields a null and is reported through a `CellError` in the
    /// returned vector.
    pub(crate) fn create_datetime_array_with_errors<CT: CellType + DataType + Debug>(
        data: &Range<CT>,
        col: usize,
        offset: usize,
        limit: usize,
    ) -> (Arc<dyn Array>, Vec<CellError>) {
        let mut cell_errors = Vec::new();

        let values = (offset..limit).map(|row| {
            // Missing and empty cells are plain nulls, not errors
            let cell = data.get((row, col))?;
            if cell.is_empty() {
                return None;
            }

            let value = cell_extractors::extract_datetime_as_timestamp_ms(cell);
            if value.is_none() {
                cell_errors.push(CellError {
                    position: (row, col),
                    row_offset: offset,
                    detail: format!("Expected datetime but got '{:?}'", cell),
                });
            }
            value
        });
        let arr: Arc<dyn Array> = Arc::new(TimestampMillisecondArray::from_iter(values));

        (arr, cell_errors)
    }

    /// Reads rows `offset..limit` of column `col` from `data` into a
    /// `DurationMillisecondArray`, using the values returned by
    /// `cell_extractors::extract_duration_as_ms`.
    ///
    /// Rows without a cell, or with an empty cell, yield nulls. A non-empty cell that the
    /// extractor rejects also yields a null and is reported through a `CellError` in the
    /// returned vector.
    pub(crate) fn create_duration_array_with_errors<CT: CellType + DataType + Debug>(
        data: &Range<CT>,
        col: usize,
        offset: usize,
        limit: usize,
    ) -> (Arc<dyn Array>, Vec<CellError>) {
        let mut cell_errors = Vec::new();

        let values = (offset..limit).map(|row| {
            // Missing and empty cells are plain nulls, not errors
            let cell = data.get((row, col))?;
            if cell.is_empty() {
                return None;
            }

            let value = cell_extractors::extract_duration_as_ms(cell);
            if value.is_none() {
                cell_errors.push(CellError {
                    position: (row, col),
                    row_offset: offset,
                    detail: format!("Expected duration but got '{cell:?}'"),
                });
            }
            value
        });
        let arr: Arc<dyn Array> = Arc::new(DurationMillisecondArray::from_iter(values));

        (arr, cell_errors)
    }
}

/// Builds a `BooleanArray` from column `col` of `data`, visiting the rows yielded by
/// `row_iter` in order.
///
/// A row is null when `data` has no cell at that position or when
/// `cell_extractors::extract_boolean` returns `None` for it.
pub(crate) fn create_boolean_array<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    row_iter: impl Iterator<Item = usize>,
) -> Arc<dyn Array> {
    let values = row_iter.map(|row| {
        let cell = data.get((row, col))?;
        cell_extractors::extract_boolean(cell)
    });
    Arc::new(BooleanArray::from_iter(values))
}

/// Builds an `Int64Array` from column `col` of `data`, visiting the rows yielded by
/// `row_iter` in order.
///
/// A row is null when `data` has no cell at that position or when
/// `cell_extractors::extract_int` returns `None` for it.
pub(crate) fn create_int_array<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    row_iter: impl Iterator<Item = usize>,
) -> Arc<dyn Array> {
    let values = row_iter.map(|row| {
        let cell = data.get((row, col))?;
        cell_extractors::extract_int(cell)
    });
    Arc::new(Int64Array::from_iter(values))
}

/// Builds a `Float64Array` from column `col` of `data`, visiting the rows yielded by
/// `row_iter` in order.
///
/// A row is null when `data` has no cell at that position or when
/// `cell_extractors::extract_float` returns `None` for it.
pub(crate) fn create_float_array<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    row_iter: impl Iterator<Item = usize>,
) -> Arc<dyn Array> {
    let values = row_iter.map(|row| {
        let cell = data.get((row, col))?;
        cell_extractors::extract_float(cell)
    });
    Arc::new(Float64Array::from_iter(values))
}

/// Builds a `StringArray` from column `col` of `data`, visiting the rows yielded by
/// `row_iter` in order.
///
/// A row is null when `data` has no cell at that position or when
/// `cell_extractors::extract_string` returns `None` for it. When `whitespace_as_null` is
/// set, strings that are empty once trimmed are turned into nulls as well.
pub(crate) fn create_string_array<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    row_iter: impl Iterator<Item = usize>,
    whitespace_as_null: bool,
) -> Arc<dyn Array> {
    let values = row_iter.map(|row| {
        let value = data
            .get((row, col))
            .and_then(cell_extractors::extract_string);
        if whitespace_as_null {
            // Only keep the string if it contains non-whitespace characters
            value.filter(|s| s.trim().is_empty().not())
        } else {
            value
        }
    });
    Arc::new(StringArray::from_iter(values))
}

/// Builds a `Date32Array` from column `col` of `data`, visiting the rows yielded by
/// `row_iter` in order.
///
/// A row is null when `data` has no cell at that position or when
/// `cell_extractors::extract_date_as_num_days` returns `None` for it.
pub(crate) fn create_date_array<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    row_iter: impl Iterator<Item = usize>,
) -> Arc<dyn Array> {
    let values = row_iter.map(|row| {
        let cell = data.get((row, col))?;
        cell_extractors::extract_date_as_num_days(cell)
    });
    Arc::new(Date32Array::from_iter(values))
}

/// Builds a `TimestampMillisecondArray` from column `col` of `data`, visiting the rows
/// yielded by `row_iter` in order.
///
/// A row is null when `data` has no cell at that position or when
/// `cell_extractors::extract_datetime_as_timestamp_ms` returns `None` for it.
pub(crate) fn create_datetime_array<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    row_iter: impl Iterator<Item = usize>,
) -> Arc<dyn Array> {
    let values = row_iter.map(|row| {
        let cell = data.get((row, col))?;
        cell_extractors::extract_datetime_as_timestamp_ms(cell)
    });
    Arc::new(TimestampMillisecondArray::from_iter(values))
}

/// Builds a `DurationMillisecondArray` from column `col` of `data`, visiting the rows
/// yielded by `row_iter` in order.
///
/// A row is null when `data` has no cell at that position or when
/// `cell_extractors::extract_duration_as_ms` returns `None` for it.
pub(crate) fn create_duration_array<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    row_iter: impl Iterator<Item = usize>,
) -> Arc<dyn Array> {
    let values = row_iter.map(|row| {
        let cell = data.get((row, col))?;
        cell_extractors::extract_duration_as_ms(cell)
    });
    Arc::new(DurationMillisecondArray::from_iter(values))
}

// Generates a `pub(crate)` function named `$name` that takes an `ExcelSheetData` and
// forwards to the generic function of the same name in `with_error_impls`, whichever
// variant (`Owned` or `Ref`) the sheet data holds.
macro_rules! create_array_function_with_errors {
    ($name:ident) => {
        pub(crate) fn $name(
            data: &ExcelSheetData,
            col: usize,
            offset: usize,
            limit: usize,
        ) -> (Arc<dyn Array>, Vec<CellError>) {
            // Both variants wrap a range, but over different cell types, so each arm
            // instantiates the generic implementation separately
            match data {
                ExcelSheetData::Owned(cells) => {
                    with_error_impls::$name(cells, col, offset, limit)
                }
                ExcelSheetData::Ref(cells) => {
                    with_error_impls::$name(cells, col, offset, limit)
                }
            }
        }
    };
}

// The string variant takes an extra parameter and is therefore written by hand
create_array_function_with_errors!(create_boolean_array_with_errors);
create_array_function_with_errors!(create_int_array_with_errors);
create_array_function_with_errors!(create_float_array_with_errors);
create_array_function_with_errors!(create_date_array_with_errors);
create_array_function_with_errors!(create_datetime_array_with_errors);
create_array_function_with_errors!(create_duration_array_with_errors);

pub(crate) fn create_string_array_with_errors(
    data: &ExcelSheetData,
    col: usize,
    offset: usize,
    limit: usize,
    whitespace_as_null: bool,
) -> (Arc<dyn Array>, Vec<CellError>) {
    match data {
        ExcelSheetData::Owned(range) => with_error_impls::create_string_array_with_errors(
            range,
            col,
            offset,
            limit,
            whitespace_as_null,
        ),
        ExcelSheetData::Ref(range) => with_error_impls::create_string_array_with_errors(
            range,
            col,
            offset,
            limit,
            whitespace_as_null,
        ),
    }
}

/// Converts a list of ColumnInfo to an arrow Schema
pub(crate) fn selected_columns_to_schema(columns: &[ColumnInfo]) -> Schema {
    let fields: Vec<_> = columns.iter().map(Into::<Field>::into).collect();
    Schema::new(fields)
}

/// Creates an arrow `RecordBatch` from an iterator over `(column_name, column_data)` tuples.
///
/// `schema` is only used when `iter` yields nothing, in which case an empty batch with that
/// schema is returned. Otherwise the batch is built from the yielded arrays, and any arrow
/// error is converted to `FastExcelErrorKind::ArrowError`.
pub(crate) fn record_batch_from_name_array_iterator<
    'a,
    I: Iterator<Item = (&'a str, Arc<dyn Array>)>,
>(
    iter: I,
    schema: Schema,
) -> FastExcelResult<RecordBatch> {
    let mut named_arrays = iter.peekable();

    // `try_from_iter_with_nullable` returns an `Err` when given an empty iterable
    if named_arrays.peek().is_none() {
        return Ok(RecordBatch::new_empty(Arc::new(schema)));
    }

    // `try_from_iter` decides on nullability with `array.null_count() > 0`, which does not
    // hold for `NullArray`: it reports no nulls. Nullability is therefore passed explicitly.
    let with_nullability = named_arrays.map(|(field_name, array)| {
        let nullable = array.is_nullable();
        (field_name, array, nullable)
    });

    RecordBatch::try_from_iter_with_nullable(with_nullability)
        .map_err(|err| FastExcelErrorKind::ArrowError(err.to_string()).into())
        .with_context(|| "could not create RecordBatch from iterable")
}

/// Creates an arrow `RecordBatch` from a range of cells. Expects the following parameters:
/// * `columns`: a slice of `ColumnInfo`, representing the columns that should be extracted from the range
/// * `data`: the range of cells to read from
/// * `offset`: the row index at which to start
/// * `limit`: the row index at which to stop (excluded)
/// * `whitespace_as_null`: forwarded to the string column builder
pub(crate) fn record_batch_from_data_and_columns<CT: CellType + DataType>(
    columns: &[ColumnInfo],
    data: &Range<CT>,
    offset: usize,
    limit: usize,
    whitespace_as_null: bool,
) -> FastExcelResult<RecordBatch> {
    // A plain `offset..limit` window is expressed as a range selector, so no Vec of row
    // indices has to be allocated
    record_batch_from_data_and_columns_with_row_selector(
        columns,
        data,
        &RowSelector::Range(offset..limit),
        whitespace_as_null,
    )
}

/// Creates an arrow `RecordBatch` from `data`, reading only the rows of `offset..limit`
/// that `generate_row_selector` selects for `skip_rows`.
///
/// Fails when the row selector cannot be generated or when the batch cannot be created.
pub(crate) fn record_batch_from_data_and_columns_with_skip_rows<CT: CellType + DataType>(
    columns: &[ColumnInfo],
    data: &Range<CT>,
    skip_rows: &SkipRows,
    offset: usize,
    limit: usize,
    whitespace_as_null: bool,
) -> FastExcelResult<RecordBatch> {
    generate_row_selector(skip_rows, offset, limit).and_then(|selected_rows| {
        record_batch_from_data_and_columns_with_row_selector(
            columns,
            data,
            &selected_rows,
            whitespace_as_null,
        )
    })
}

fn record_batch_from_data_and_columns_with_row_selector<CT: CellType + DataType>(
    columns: &[ColumnInfo],
    data: &Range<CT>,
    row_selector: &RowSelector,
    whitespace_as_null: bool,
) -> FastExcelResult<RecordBatch> {
    let schema = selected_columns_to_schema(columns);
    let row_count = row_selector.len();
    let iter = columns.iter().map(|column_info| {
        let col_idx = column_info.index;
        let dtype = column_info.dtype;
        (
            column_info.name.as_str(),
            match dtype {
                DType::Null => Arc::new(NullArray::new(row_count)),
                DType::Int => create_int_array(data, col_idx, row_selector.iter()),
                DType::Float => create_float_array(data, col_idx, row_selector.iter()),
                DType::String => {
                    create_string_array(data, col_idx, row_selector.iter(), whitespace_as_null)
                }
                DType::Bool => create_boolean_array(data, col_idx, row_selector.iter()),
                DType::DateTime => create_datetime_array(data, col_idx, row_selector.iter()),
                DType::Date => create_date_array(data, col_idx, row_selector.iter()),
                DType::Duration => create_duration_array(data, col_idx, row_selector.iter()),
            },
        )
    });

    record_batch_from_name_array_iterator(iter, schema)
}

/// Creates an arrow `RecordBatch` from rows `offset..limit` of `data`, collecting the cells
/// that could not be converted to their column's dtype instead of failing on them.
///
/// * `columns`: the columns to extract, one array is built per entry
/// * `data`: the sheet data to read from
/// * `offset`: the row index at which to start
/// * `limit`: the row index at which to stop (excluded)
/// * `whitespace_as_null`: forwarded to the string column builder
///
/// Returns the batch along with the `CellErrors` gathered over all columns, in column order.
/// Fails when the `RecordBatch` cannot be created.
pub(crate) fn record_batch_from_data_and_columns_with_errors(
    columns: &[ColumnInfo],
    data: &ExcelSheetData,
    offset: usize,
    limit: usize,
    whitespace_as_null: bool,
) -> FastExcelResult<(RecordBatch, CellErrors)> {
    let schema = selected_columns_to_schema(columns);
    // `offset..limit` is empty when `offset >= limit`, so the other dtypes produce empty
    // arrays in that case. A saturating subtraction keeps `Null` columns in line with them
    // instead of underflowing.
    let row_count = limit.saturating_sub(offset);

    let mut cell_errors = vec![];

    let iter = columns.iter().map(|column_info| {
        let col_idx = column_info.index;
        let dtype = column_info.dtype;

        let (array, new_cell_errors) = match dtype {
            // Null columns do not read any cell, hence cannot produce errors
            DType::Null => (Arc::new(NullArray::new(row_count)) as ArrayRef, vec![]),
            DType::Int => create_int_array_with_errors(data, col_idx, offset, limit),
            DType::Float => create_float_array_with_errors(data, col_idx, offset, limit),
            DType::String => {
                create_string_array_with_errors(data, col_idx, offset, limit, whitespace_as_null)
            }
            DType::Bool => create_boolean_array_with_errors(data, col_idx, offset, limit),
            DType::DateTime => create_datetime_array_with_errors(data, col_idx, offset, limit),
            DType::Date => create_date_array_with_errors(data, col_idx, offset, limit),
            DType::Duration => create_duration_array_with_errors(data, col_idx, offset, limit),
        };

        cell_errors.extend(new_cell_errors);

        (column_info.name.as_str(), array)
    });

    // The iterator is lazy: columns are read, and errors gathered, while the batch is built
    let record_batch = record_batch_from_name_array_iterator(iter, schema)?;

    Ok((
        record_batch,
        CellErrors {
            errors: cell_errors,
        },
    ))
}

impl RowSelector {
    pub(crate) fn iter(&self) -> Box<dyn Iterator<Item = usize> + '_> {
        match self {
            RowSelector::Range(range) => Box::new(range.clone()),
            RowSelector::Filtered(vec) => Box::new(vec.iter().copied()),
        }
    }
}


================================================
FILE: src/data/rust.rs
================================================
use std::ops::Not;

use calamine::{CellType, DataType, Range};
use chrono::{NaiveDate, NaiveDateTime, TimeDelta};

use super::cell_extractors;

/// Collects rows `offset..limit` of column `col` of `data` into a vec of optional booleans.
///
/// An entry is `None` when `data` has no cell at that position or when
/// `cell_extractors::extract_boolean` returns `None` for it.
pub(crate) fn create_boolean_vec<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    offset: usize,
    limit: usize,
) -> Vec<Option<bool>> {
    let read_cell = |row: usize| {
        let cell = data.get((row, col))?;
        cell_extractors::extract_boolean(cell)
    };
    (offset..limit).map(read_cell).collect()
}

/// Collects rows `offset..limit` of column `col` of `data` into a vec of optional integers.
///
/// An entry is `None` when `data` has no cell at that position or when
/// `cell_extractors::extract_int` returns `None` for it.
pub(crate) fn create_int_vec<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    offset: usize,
    limit: usize,
) -> Vec<Option<i64>> {
    let read_cell = |row: usize| {
        let cell = data.get((row, col))?;
        cell_extractors::extract_int(cell)
    };
    (offset..limit).map(read_cell).collect()
}

/// Collects rows `offset..limit` of column `col` of `data` into a vec of optional floats.
///
/// An entry is `None` when `data` has no cell at that position or when
/// `cell_extractors::extract_float` returns `None` for it.
pub(crate) fn create_float_vec<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    offset: usize,
    limit: usize,
) -> Vec<Option<f64>> {
    let read_cell = |row: usize| {
        let cell = data.get((row, col))?;
        cell_extractors::extract_float(cell)
    };
    (offset..limit).map(read_cell).collect()
}

/// Collects rows `offset..limit` of column `col` of `data` into a vec of optional strings.
///
/// An entry is `None` when `data` has no cell at that position or when
/// `cell_extractors::extract_string` returns `None` for it. When `whitespace_as_null` is
/// set, strings that are empty once trimmed are turned into `None` as well.
pub(crate) fn create_string_vec<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    offset: usize,
    limit: usize,
    whitespace_as_null: bool,
) -> Vec<Option<String>> {
    let strings = (offset..limit).map(|row| {
        data.get((row, col))
            .and_then(cell_extractors::extract_string)
    });

    if whitespace_as_null {
        // Only keep the strings that contain non-whitespace characters
        strings
            .map(|value| value.filter(|s| s.trim().is_empty().not()))
            .collect()
    } else {
        strings.collect()
    }
}

/// Collects rows `offset..limit` of column `col` of `data` into a vec of optional dates.
///
/// An entry is `None` when `data` has no cell at that position or when
/// `cell_extractors::extract_date` returns `None` for it.
pub(crate) fn create_date_vec<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    offset: usize,
    limit: usize,
) -> Vec<Option<NaiveDate>> {
    let read_cell = |row: usize| {
        let cell = data.get((row, col))?;
        cell_extractors::extract_date(cell)
    };
    (offset..limit).map(read_cell).collect()
}

/// Collects rows `offset..limit` of column `col` of `data` into a vec of optional datetimes.
///
/// An entry is `None` when `data` has no cell at that position or when
/// `cell_extractors::extract_datetime` returns `None` for it.
pub(crate) fn create_datetime_vec<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    offset: usize,
    limit: usize,
) -> Vec<Option<NaiveDateTime>> {
    let read_cell = |row: usize| {
        let cell = data.get((row, col))?;
        cell_extractors::extract_datetime(cell)
    };
    (offset..limit).map(read_cell).collect()
}

/// Collects rows `offset..limit` of column `col` of `data` into a vec of optional durations.
///
/// An entry is `None` when `data` has no cell at that position or when
/// `cell_extractors::extract_duration` returns `None` for it.
pub(crate) fn create_duration_vec<CT: CellType + DataType>(
    data: &Range<CT>,
    col: usize,
    offset: usize,
    limit: usize,
) -> Vec<Option<TimeDelta>> {
    let read_cell = |row: usize| {
        let cell = data.get((row, col))?;
        cell_extractors::extract_duration(cell)
    };
    (offset..limit).map(read_cell).collect()
}


================================================
FILE: src/error.rs
================================================
use crate::types::idx_or_name::IdxOrName;
use calamine::XlsxError;
use std::{error::Error, fmt::Display};

/// The kind of a fastexcel error.
///
/// The `Display` implementation renders each kind as a short prefix followed by the
/// payload of the variant.
#[derive(Debug)]
pub enum FastExcelErrorKind {
    /// A column holds an unsupported combination of types. Carries a detail message.
    UnsupportedColumnTypeCombination(String),
    /// The data of a cell cannot be retrieved. Carries the `(row, col)` of the cell.
    CannotRetrieveCellData(usize, usize),
    /// A cell error type reported by calamine.
    CalamineCellError(calamine::CellErrorType),
    /// An error returned by calamine.
    CalamineError(calamine::Error),
    /// A sheet was not found. Carries the `IdxOrName` used in the error message.
    SheetNotFound(IdxOrName),
    /// A column was not found. Carries the `IdxOrName` used in the error message.
    ColumnNotFound(IdxOrName),
    // Arrow errors can be of several different types (arrow::error::Error, PyError), and having
    // the actual type has not much value for us, so we just store a string context
    /// An arrow error, stored as its message.
    ArrowError(String),
    /// Invalid parameters were provided. Carries a detail message.
    InvalidParameters(String),
    /// A column is invalid. Carries a detail message.
    InvalidColumn(String),
    /// An internal fastexcel error. Carries a detail message.
    Internal(String),
}

impl Display for FastExcelErrorKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            FastExcelErrorKind::UnsupportedColumnTypeCombination(detail) => {
                write!(f, "unsupported column type combination: {detail}")
            }
            FastExcelErrorKind::CannotRetrieveCellData(row, col) => {
                write!(f, "cannot retrieve cell data at ({row}, {col})")
            }
            FastExcelErrorKind::CalamineCellError(calamine_error) => {
                write!(f, "calamine cell error: {calamine_error}")
            }
            FastExcelErrorKind::CalamineError(calamine_error) => {
                write!(f, "calamine error: {calamine_error}")
            }
            FastExcelErrorKind::SheetNotFound(idx_or_name) => {
                let message = idx_or_name.format_message();
                write!(f, "sheet {message} not found")
            }
            FastExcelErrorKind::ColumnNotFound(idx_or_name) => {
                let message = idx_or_name.format_message();
                write!(f, "column {message} not found")
            }
            FastExcelErrorKind::ArrowError(err) => write!(f, "arrow error: {err}"),
            FastExcelErrorKind::InvalidParameters(err) => write!(f, "invalid parameters: {err}"),
            FastExcelErrorKind::InvalidColumn(err) => write!(f, "invalid column: {err}"),
            FastExcelErrorKind::Internal(err) => write!(f, "fastexcel error: {err}"),
        }
    }
}

/// A `fastexcel` error.
///
/// Contains a kind and a context. Use the `Display` trait to format the
/// error message with its context.
#[derive(Debug)]
pub struct FastExcelError {
    /// The kind of the error.
    pub kind: FastExcelErrorKind,
    /// Context messages, in the order in which they were attached to the error.
    pub context: Vec<String>,
}

/// Allows attaching a context message to a value.
pub(crate) trait ErrorContext {
    /// Returns `self` with the context produced by `ctx_fn` attached to it. The context is
    /// passed as a closure so that implementors may skip building it when it is not needed.
    fn with_context<S: ToString, F: FnOnce() -> S>(self, ctx_fn: F) -> Self;
}

impl FastExcelError {
    /// Creates an error of the given kind, without any context.
    pub(crate) fn new(kind: FastExcelErrorKind) -> Self {
        let context = Vec::new();
        Self { kind, context }
    }
}

/// Writes the message of the error kind. When the error has context, a `Context:` section
/// follows, with one numbered line per context entry.
impl Display for FastExcelError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{kind}", kind = self.kind)?;

        // Errors without context are just their kind, with no trailing newline
        if self.context.is_empty() {
            return Ok(());
        }

        writeln!(f, "\nContext:")?;
        for (idx, ctx_value) in self.context.iter().enumerate() {
            writeln!(f, "    {idx}: {ctx_value}")?;
        }
        Ok(())
    }
}

// Only the default methods of `Error` are used: the struct does not store an underlying
// error to expose as a source
impl Error for FastExcelError {}

impl ErrorContext for FastExcelError {
    /// Appends the message produced by `ctx_fn` to the context of the error.
    fn with_context<S: ToString, F>(self, ctx_fn: F) -> Self
    where
        F: FnOnce() -> S,
    {
        let Self { kind, mut context } = self;
        context.push(ctx_fn().to_string());
        Self { kind, context }
    }
}

impl From<FastExcelErrorKind> for FastExcelError {
    fn from(kind: FastExcelErrorKind) -> Self {
        FastExcelError::new(kind)
    }
}

impl From<XlsxError> for FastExcelError {
    fn from(err: XlsxError) -> Self {
        FastExcelErrorKind::CalamineError(calamine::Error::Xlsx(err)).into()
    }
}

/// A `Result` whose error type is [`FastExcelError`].
pub type FastExcelResult<T> = Result<T, FastExcelError>;

impl<T> ErrorContext for FastExcelResult<T> {
    /// Attaches the context produced by `ctx_fn` to the error, if there is one. `ctx_fn` is
    /// not called for an `Ok` value, which is returned unchanged.
    fn with_context<S: ToString, F>(self, ctx_fn: F) -> Self
    where
        F: FnOnce() -> S,
    {
        self.map_err(|err| err.with_context(ctx_fn))
    }
}

/// Contains Python versions of our custom errors
#[cfg(feature = "python")]
pub(crate) mod py_errors {
    use super::FastExcelErrorKind;
    use crate::error;
    use pyo3::{PyErr, PyResult, create_exception, exceptions::PyException};

    // Base fastexcel error
    create_exception!(
        _fastexcel,
        FastExcelError,
        PyException,
        "The base class for all fastexcel errors"
    );
    // Unsupported column type
    create_exception!(
        _fastexcel,
        UnsupportedColumnTypeCombinationError,
        FastExcelError,
        "Column contains an unsupported type combination"
    );
    // Cannot retrieve cell data
    create_exception!(
        _fastexcel,
        CannotRetrieveCellDataError,
        FastExcelError,
        "Data for a given cell cannot be retrieved"
    );
    // Calamine cell error
    create_exception!(
        _fastexcel,
        CalamineCellError,
        FastExcelError,
        "calamine returned an error regarding the content of the cell"
    );
    // Calamine error
    create_exception!(
        _fastexcel,
        CalamineError,
        FastExcelError,
        "Generic calamine error"
    );
    // Sheet not found
    create_exception!(
        _fastexcel,
        SheetNotFoundError,
        FastExcelError,
        "Sheet was not found"
    );
    // Column not found
    create_exception!(
        _fastexcel,
        ColumnNotFoundError,
        FastExcelError,
        "Column was not found"
    );
    // Arrow error
    create_exception!(
        _fastexcel,
        ArrowError,
        FastExcelError,
        "Generic arrow error"
    );
    // Invalid parameters
    create_exception!(
        _fastexcel,
        InvalidParametersError,
        FastExcelError,
        "Provided parameters are invalid"
    );
    // Invalid column
    create_exception!(
        _fastexcel,
        InvalidColumnError,
        FastExcelError,
        "Column is invalid"
    );
    // Internal error
    create_exception!(
        _fastexcel,
        InternalError,
        FastExcelError,
        "Internal fastexcel error"
    );

    impl From<error::FastExcelError> for PyErr {
        fn from(err: error::FastExcelError) -> Self {
            let message = err.to_string();
            match err.kind {
                FastExcelErrorKind::UnsupportedColumnTypeCombination(_) => {
                    UnsupportedColumnTypeCombinationError::new_err(message)
                }
                FastExcelErrorKind::CannotRetrieveCellData(_, _) => {
                    CannotRetrieveCellDataError::new_err(message)
                }
                FastExcelErrorKind::CalamineCellError(_) => CalamineCellError::new_err(message),
                FastExcelErrorKind::CalamineError(_) => CalamineError::new_err(message),
                FastExcelErrorKind::SheetNotFound(_) => SheetNotFoundError::new_err(message),
                FastExcelErrorKind::ColumnNotFound(_) => ColumnNotFoundError::new_err(message),
                FastExcelErrorKind::ArrowError(_) => ArrowError::new_err(message),
                FastExcelErrorKind::InvalidParameters(_) => {
                    InvalidParametersError::new_err(message)
                }
                FastExcelErrorKind::InvalidColumn(_) => InvalidColumnError::new_err(message),
                FastExcelErrorKind::Internal(_) => ArrowError::new_err(message),
            }
        }
    }

    /// Conversion of a value into a `PyResult`.
    pub(crate) trait IntoPyResult {
        /// The type held by the resulting `PyResult` on success.
        type Inner;

        /// Consumes `self` and returns the matching `PyResult`.
        fn into_pyresult(self) -> PyResult<Self::Inner>;
    }

    impl<T> IntoPyResult for super::FastExcelResult<T> {
        type Inner = T;

        fn into_pyresult(self) -> PyResult<Self::Inner> {
            self.map_err(Into::into)
        }
    }
}


================================================
FILE: src/lib.rs
================================================
mod data;
mod error;
mod types;
mod utils;

use std::fmt::Display;

#[cfg(feature = "python")]
use error::py_errors;
#[cfg(feature = "python")]
use pyo3::prelude::*;
#[cfg(feature = "python")]
use types::excelsheet::{CellError, CellErrors};

pub use data::{FastExcelColumn, FastExcelSeries};
use error::ErrorContext;
pub use error::{FastExcelError, FastExcelErrorKind, FastExcelResult};
pub use types::{
    ColumnInfo, ColumnNameFrom, DType, DTypeCoercion, DTypeFrom, DTypes, DefinedName, ExcelReader,
    ExcelSheet, ExcelTable, IdxOrName, LoadSheetOrTableOptions, SelectedColumns, SheetVisible,
    SkipRows,
};

/// Reads the excel file at `path` and returns a reader giving access to its sheets, tables,
/// and a bit of metadata.
///
/// This is a wrapper around `ExcelReader::try_from_path`, which adds the path to the context
/// of the error on failure.
pub fn read_excel<S: AsRef<str> + Display>(path: S) -> FastExcelResult<ExcelReader> {
    let reader = ExcelReader::try_from_path(path.as_ref());
    reader.with_context(|| format!("could not load excel file at {path}"))
}

#[cfg(feature = "python")]
/// Reads an excel file and returns an object allowing to access its sheets, tables, and a bit of metadata
#[pyfunction(name = "read_excel")]
fn py_read_excel<'py>(source: &Bound<'_, PyAny>, py: Python<'py>) -> PyResult<ExcelReader> {
    use py_errors::IntoPyResult;

    // A source that can be extracted as a `String` is taken to be the path of the file
    if let Ok(path) = source.extract::<String>() {
        let reader = py.detach(|| ExcelReader::try_from_path(&path));
        return reader
            .with_context(|| format!("could not load excel file at {path}"))
            .into_pyresult();
    }

    // Otherwise, the source must be extractable as bytes, holding the content of the file
    let Ok(bytes) = source.extract::<&[u8]>() else {
        return Err(py_errors::InvalidParametersError::new_err(
            "source must be a string or bytes",
        ));
    };

    py.detach(|| ExcelReader::try_from(bytes))
        .with_context(|| "could not load excel file for those bytes")
        .into_pyresult()
}

// Taken from pydantic-core:
// https://github.com/pydantic/pydantic-core/blob/main/src/lib.rs#L24
//
// Returns the version of the crate, as known to cargo at compile time, rewritten to follow
// python's conventions for alpha and beta versions.
#[cfg(feature = "python")]
fn get_python_version() -> String {
    // cargo uses "1.0-alpha1" etc. while python uses "1.0.0a1", this is not full compatibility,
    // but it's good enough for now
    // see https://docs.rs/semver/1.0.9/semver/struct.Version.html#method.parse for rust spec
    // see https://peps.python.org/pep-0440/ for python spec
    // it seems the dot after "alpha/beta" e.g. "-alpha.1" is not necessary, hence why this works
    env!("CARGO_PKG_VERSION")
        .replace("-alpha", "a")
        .replace("-beta", "b")
}

#[cfg(feature = "python")]
#[pymodule(gil_used = false)]
fn _fastexcel(m: &Bound<'_, PyModule>) -> PyResult<()> {
    use crate::types::excelsheet::column_info::{ColumnInfo, ColumnInfoNoDtype};

    pyo3_log::init();

    // functions
    m.add_function(wrap_pyfunction!(py_read_excel, m)?)?;

    // classes
    m.add_class::<ColumnInfo>()?;
    m.add_class::<ColumnInfoNoDtype>()?;
    m.add_class::<DefinedName>()?;
    m.add_class::<CellError>()?;
    m.add_class::<CellErrors>()?;
    m.add_class::<ExcelSheet>()?;
    m.add_class::<ExcelReader>()?;
    m.add_class::<ExcelTable>()?;

    // metadata
    m.add("__version__", get_python_version())?;

    // errors: every exception type is registered under its own name
    let py = m.py();
    let exception_types = [
        ("FastExcelError", py.get_type::<py_errors::FastExcelError>()),
        (
            "UnsupportedColumnTypeCombinationError",
            py.get_type::<py_errors::UnsupportedColumnTypeCombinationError>(),
        ),
        (
            "CannotRetrieveCellDataError",
            py.get_type::<py_errors::CannotRetrieveCellDataError>(),
        ),
        (
            "CalamineCellError",
            py.get_type::<py_errors::CalamineCellError>(),
        ),
        ("CalamineError", py.get_type::<py_errors::CalamineError>()),
        (
            "SheetNotFoundError",
            py.get_type::<py_errors::SheetNotFoundError>(),
        ),
        (
            "ColumnNotFoundError",
            py.get_type::<py_errors::ColumnNotFoundError>(),
        ),
        ("ArrowError", py.get_type::<py_errors::ArrowError>()),
        (
            "InvalidParametersError",
            py.get_type::<py_errors::InvalidParametersError>(),
        ),
    ];
    for (exc_name, exc_type) in exception_types {
        m.add(exc_name, exc_type)?;
    }

    Ok(())
}


================================================
FILE: src/types/dtype/mod.rs
================================================
#[cfg(feature = "python")]
mod python;

use std::{
    collections::{HashMap, HashSet},
    fmt::{Debug, Display},
    str::FromStr,
    sync::OnceLock,
};

use calamine::{CellErrorType, CellType, DataType, Range};
use log::warn;
#[cfg(feature = "python")]
use pyo3::{IntoPyObject, IntoPyObjectRef};

use crate::error::{FastExcelError, FastExcelErrorKind, FastExcelResult};

use super::idx_or_name::IdxOrName;

/// A column or a cell's data type.
///
/// Every variant has a lowercase textual name (see the `FromStr` and `Display` implementations),
/// which is given in the documentation of each variant.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Copy)]
pub enum DType {
    /// No value (`"null"`).
    Null,
    /// Integer (`"int"`).
    Int,
    /// Floating point number (`"float"`).
    Float,
    /// Text (`"string"`).
    String,
    /// Boolean (`"boolean"`).
    Bool,
    /// Date and time (`"datetime"`).
    DateTime,
    /// Date without a time component (`"date"`).
    Date,
    /// Duration (`"duration"`).
    Duration,
}

impl FromStr for DType {
    type Err = FastExcelError;

    /// Parses the lowercase name of a dtype (`"int"`, `"boolean"`, ...).
    ///
    /// Any other input is rejected with an `InvalidParameters` error quoting the offending value.
    fn from_str(raw_dtype: &str) -> FastExcelResult<Self> {
        let dtype = match raw_dtype {
            "null" => Self::Null,
            "int" => Self::Int,
            "float" => Self::Float,
            "string" => Self::String,
            "boolean" => Self::Bool,
            "datetime" => Self::DateTime,
            "date" => Self::Date,
            "duration" => Self::Duration,
            _ => {
                return Err(FastExcelErrorKind::InvalidParameters(format!(
                    "unsupported dtype: \"{raw_dtype}\""
                ))
                .into());
            }
        };
        Ok(dtype)
    }
}

impl Display for DType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            DType::Null => "null",
            DType::Int => "int",
            DType::Float => "float",
            DType::String => "string",
            DType::Bool => "boolean",
            DType::DateTime => "datetime",
            DType::Date => "date",
            DType::Duration => "duration",
        })
    }
}

/// Associates an [`IdxOrName`] key (an index or a name) with the [`DType`] to use for it.
pub type DTypeMap = HashMap<IdxOrName, DType>;

/// Provided data types.
///
/// A `DTypes` can be parsed from the name of a single dtype, in which case the `All` variant is
/// built.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "python", derive(IntoPyObject, IntoPyObjectRef))]
pub enum DTypes {
    /// Coerce all data types to the given type.
    All(DType),
    /// Coerce data types based on the provided map. Each entry associates an index or a name with
    /// the data type to use.
    Map(DTypeMap),
}

impl FromStr for DTypes {
    type Err = FastExcelError;

    fn from_str(dtypes: &str) -> FastExcelResult<Self> {
        Ok(DTypes::All(DType::from_str(dtypes)?))
    }
}

/// Whether data types should be coerced or not.
///
/// The textual names accepted by the `FromStr` implementation are `"coerce"` and `"strict"`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Copy, Default)]
pub enum DTypeCoercion {
    /// Coerce data types (default).
    #[default]
    Coerce,
    /// Strictly enforce data types: a column containing more than one non-null data type is an
    /// error.
    Strict,
}

impl FromStr for DTypeCoercion {
    type Err = FastExcelError;

    /// Parses `"coerce"` or `"strict"`.
    ///
    /// Any other input is rejected with an `InvalidParameters` error quoting the offending value.
    fn from_str(raw_dtype_coercion: &str) -> FastExcelResult<Self> {
        let coercion = match raw_dtype_coercion {
            "coerce" => Self::Coerce,
            "strict" => Self::Strict,
            _ => {
                return Err(FastExcelErrorKind::InvalidParameters(format!(
                    "unsupported dtype_coercion: \"{raw_dtype_coercion}\""
                ))
                .into());
            }
        };
        Ok(coercion)
    }
}

/// All the possible string values that should be considered as NULL.
///
/// The comparison is exact (case-sensitive, no trimming): a string cell is only considered null
/// through this list if its content is equal to one of these values.
const NULL_STRING_VALUES: [&str; 19] = [
    "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN",
    "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null",
];

/// Determines the data type of the cell located at (`row`, `col`) in `data`.
///
/// String cells whose content is one of `NULL_STRING_VALUES` are reported as `DType::Null`. When
/// `whitespace_as_null` is set, string cells containing only whitespace are reported as
/// `DType::Null` as well.
///
/// # Errors
///
/// * `CannotRetrieveCellData` if there is no cell at (`row`, `col`)
/// * `CalamineCellError` if the cell contains an error that is not considered null
/// * `Internal` if the cell type is not supported
fn get_cell_dtype<DT: CellType + Debug + DataType>(
    data: &Range<DT>,
    row: usize,
    col: usize,
    whitespace_as_null: bool,
) -> FastExcelResult<DType> {
    let cell = data
        .get((row, col))
        .ok_or(FastExcelErrorKind::CannotRetrieveCellData(row, col))?;

    if cell.is_int() {
        Ok(DType::Int)
    } else if cell.is_float() {
        Ok(DType::Float)
    } else if cell.is_string() {
        if NULL_STRING_VALUES.contains(&cell.get_string().unwrap())
        // If we want to consider whitespace as null and either the cell is empty or contains only
        // whitespace, we return null
            || (whitespace_as_null
            && cell
                .get_string()
                .is_none_or(|s| s.trim().is_empty()))
        {
            Ok(DType::Null)
        } else {
            Ok(DType::String)
        }
    } else if cell.is_bool() {
        Ok(DType::Bool)
    } else if cell.is_datetime() {
        // Since calamine 0.24.0, a new ExcelDateTime exists for the Datetime type. It can either be
        // a duration or a datetime
        let excel_datetime = cell
            .get_datetime()
            .expect("calamine indicated that cell is a datetime but get_datetime returned None");
        Ok(if excel_datetime.is_datetime() {
            DType::DateTime
        } else {
            DType::Duration
        })
    }
    // This type contains an ISO8601 representation of a date or a datetime
    else if cell.is_datetime_iso() {
        match cell.as_datetime() {
            // The cell could be converted to a datetime
            Some(_) => Ok(DType::DateTime),
            // If we cannot convert the cell to a datetime, we're working on a date.
            // NOTE: not using the Date64 type on purpose, as pyarrow converts it to a datetime
            // rather than a date
            None => Ok(DType::Date),
        }
    }
    // Simple durations
    else if cell.is_duration_iso() {
        Ok(DType::Duration)
    }
    // Empty cell
    else if cell.is_empty() {
        Ok(DType::Null)
    } else if cell.is_error() {
        match cell.get_error() {
            // Cells containing one of these errors are considered null. Any other error is
            // reported to the caller
            Some(
                CellErrorType::NA
                | CellErrorType::Value
                | CellErrorType::Null
                | CellErrorType::Ref
                | CellErrorType::Num
                | CellErrorType::Div0,
            ) => Ok(DType::Null),
            Some(err) => Err(FastExcelErrorKind::CalamineCellError(err.to_owned()).into()),
            None => Err(FastExcelErrorKind::Internal(format!(
                "cell is an error but get_error returned None: {cell:?}"
            ))
            .into()),
        }
    } else {
        Err(FastExcelErrorKind::Internal(format!("unsupported cell type: {cell:?}")).into())
    }
}

// Lazily-built sets of the dtypes that can be stored in a float, int and string column
// respectively. They are only meant to be accessed through the functions below.
static FLOAT_TYPES_CELL: OnceLock<HashSet<DType>> = OnceLock::new();
static INT_TYPES_CELL: OnceLock<HashSet<DType>> = OnceLock::new();
static STRING_TYPES_CELL: OnceLock<HashSet<DType>> = OnceLock::new();

/// Dtypes of the cells a float column can be made of. The set is built on first use.
fn float_types() -> &'static HashSet<DType> {
    FLOAT_TYPES_CELL.get_or_init(|| {
        [DType::Int, DType::Float, DType::Bool]
            .into_iter()
            .collect()
    })
}

/// Dtypes of the cells an int column can be made of. The set is built on first use.
fn int_types() -> &'static HashSet<DType> {
    INT_TYPES_CELL.get_or_init(|| [DType::Int, DType::Bool].into_iter().collect())
}

/// Dtypes of the cells a string column can be made of. The set is built on first use.
fn string_types() -> &'static HashSet<DType> {
    STRING_TYPES_CELL.get_or_init(|| {
        [
            DType::Bool,
            DType::Int,
            DType::Float,
            DType::String,
            DType::DateTime,
            DType::Date,
        ]
        .into_iter()
        .collect()
    })
}

/// Infers the data type of column `col` from the cells located between `start_row` (inclusive)
/// and `end_row` (exclusive).
///
/// Null cells are ignored. When several non-null dtypes are found, the narrowest of int, float
/// and string able to hold all of them is returned, unless `dtype_coercion` is strict.
///
/// # Errors
///
/// * Any error raised while determining the dtype of a cell
/// * `UnsupportedColumnTypeCombination` if several dtypes were found and either coercion is
///   strict, or the dtypes cannot be combined
pub(crate) fn get_dtype_for_column<DT: CellType + Debug + DataType>(
    data: &Range<DT>,
    start_row: usize,
    end_row: usize,
    col: usize,
    dtype_coercion: &DTypeCoercion,
    whitespace_as_null: bool,
) -> FastExcelResult<DType> {
    // Distinct dtypes found in the column. We stop at the first cell that cannot be handled
    let mut column_types = HashSet::new();
    for row in start_row..end_row {
        column_types.insert(get_cell_dtype(data, row, col, whitespace_as_null)?);
    }

    // All columns are nullable anyway so we're not taking Null into account here
    column_types.remove(&DType::Null);

    // No type apart from NULL was found: fallback to string except if the column is empty
    if column_types.is_empty() {
        return if start_row == end_row {
            Ok(DType::Null)
        } else {
            warn!("Could not determine dtype for column {col}, falling back to string");
            Ok(DType::String)
        };
    }

    // A single non-null type was found: this is the type of the column
    if column_types.len() == 1 {
        return Ok(column_types.into_iter().next().unwrap());
    }

    // From now on, we know that the column contains several dtypes, which is an error if dtype
    // coercion is strict
    if *dtype_coercion == DTypeCoercion::Strict {
        return Err(
            FastExcelErrorKind::UnsupportedColumnTypeCombination(format!(
                "type coercion is strict and column contains {column_types:?}"
            ))
            .into(),
        );
    }

    if column_types.is_subset(int_types()) {
        // Every cell in the column can be converted to an int
        return Ok(DType::Int);
    }
    if column_types.is_subset(float_types()) {
        // Every cell in the column can be converted to a float
        return Ok(DType::Float);
    }
    if column_types.is_subset(string_types()) {
        // Every cell in the column can be converted to a string
        return Ok(DType::String);
    }

    // NOTE: Not being too smart about multi-types columns for now
    Err(FastExcelErrorKind::UnsupportedColumnTypeCombination(format!("{column_types:?}")).into())
}

/// Renders a float the way Excel would display it, rather than the way it is stored.
///
/// A value typed as 29.02 by the user may be stored as "29.020000000000003" in the XML, yet the
/// cell shows "29.02". Excel indeed displays decimal numbers with 8 digits in a standard cell
/// width and 10 digits in a wide cell:
///
/// Format = 0.000000000 |  Unformatted, wide cell  | Unformatted, standard width
/// ---------------------|--------------------------|----------------------------
///     1.123456789      |        1.123456789       |           1.123457
///    12.123456789      |        12.12345679       |           12.12346
///         ...          |            ...           |              ...
///   123456.123456789   |        123456.1235       |           123456.1
///
/// Trailing zeros are dropped by Excel, as well as the decimal point when nothing follows it.
///
/// No distinction is made here between wide and standard cells: the value is rounded to nine
/// digits after the decimal point, then the trailing zeros (and the decimal point if it ends up
/// last) are removed.
pub(crate) fn excel_float_to_string(x: f64) -> String {
    let mut rendered = format!("{x:.9}");
    // Length of the text once the trailing zeros, then the trailing decimal point, are dropped
    let kept = rendered
        .trim_end_matches('0')
        .trim_end_matches('.')
        .len();
    rendered.truncate(kept);
    rendered
}

// Unit tests for column dtype inference and float formatting.
#[cfg(feature = "__pyo3-tests")]
#[cfg(test)]
mod tests {
    use calamine::{Cell, Data as CalData};
    use pretty_assertions::assert_eq;
    use rstest::{fixture, rstest};

    use super::*;

    /// A single-column range mixing booleans, null-like strings, ints, floats, strings and an
    /// empty cell. The test cases below select slices of it through `start_row` (inclusive) and
    /// `end_row` (exclusive).
    #[fixture]
    fn range() -> Range<CalData> {
        Range::from_sparse(vec![
            // First column
            Cell::new((0, 0), CalData::Bool(true)),
            Cell::new((1, 0), CalData::Bool(false)),
            Cell::new((2, 0), CalData::String("NULL".to_string())),
            Cell::new((3, 0), CalData::Int(42)),
            Cell::new((4, 0), CalData::Float(13.37)),
            Cell::new((5, 0), CalData::String("hello".to_string())),
            Cell::new((6, 0), CalData::Empty),
            Cell::new((7, 0), CalData::String("#N/A".to_string())),
            Cell::new((8, 0), CalData::Int(12)),
            Cell::new((9, 0), CalData::Float(12.21)),
            Cell::new((10, 0), CalData::Bool(true)),
            Cell::new((11, 0), CalData::Int(1337)),
        ])
    }

    // With coercion enabled, mixed columns resolve to the narrowest compatible dtype
    #[rstest]
    // pure bool
    #[case(0, 2, DType::Bool)]
    // pure int
    #[case(3, 4, DType::Int)]
    // pure float
    #[case(4, 5, DType::Float)]
    // pure string
    #[case(5, 6, DType::String)]
    // pure int + float
    #[case(3, 5, DType::Float)]
    // null + int + float
    #[case(2, 5, DType::Float)]
    // float + string
    #[case(4, 6, DType::String)]
    // int + float + string
    #[case(3, 6, DType::String)]
    // null + int + float + string + empty + null
    #[case(2, 8, DType::String)]
    // empty + null + int
    #[case(6, 9, DType::Int)]
    // int + float + null
    #[case(7, 10, DType::Float)]
    // int + float + bool + null
    #[case(7, 11, DType::Float)]
    // int + bool
    #[case(10, 12, DType::Int)]
    fn get_arrow_column_type_multi_dtype_ok_coerce(
        range: Range<CalData>,
        #[case] start_row: usize,
        #[case] end_row: usize,
        #[case] expected: DType,
    ) {
        assert_eq!(
            get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Coerce, false)
                .unwrap(),
            expected
        );
    }

    // In strict mode, columns containing a single non-null dtype are still accepted
    #[rstest]
    // pure bool
    #[case(0, 2, DType::Bool)]
    // pure int
    #[case(3, 4, DType::Int)]
    // pure float
    #[case(4, 5, DType::Float)]
    // pure string
    #[case(5, 6, DType::String)]
    // empty + null + int
    #[case(6, 9, DType::Int)]
    fn get_arrow_column_type_multi_dtype_ok_strict(
        range: Range<CalData>,
        #[case] start_row: usize,
        #[case] end_row: usize,
        #[case] expected: DType,
    ) {
        assert_eq!(
            get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Strict, false)
                .unwrap(),
            expected
        );
    }

    // In strict mode, columns containing several non-null dtypes are rejected
    #[rstest]
    // pure int + float
    #[case(3, 5)]
    // float + string
    #[case(4, 6)]
    // int + float + string
    #[case(3, 6)]
    // null + int + float + string + empty + null
    #[case(2, 8)]
    // int + float + null
    #[case(7, 10)]
    // int + float + bool + null
    #[case(7, 11)]
    // int + bool
    #[case(10, 12)]
    fn get_arrow_column_type_multi_dtype_ko_strict(
        range: Range<CalData>,
        #[case] start_row: usize,
        #[case] end_row: usize,
    ) {
        let result =
            get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Strict, false);
        assert!(matches!(
            result.unwrap_err().kind,
            FastExcelErrorKind::UnsupportedColumnTypeCombination(_)
        ));
    }

    // Floating point noise and trailing zeros must not show up in the resulting string
    #[rstest]
    #[case(29.020000000000003, "29.02")]
    #[case(10000_f64, "10000")]
    #[case(23.0, "23")]
    fn test_excel_float_to_string(#[case] x: f64, #[case] expected: &str) {
        assert_eq!(excel_float_to_string(x), expected.to_string());
    }
}


================================================
FILE: src/types/dtype/python.rs
================================================
use arrow_schema::{DataType as ArrowDataType, TimeUnit};
use pyo3::{Borrowed, Bound, FromPyObject, IntoPyObject, PyAny, PyErr, Python, types::PyString};

use crate::{
    error::{FastExcelErrorKind, py_errors::IntoPyResult},
    types::dtype::{DType, DTypeCoercion, DTypeMap, DTypes},
};

impl<'py> IntoPyObject<'py> for DType {
    type Target = PyString;

    type Output = Bound<'py, Self::Target>;

    type Error = std::convert::Infallible;

    /// Converts the dtype to a Python `str` holding its textual name.
    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
        let label = self.to_string();
        label.into_pyobject(py)
    }
}

impl<'py> IntoPyObject<'py> for &DType {
    type Target = PyString;

    type Output = Bound<'py, Self::Target>;

    type Error = std::convert::Infallible;

    /// Converts the referenced dtype to a Python `str` holding its textual name.
    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
        let label = self.to_string();
        label.into_pyobject(py)
    }
}

impl<'a, 'py> FromPyObject<'a, 'py> for DType {
    type Error = PyErr;

    /// Builds a dtype from a Python `str` holding its textual name. Objects that cannot be
    /// extracted as a string are rejected with an invalid parameters error.
    fn extract(py_dtype: Borrowed<'a, 'py, PyAny>) -> Result<Self, Self::Error> {
        match py_dtype.extract::<String>() {
            Ok(raw_dtype) => raw_dtype.parse::<DType>(),
            Err(_) => Err(FastExcelErrorKind::InvalidParameters(format!(
                "{py_dtype:?} cannot be converted to str"
            ))
            .into()),
        }
        .into_pyresult()
    }
}

impl<'a, 'py> FromPyObject<'a, 'py> for DTypes {
    type Error = PyErr;

    /// Builds dtypes from either a Python `str` (a single dtype name, used for everything) or
    /// an object that can be extracted as a `DTypeMap`.
    fn extract(py_dtypes: Borrowed<'a, 'py, PyAny>) -> Result<Self, Self::Error> {
        match py_dtypes.extract::<String>() {
            // A string is always parsed as a dtype name, and parsing errors are reported as is
            Ok(raw_dtype) => raw_dtype.parse::<DTypes>().into_pyresult(),
            // Anything else must be a mapping
            Err(_) => Ok(DTypes::Map(py_dtypes.extract::<DTypeMap>()?)),
        }
    }
}

impl From<&DType> for ArrowDataType {
    /// Returns the arrow data type used to store values of the given dtype.
    fn from(dtype: &DType) -> Self {
        match *dtype {
            DType::Null => Self::Null,
            DType::Int => Self::Int64,
            DType::Float => Self::Float64,
            DType::String => Self::Utf8,
            DType::Bool => Self::Boolean,
            DType::DateTime => Self::Timestamp(TimeUnit::Millisecond, None),
            DType::Date => Self::Date32,
            DType::Duration => Self::Duration(TimeUnit::Millisecond),
        }
    }
}

impl<'a, 'py> FromPyObject<'a, 'py> for DTypeCoercion {
    type Error = PyErr;

    /// Builds a dtype coercion from a Python `str` holding its textual name. Objects that cannot
    /// be extracted as a string are rejected with an invalid parameters error.
    fn extract(py_dtype_coercion: Borrowed<'a, 'py, PyAny>) -> Result<Self, Self::Error> {
        match py_dtype_coercion.extract::<String>() {
            Ok(raw_dtype_coercion) => raw_dtype_coercion.parse::<DTypeCoercion>(),
            Err(_) => Err(FastExcelErrorKind::InvalidParameters(format!(
                "{py_dtype_coercion:?} cannot be converted to str"
            ))
            .into()),
        }
        .into_pyresult()
    }
}


================================================
FILE: src/types/excelreader/mod.rs
================================================
#[cfg(feature = "python")]
mod python;

use std::{
    fs::File,
    io::{BufReader, Cursor},
};

use calamine::{
    Data, HeaderRow, Range, Reader, Sheet as CalamineSheet, Sheets, Table, open_workbook_auto,
    open_workbook_auto_from_rs,
};
#[cfg(feature = "python")]
use calamine::{DataRef, ReaderRef};
#[cfg(feature = "python")]
use pyo3::pyclass;

use crate::{
    ExcelSheet, ExcelTable,
    error::{ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult},
    types::{
        dtype::{DTypeCoercion, DTypes},
        excelsheet::{SelectedColumns, SkipRows},
        idx_or_name::IdxOrName,
    },
};

use super::excelsheet::table::{extract_table_names, extract_table_range};

/// The workbook backing an `ExcelReader`, depending on where it was loaded from.
enum ExcelSheets {
    /// Workbook read from a file on disk.
    File(Sheets<BufReader<File>>),
    /// Workbook read from an in-memory buffer of bytes.
    Bytes(Sheets<Cursor<Vec<u8>>>),
}

impl ExcelSheets {
    /// Loads the range of the sheet called `name`.
    ///
    /// Errors reported by calamine are wrapped in a `CalamineError`, with the name of the sheet
    /// added as context.
    fn worksheet_range(&mut self, name: &str) -> FastExcelResult<Range<Data>> {
        match self {
            Self::File(sheets) => sheets.worksheet_range(name),
            Self::Bytes(sheets) => sheets.worksheet_range(name),
        }
        .map_err(|err| FastExcelErrorKind::CalamineError(err).into())
        .with_context(|| format!("Error while loading sheet {name}"))
    }

    /// Returns the metadata of every sheet of the workbook.
    // NOTE(review): no caller is visible in this file, which is presumably why dead code is
    // allowed here. Confirm whether this method is still needed.
    #[allow(dead_code)]
    fn sheet_metadata(&self) -> &[CalamineSheet] {
        match self {
            Self::File(sheets) => sheets.sheets_metadata(),
            Self::Bytes(sheets) => sheets.sheets_metadata(),
        }
    }

    /// Returns the names of the tables of the workbook, as extracted by `extract_table_names`
    /// for the given optional `sheet_name`.
    fn table_names(&mut self, sheet_name: Option<&str>) -> FastExcelResult<Vec<&str>> {
        let names = match self {
            Self::File(sheets) => extract_table_names(sheets, sheet_name),
            Self::Bytes(sheets) => extract_table_names(sheets, sheet_name),
        }?;
        Ok(names.into_iter().map(String::as_str).collect())
    }

    /// Returns the defined names of the workbook, along with their formula.
    fn defined_names(&mut self) -> FastExcelResult<Vec<DefinedName>> {
        let defined_names = match self {
            Self::File(sheets) => sheets.defined_names(),
            Self::Bytes(sheets) => sheets.defined_names(),
        }
        // Each pair is cloned on the fly, rather than copying the whole slice into a temporary
        // vector first
        .iter()
        .cloned()
        .map(|(name, formula)| DefinedName { name, formula })
        .collect();
        Ok(defined_names)
    }

    /// Whether sheets can be loaded with `worksheet_range_ref`, which is only the case for xlsx
    /// workbooks.
    #[cfg(feature = "python")]
    fn supports_by_ref(&self) -> bool {
        matches!(
            self,
            Self::File(Sheets::Xlsx(_)) | Self::Bytes(Sheets::Xlsx(_))
        )
    }

    /// Sets the header row calamine should use when loading ranges, and returns `self` to allow
    /// chaining.
    fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self {
        match self {
            Self::File(sheets) => {
                sheets.with_header_row(header_row);
            }
            Self::Bytes(sheets) => {
                sheets.with_header_row(header_row);
            }
        }
        self
    }

    /// Loads the range of the sheet called `name`, with cells referencing the data of the
    /// workbook.
    ///
    /// This is only supported for xlsx workbooks (see `supports_by_ref`): an `Internal` error is
    /// returned for any other kind of workbook.
    #[cfg(feature = "python")]
    fn worksheet_range_ref(&mut self, name: &str) -> FastExcelResult<Range<DataRef<'_>>> {
        match self {
            Self::File(Sheets::Xlsx(sheets)) => Ok(sheets.worksheet_range_ref(name)?),
            Self::Bytes(Sheets::Xlsx(sheets)) => Ok(sheets.worksheet_range_ref(name)?),
            _ => Err(FastExcelErrorKind::Internal(
                "sheets do not support worksheet_range_ref".to_string(),
            )
            .into()),
        }
        .with_context(|| format!("Error while loading sheet {name}"))
    }

    /// Loads the table called `name`, as extracted by `extract_table_range`.
    fn get_table(&mut self, name: &str) -> FastExcelResult<Table<Data>> {
        match self {
            Self::File(sheets) => extract_table_range(name, sheets),
            Self::Bytes(sheets) => extract_table_range(name, sheets),
        }
    }
}

// A defined name of a workbook, along with the formula it refers to.
//
// NOTE: plain comments are used on purpose here, so that the docstring exposed to Python for
// this class is left untouched.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "python", pyclass(name = "DefinedName", skip_from_py_object))]
pub struct DefinedName {
    // The defined name itself
    pub name: String,
    // The formula associated with the name
    pub formula: String,
}

/// Options for loading a sheet or table.
///
/// Instances are created with `new_for_sheet` or `new_for_table`, which only differ by their
/// default `header_row`, and then customized through the builder methods.
#[non_exhaustive]
#[derive(Debug)]
pub struct LoadSheetOrTableOptions {
    /// The index of the row containing the column labels. If `None`, the provided headers are used.
    /// Any row before the header row is skipped.
    pub header_row: Option<usize>,
    /// The column names to use. If `None`, the column names are inferred from the header row.
    pub column_names: Option<Vec<String>>,
    /// How rows should be skipped.
    pub skip_rows: SkipRows,
    /// The number of rows to read. If `None`, all rows are read.
    pub n_rows: Option<usize>,
    /// The number of rows to sample for schema inference. If `None`, all rows are sampled.
    pub schema_sample_rows: Option<usize>,
    /// How data types should be coerced.
    pub dtype_coercion: DTypeCoercion,
    /// The columns to select.
    pub selected_columns: SelectedColumns,
    /// Override the inferred data types. If `None`, no data type is overridden.
    pub dtypes: Option<DTypes>,
    /// Skip rows at the end of the sheet/table containing only whitespace and null values.
    pub skip_whitespace_tail_rows: bool,
    /// Consider cells containing only whitespace as null values.
    pub whitespace_as_null: bool,
}

impl LoadSheetOrTableOptions {
    /// Returns a `calamine::HeaderRow`, indicating the first row of the range to be read. For us,
    /// `header_row` can be `None` (meaning there is no header and we should start reading the data
    /// at the beginning of the sheet)
    fn calamine_header_row(&self) -> HeaderRow {
        match (self.header_row, &self.skip_rows) {
            (None | Some(0), SkipRows::SkipEmptyRowsAtBeginning) => HeaderRow::FirstNonEmptyRow,
            (None, _) => HeaderRow::Row(0),
            // A header row that does not fit in a u32 saturates rather than wrapping around:
            // wrapping would silently select an unrelated row close to the top of the sheet
            (Some(row), _) => HeaderRow::Row(u32::try_from(row).unwrap_or(u32::MAX)),
        }
    }

    /// Returns the index of the header row in the loaded data, if there is one: `Some(0)` when
    /// `header_row` is set, `None` otherwise.
    // NOTE(review): this presumably relies on the loaded range starting at the header row (see
    // `calamine_header_row`), so that the header is its first row. Confirm against callers.
    pub(crate) fn data_header_row(&self) -> Option<usize> {
        self.header_row.and(Some(0))
    }

    /// Returns a new `LoadSheetOrTableOptions` instance for loading a sheet. `header_row` is set to
    /// `Some(0)`
    pub fn new_for_sheet() -> Self {
        Self {
            header_row: Some(0),
            column_names: Default::default(),
            skip_rows: Default::default(),
            n_rows: Default::default(),
            schema_sample_rows: Default::default(),
            dtype_coercion: Default::default(),
            selected_columns: Default::default(),
            dtypes: Default::default(),
            skip_whitespace_tail_rows: Default::default(),
            whitespace_as_null: Default::default(),
        }
    }

    /// Returns a new `LoadSheetOrTableOptions` instance for loading a table. `header_row` is set to
    /// `None`
    pub fn new_for_table() -> Self {
        // Same defaults as for a sheet, except for the header row
        Self {
            header_row: None,
            ..Self::new_for_sheet()
        }
    }

    /// Uses the row at index `header_row` as the header row.
    pub fn header_row(mut self, header_row: usize) -> Self {
        self.header_row = Some(header_row);
        self
    }

    /// Indicates that there is no header row.
    pub fn no_header_row(mut self) -> Self {
        self.header_row = None;
        self
    }

    /// Sets the column names to use.
    pub fn column_names<I: IntoIterator<Item = impl Into<String>>>(
        mut self,
        column_names: I,
    ) -> Self {
        self.column_names = Some(column_names.into_iter().map(Into::into).collect());
        self
    }

    /// Sets how rows should be skipped.
    pub fn skip_rows(mut self, skip_rows: SkipRows) -> Self {
        self.skip_rows = skip_rows;
        self
    }

    /// Sets the number of rows to read.
    pub fn n_rows(mut self, n_rows: usize) -> Self {
        self.n_rows = Some(n_rows);
        self
    }

    /// Sets the number of rows to sample for schema inference.
    pub fn schema_sample_rows(mut self, schema_sample_rows: usize) -> Self {
        self.schema_sample_rows = Some(schema_sample_rows);
        self
    }

    /// Sets how data types should be coerced.
    pub fn dtype_coercion(mut self, dtype_coercion: DTypeCoercion) -> Self {
        self.dtype_coercion = dtype_coercion;
        self
    }

    /// Sets the columns to select.
    pub fn selected_columns(mut self, selected_columns: SelectedColumns) -> Self {
        self.selected_columns = selected_columns;
        self
    }

    /// Overrides the inferred data types with `dtypes`.
    pub fn with_dtypes(mut self, dtypes: DTypes) -> Self {
        self.dtypes = Some(dtypes);
        self
    }

    /// Sets whether rows at the end of the sheet/table containing only whitespace and null values
    /// should be skipped.
    pub fn skip_whitespace_tail_rows(mut self, skip_whitespace_tail_rows: bool) -> Self {
        self.skip_whitespace_tail_rows = skip_whitespace_tail_rows;
        self
    }

    /// Sets whether cells containing only whitespace should be considered as null values.
    pub fn whitespace_as_null(mut self, whitespace_as_null: bool) -> Self {
        self.whitespace_as_null = whitespace_as_null;
        self
    }
}

/// Represents an open Excel file and allows to access its sheets and tables.
#[cfg_attr(feature = "python", pyclass(name = "_ExcelReader"))]
pub struct ExcelReader {
    // The underlying workbook, opened either from a file or from bytes
    sheets: ExcelSheets,
    // Copy of the metadata of the sheets, taken when the workbook is opened
    sheet_metadata: Vec<CalamineSheet>,
    // Where the workbook comes from: the path it was opened from, or "bytes"
    #[cfg(feature = "python")]
    source: String,
}

impl ExcelReader {
    // NOTE: Not implementing TryFrom here, because we're aren't building the file from the passed
    // string, but rather from the file pointed by it. Semantically, try_from_path is clearer
    /// Opens the workbook located at `path`.
    pub(crate) fn try_from_path(path: &str) -> FastExcelResult<Self> {
        let workbook = open_workbook_auto(path)
            .map_err(|err| FastExcelErrorKind::CalamineError(err).into())
            .with_context(|| format!("Could not open workbook at {path}"))?;

        Ok(Self {
            // The metadata must be copied before the workbook is moved into the reader
            sheet_metadata: workbook.sheets_metadata().to_owned(),
            sheets: ExcelSheets::File(workbook),
            #[cfg(feature = "python")]
            source: path.to_owned(),
        })
    }

    /// Looks up the metadata of a sheet, designated either by its name or by its index.
    ///
    /// A `SheetNotFound` error is returned if there is no such sheet, with either the available
    /// sheet names or the number of sheets as context.
    fn find_sheet_meta(&self, idx_or_name: IdxOrName) -> FastExcelResult<&CalamineSheet> {
        match idx_or_name {
            IdxOrName::Name(name) => self
                .sheet_metadata
                .iter()
                .find(|sheet| sheet.name == name)
                .ok_or_else(|| FastExcelErrorKind::SheetNotFound(IdxOrName::Name(name.clone())).into())
                .with_context(|| {
                    let quoted_names: Vec<_> = self
                        .sheet_metadata
                        .iter()
                        .map(|sheet| format!("\"{}\"", sheet.name))
                        .collect();
                    let available_sheets = quoted_names.join(", ");
                    format!(
                        "Sheet \"{name}\" not found in file. Available sheets: {available_sheets}."
                    )
                }),
            IdxOrName::Idx(idx) => {
                let sheet_count = self.sheet_metadata.len();
                self.sheet_metadata
                    .get(idx)
                    .ok_or_else(|| FastExcelErrorKind::SheetNotFound(IdxOrName::Idx(idx)).into())
                    .with_context(|| {
                        format!(
                            "Sheet index {idx} is out of range. File has {} sheets.",
                            sheet_count
                        )
                    })
            }
        }
    }

    /// Load a sheet from the Excel file.
    pub fn load_sheet(
        &mut self,
        idx_or_name: IdxOrName,
        opts: LoadSheetOrTableOptions,
    ) -> FastExcelResult<ExcelSheet> {
        let first_row = opts.calamine_header_row();

        // The metadata is copied, as the workbook needs to be mutably borrowed below
        let sheet_meta = self.find_sheet_meta(idx_or_name)?.to_owned();

        self.sheets.with_header_row(first_row);
        let range = self.sheets.worksheet_range(&sheet_meta.name)?;

        ExcelSheet::try_new(sheet_meta, range.into(), opts)
    }

    /// Load a table from the Excel file.
    pub fn load_table(
        &mut self,
        name: &str,
        opts: LoadSheetOrTableOptions,
    ) -> FastExcelResult<ExcelTable> {
        self.sheets
            .get_table(name)
            .and_then(|table| ExcelTable::try_new(table, opts))
    }

    /// Returns the names of the sheets of the file, in the order of the sheet metadata.
    pub fn sheet_names(&self) -> Vec<&str> {
        let mut names = Vec::with_capacity(self.sheet_metadata.len());
        for sheet in &self.sheet_metadata {
            names.push(sheet.name.as_str());
        }
        names
    }

    /// Returns the names of the tables of the file for the given optional `sheet_name`.
    pub fn table_names(&mut self, sheet_name: Option<&str>) -> FastExcelResult<Vec<&str>> {
        self.sheets.table_names(sheet_name)
    }

    /// Returns the defined names of the file.
    pub fn defined_names(&mut self) -> FastExcelResult<Vec<DefinedName>> {
        self.sheets.defined_names()
    }
}

impl TryFrom<&[u8]> for ExcelReader {
    type Error = FastExcelError;

    /// Opens a workbook from its raw content. The bytes are copied, so that the reader owns the
    /// data it reads from.
    fn try_from(bytes: &[u8]) -> Result<Self, Self::Error> {
        let owned_bytes = bytes.to_vec();
        let workbook = open_workbook_auto_from_rs(Cursor::new(owned_bytes))
            .map_err(|err| FastExcelErrorKind::CalamineError(err).into())
            .with_context(|| "Could not open workbook from bytes")?;

        Ok(Self {
            // The metadata must be copied before the workbook is moved into the reader
            sheet_metadata: workbook.sheets_metadata().to_owned(),
            sheets: ExcelSheets::Bytes(workbook),
            #[cfg(feature = "python")]
            source: "bytes".to_owned(),
        })
    }
}


================================================
FILE: src/types/excelreader/python.rs
================================================
use arrow_array::RecordBatch;
use pyo3::{Bound, IntoPyObjectExt, PyAny, PyResult, Python, pymethods, types::PyString};

use super::{DefinedName, ExcelReader};

use crate::{
    ExcelSheet,
    data::{ExcelSheetData, record_batch_from_data_and_columns},
    error::{ErrorContext, FastExcelErrorKind, FastExcelResult, py_errors::IntoPyResult},
    types::{
        dtype::{DTypeCoercion, DTypes},
        excelreader::LoadSheetOrTableOptions,
        excelsheet::{
            Header, Pagination, SelectedColumns, SkipRows,
            column_info::{build_available_columns_info, finalize_column_info},
        },
        idx_or_name::IdxOrName,
    },
    utils::schema::get_schema_sample_rows,
};

impl ExcelReader {
    /// Converts the Python `use_columns` argument into a `SelectedColumns`.
    ///
    /// On failure, the error is given a context describing the accepted types and the value
    /// that was received.
    fn build_selected_columns(
        use_columns: Option<&Bound<'_, PyAny>>,
    ) -> FastExcelResult<SelectedColumns> {
        let describe_failure = || {
            format!("expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got {use_columns:?}")
        };
        use_columns.try_into().with_context(describe_failure)
    }

    /// Converts sheet `data` directly into an arrow `RecordBatch`.
    ///
    /// The row window is derived from the header and pagination options, the columns to keep
    /// and their dtypes are resolved, and the rows of that window are then materialized.
    ///
    /// # Errors
    ///
    /// Propagates errors from pagination, column discovery/selection, dtype resolution and
    /// record batch construction.
    fn load_sheet_eager(
        data: &ExcelSheetData,
        opts: LoadSheetOrTableOptions,
    ) -> FastExcelResult<RecordBatch> {
        let header_row = opts.data_header_row();
        // Both variants wrap a range; only the concrete range type differs.
        let pagination = match data {
            ExcelSheetData::Owned(cells) => Pagination::try_new(opts.skip_rows, opts.n_rows, cells),
            ExcelSheetData::Ref(cells) => Pagination::try_new(opts.skip_rows, opts.n_rows, cells),
        }?;

        let header = Header::new(header_row, opts.column_names);

        let first_row = header.offset() + pagination.offset();
        let height = data.height();
        // Never go past the data's height, even when more rows were requested.
        let end_row = pagination
            .n_rows()
            .map_or(height, |n_rows| (first_row + n_rows).min(height));

        let sample_rows_limit = get_schema_sample_rows(opts.schema_sample_rows, first_row, end_row);

        let candidates = build_available_columns_info(data, &opts.selected_columns, &header)?;
        let selected = opts.selected_columns.select_columns(candidates)?;
        let columns = finalize_column_info(
            selected,
            data,
            first_row,
            sample_rows_limit,
            opts.dtypes.as_ref(),
            &opts.dtype_coercion,
            opts.whitespace_as_null,
        )?;

        let whitespace_as_null = opts.whitespace_as_null;
        match data {
            ExcelSheetData::Owned(cells) => record_batch_from_data_and_columns(
                &columns,
                cells,
                first_row,
                end_row,
                whitespace_as_null,
            ),
            ExcelSheetData::Ref(cells) => record_batch_from_data_and_columns(
                &columns,
                cells,
                first_row,
                end_row,
                whitespace_as_null,
            ),
        }
    }

    /// Loads the requested sheet and converts it into a Python object.
    ///
    /// - `eager` and the workbook supports by-reference ranges: the sheet is turned into a
    ///   record batch straight from a borrowed range and returned through pyarrow.
    /// - otherwise an owned range is loaded into an `ExcelSheet`, which is converted to arrow
    ///   when `eager` is set and returned as-is when it is not.
    ///
    /// Both eager paths need the `pyarrow` feature and return a `RuntimeError` without it.
    /// Reading the range and building the record batch happen inside `py.detach` closures.
    fn build_sheet<'py>(
        &mut self,
        idx_or_name: IdxOrName,
        opts: LoadSheetOrTableOptions,
        eager: bool,
        py: Python<'py>,
    ) -> PyResult<Bound<'py, PyAny>> {
        let header_row = opts.calamine_header_row();
        let meta = self
            .find_sheet_meta(idx_or_name)
            .into_pyresult()?
            .to_owned();

        let eager_by_ref = eager && self.sheets.supports_by_ref();
        if !eager_by_ref {
            let owned_range = py
                .detach(|| {
                    self.sheets
                        .with_header_row(header_row)
                        .worksheet_range(&meta.name)
                })
                .into_pyresult()?;
            let sheet = ExcelSheet::try_new(meta, owned_range.into(), opts).into_pyresult()?;

            if !eager {
                return sheet.into_bound_py_any(py);
            }

            #[cfg(feature = "pyarrow")]
            {
                sheet.to_arrow(py)
            }
            #[cfg(not(feature = "pyarrow"))]
            {
                Err(pyo3::exceptions::PyRuntimeError::new_err(
                    "Eager loading requires pyarrow feature. Use eager=False for PyCapsule interface.",
                ))
            }
        } else {
            let borrowed_range = py
                .detach(|| {
                    self.sheets
                        .with_header_row(header_row)
                        .worksheet_range_ref(&meta.name)
                })
                .into_pyresult()?;
            let rb = py
                .detach(|| Self::load_sheet_eager(&borrowed_range.into(), opts))
                .into_pyresult()?;

            #[cfg(feature = "pyarrow")]
            {
                use arrow_pyarrow::ToPyArrow;
                rb.to_pyarrow(py)
            }
            #[cfg(not(feature = "pyarrow"))]
            {
                Err(pyo3::exceptions::PyRuntimeError::new_err(
                    "Eager loading requires pyarrow feature. Use eager=False for PyCapsule interface.",
                ))
            }
        }
    }

    /// Loads the table called `name` and converts it into a Python object.
    ///
    /// The table is loaded inside a `py.detach` closure. With `eager` it is then converted to
    /// arrow, which needs the `pyarrow` feature and returns a `RuntimeError` without it;
    /// otherwise the `ExcelTable` itself is handed to Python.
    #[allow(clippy::too_many_arguments)]
    fn build_table<'py>(
        &mut self,
        name: &str,
        opts: LoadSheetOrTableOptions,
        eager: bool,
        py: Python<'py>,
    ) -> PyResult<Bound<'py, PyAny>> {
        let table = py.detach(|| self.load_table(name, opts)).into_pyresult()?;

        if !eager {
            return table.into_bound_py_any(py);
        }

        #[cfg(feature = "pyarrow")]
        {
            Ok(table.to_arrow(py)?)
        }
        #[cfg(not(feature = "pyarrow"))]
        {
            Err(pyo3::exceptions::PyRuntimeError::new_err(
                "Eager loading requires pyarrow feature. Use eager=False for PyCapsule interface.",
            ))
        }
    }
}

#[pymethods]
impl ExcelReader {
    /// Representation of the reader, made of the type name and the reader's `source`.
    pub fn __repr__(&self) -> String {
        let source = &self.source;
        format!("ExcelReader<{source}>")
    }

    /// Names of the tables in the workbook, with `sheet_name` forwarded to the reader.
    #[pyo3(name = "table_names", signature = (sheet_name = None))]
    pub(crate) fn py_table_names(&mut self, sheet_name: Option<&str>) -> PyResult<Vec<&str>> {
        let names = self.table_names(sheet_name);
        names.into_pyresult()
    }

    /// Defined names of the workbook: the result of `defined_names`, with any error
    /// converted through `into_pyresult`.
    #[pyo3(name = "defined_names")]
    pub(crate) fn py_defined_names(&mut self) -> PyResult<Vec<DefinedName>> {
        self.defined_names().into_pyresult()
    }

    /// Loads a sheet, selected by index or name, with the given loading options.
    ///
    /// Rejects `schema_sample_rows = 0` with an invalid-parameters error, converts
    /// `idx_or_name` and `use_columns` into their Rust counterparts, gathers the remaining
    /// arguments into a `LoadSheetOrTableOptions` and delegates to `build_sheet`.
    #[pyo3(name = "load_sheet", signature = (
        idx_or_name,
        *,
        header_row = 0,
        column_names = None,
        skip_rows = SkipRows::SkipEmptyRowsAtBeginning,
        n_rows = None,
        schema_sample_rows = 1_000,
        dtype_coercion = DTypeCoercion::Coerce,
        use_columns = None,
        dtypes = None,
        eager = false,
        skip_whitespace_tail_rows = false,
        whitespace_as_null = false,
    ))]
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn py_load_sheet<'py>(
        &mut self,
        idx_or_name: &Bound<'py, PyAny>,
        header_row: Option<usize>,
        column_names: Option<Vec<String>>,
        skip_rows: SkipRows,
        n_rows: Option<usize>,
        schema_sample_rows: Option<usize>,
        dtype_coercion: DTypeCoercion,
        use_columns: Option<&Bound<'py, PyAny>>,
        dtypes: Option<DTypes>,
        eager: bool,
        skip_whitespace_tail_rows: bool,
        whitespace_as_null: bool,
        py: Python<'py>,
    ) -> PyResult<Bound<'py, PyAny>> {
        // Cannot use NonZeroUsize in the parameters, as it is not supported by pyo3
        if schema_sample_rows == Some(0) {
            return Err(FastExcelErrorKind::InvalidParameters(
                "schema_sample_rows cannot be 0, as it would prevent dtype inferring".to_string(),
            )
            .into())
            .into_pyresult();
        }

        let sheet = idx_or_name.try_into().into_pyresult()?;
        let options = LoadSheetOrTableOptions {
            header_row,
            column_names,
            skip_rows,
            n_rows,
            schema_sample_rows,
            dtype_coercion,
            selected_columns: Self::build_selected_columns(use_columns).into_pyresult()?,
            dtypes,
            skip_whitespace_tail_rows,
            whitespace_as_null,
        };

        self.build_sheet(sheet, options, eager, py)
    }

    /// Loads a table, selected by name, with the given loading options.
    ///
    /// Rejects `schema_sample_rows = 0` with an invalid-parameters error, converts
    /// `use_columns` into its Rust counterpart, gathers the remaining arguments into a
    /// `LoadSheetOrTableOptions` and delegates to `build_table`.
    #[pyo3(name = "load_table", signature = (
        name,
        *,
        header_row = 0,
        column_names = None,
        skip_rows = SkipRows::SkipEmptyRowsAtBeginning,
        n_rows = None,
        schema_sample_rows = 1_000,
        dtype_coercion = DTypeCoercion::Coerce,
        use_columns = None,
        dtypes = None,
        eager = false,
        skip_whitespace_tail_rows = false,
        whitespace_as_null = false,
    ))]
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn py_load_table<'py>(
        &mut self,
        name: &Bound<'py, PyString>,
        header_row: Option<usize>,
        column_names: Option<Vec<String>>,
        skip_rows: SkipRows,
        n_rows: Option<usize>,
        schema_sample_rows: Option<usize>,
        dtype_coercion: DTypeCoercion,
        use_columns: Option<&Bound<'py, PyAny>>,
        dtypes: Option<DTypes>,
        eager: bool,
        skip_whitespace_tail_rows: bool,
        whitespace_as_null: bool,
        py: Python<'py>,
    ) -> PyResult<Bound<'py, PyAny>> {
        // Cannot use NonZeroUsize in the parameters, as it is not supported by pyo3
        if schema_sample_rows == Some(0) {
            return Err(FastExcelErrorKind::InvalidParameters(
                "schema_sample_rows cannot be 0, as it would prevent dtype inferring".to_string(),
            )
            .into())
            .into_pyresult();
        }

        let options = LoadSheetOrTableOptions {
            header_row,
            column_names,
            skip_rows,
            n_rows,
            schema_sample_rows,
            dtype_coercion,
            selected_columns: Self::build_selected_columns(use_columns).into_pyresult()?,
            dtypes,
            skip_whitespace_tail_rows,
            whitespace_as_null,
        };

        let table_name = name.to_string();
        self.build_table(&table_name, options, eager, py)
    }

    /// Names of the sheets in the workbook, exposed to Python as the `sheet_names` property.
    #[getter("sheet_names")]
    pub(crate) fn py_sheet_names(&self) -> Vec<&str> {
        self.sheet_names()
    }
}

#[pymethods]
impl DefinedName {
    /// Creates a new `DefinedName` object.
    #[new]
    pub fn py_new(name: String, formula: String) -> Self {
        DefinedName { name, formula }
    }

    /// `str`. The name of the defined name
    #[getter("name")]
    pub fn py_name(&self) -> &str {
        &self.name
    }

    /// `str`. The formula of the defined name
    #[getter("formula")]
    pub fn py_formula(&self) -> &str {
        &self.formula
    }

    /// Short representation: the name followed by at most the first 10 characters of the
    /// formula. `...` is appended only when the formula was actually shortened.
    pub fn __repr__(&self) -> String {
        const MAX_FORMULA_CHARS: usize = 10;

        // Look for the byte offset of the character following the ones we keep. Cutting
        // there keeps the slice on a character boundary, and finding no such character means
        // the whole formula fits and must be shown without an ellipsis.
        match self.formula.char_indices().nth(MAX_FORMULA_CHARS) {
            Some((cut, _)) => format!(
                "DefinedName<{name} ({formula}...)>",
                name = self.name,
                formula = &self.formula[..cut]
            ),
            None => format!(
                "DefinedName<{name} ({formula})>",
                name = self.name,
                formula = self.formula
            ),
        }
    }

    /// Two defined names are equal when their name and formula are equal.
    pub fn __eq__(&self, other: &Self) -> bool {
        self == other
    }
}


================================================
FILE: src/types/excelsheet/column_info/mod.rs
================================================
#[cfg(feature = "python")]
mod python;

use std::{fmt::Display, str::FromStr};

use calamine::DataType;
#[cfg(feature = "python")]
use pyo3::pyclass;

use crate::{
    data::ExcelSheetData,
    error::{ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult},
    types::{
        dtype::{DType, DTypeCoercion, DTypes, get_dtype_for_column},
        idx_or_name::IdxOrName,
    },
};

use super::{Header, SelectedColumns};

/// How the column name was determined.
///
/// Converts to and from the strings `"provided"`, `"looked_up"` and `"generated"` through its
/// `Display` and `FromStr` implementations.
#[derive(Debug, Clone, PartialEq)]
pub enum ColumnNameFrom {
    /// The column name was provided by the user.
    Provided,
    /// The column name was looked up in the sheet or table.
    LookedUp,
    /// The column name was generated based on the column index.
    Generated,
}

impl FromStr for ColumnNameFrom {
    type Err = FastExcelError;

    /// Parses `"provided"`, `"looked_up"` or `"generated"`; any other input is rejected with
    /// an invalid-parameters error.
    fn from_str(s: &str) -> FastExcelResult<Self> {
        let parsed = match s {
            "provided" => Self::Provided,
            "looked_up" => Self::LookedUp,
            "generated" => Self::Generated,
            unknown => {
                let message = format!("invalid ColumnNameFrom: {unknown}");
                return Err(FastExcelErrorKind::InvalidParameters(message).into());
            }
        };
        Ok(parsed)
    }
}

impl Display for ColumnNameFrom {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            ColumnNameFrom::Provided => "provided",
            ColumnNameFrom::LookedUp => "looked_up",
            ColumnNameFrom::Generated => "generated",
        })
    }
}

/// How the data type was determined.
///
/// Converts to and from the strings `"provided_for_all"`, `"provided_by_index"`,
/// `"provided_by_name"` and `"guessed"` through its `Display` and `FromStr` implementations.
#[derive(Debug, Clone, PartialEq)]
pub enum DTypeFrom {
    /// The data type was provided for all columns.
    ProvidedForAll,
    /// The data type was provided via the column's index.
    ProvidedByIndex,
    /// The data type was provided via the column's name.
    ProvidedByName,
    /// The data type was guessed based on the column's data.
    Guessed,
}

impl Display for DTypeFrom {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            DTypeFrom::ProvidedForAll => "provided_for_all",
            DTypeFrom::ProvidedByIndex => "provided_by_index",
            DTypeFrom::ProvidedByName => "provided_by_name",
            DTypeFrom::Guessed => "guessed",
        })
    }
}

impl FromStr for DTypeFrom {
    type Err = FastExcelError;

    /// Parses `"provided_for_all"`, `"provided_by_index"`, `"provided_by_name"` or
    /// `"guessed"`.
    ///
    /// # Errors
    ///
    /// Any other input is rejected with an invalid-parameters error naming this type and the
    /// offending value.
    fn from_str(s: &str) -> FastExcelResult<Self> {
        match s {
            "provided_for_all" => Ok(Self::ProvidedForAll),
            "provided_by_index" => Ok(Self::ProvidedByIndex),
            "provided_by_name" => Ok(Self::ProvidedByName),
            "guessed" => Ok(Self::Guessed),
            // The message names the type being parsed (`DTypeFrom`), in the same way as the
            // `ColumnNameFrom` parser does.
            _ => Err(
                FastExcelErrorKind::InvalidParameters(format!("invalid DTypeFrom: {s}")).into(),
            ),
        }
    }
}

// NOTE: The types for properties unfortunately do not appear in the docs for this class, so we had
// to specify them via docstrings
/// Metadata about a single column in a sheet.
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "python", pyclass(name = "ColumnInfo", skip_from_py_object))]
pub struct ColumnInfo {
    /// The column's name
    pub name: String,
    /// The column's index, counted from the first column of the loaded data
    pub index: usize,
    /// The column's absolute index: `index` shifted by the column at which the loaded data
    /// starts
    pub absolute_index: usize,
    /// The column's data type
    pub dtype: DType,
    /// How the column name was determined
    pub column_name_from: ColumnNameFrom,
    /// How the column data type was determined
    pub dtype_from: DTypeFrom,
}

impl ColumnInfo {
    pub(crate) fn new(
        name: String,
        index: usize,
        absolute_index: usize,
        column_name_from: ColumnNameFrom,
        dtype: DType,
        dtype_from: DTypeFrom,
    ) -> Self {
        Self {
            name,
            index,
            absolute_index,
            dtype,
            column_name_from,
            dtype_from,
        }
    }
}

/// This class provides information about a single column in a sheet, without associated type
/// information
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(
    feature = "python",
    pyclass(name = "ColumnInfoNoDtype", skip_from_py_object)
)]
pub(crate) struct ColumnInfoNoDtype {
    // The column's name
    name: String,
    // The column's index, counted from the first column of the loaded data
    index: usize,
    // The column's index shifted by the column at which the loaded data starts
    absolute_index: usize,
    // How the column name was determined
    column_name_from: ColumnNameFrom,
}

// Allows us to easily compare ourselves to a column index or name
impl PartialEq<IdxOrName> for ColumnInfoNoDtype {
    /// An index matches the column's `index`; a name matches the column's `name`.
    fn eq(&self, other: &IdxOrName) -> bool {
        match other {
            IdxOrName::Name(name) => *name == self.name,
            IdxOrName::Idx(index) => *index == self.index,
        }
    }
}

impl ColumnInfoNoDtype {
    /// Creates the description of a column whose dtype is not known yet.
    pub(super) fn new(
        name: String,
        index: usize,
        absolute_index: usize,
        column_name_from: ColumnNameFrom,
    ) -> Self {
        Self {
            column_name_from,
            absolute_index,
            index,
            name,
        }
    }

    /// Returns the same column under another name.
    pub(super) fn with_name(self, name: String) -> Self {
        Self { name, ..self }
    }

    /// The column's name.
    pub(super) fn name(&self) -> &str {
        self.name.as_str()
    }

    /// The column's absolute index.
    pub(super) fn absolute_index(&self) -> usize {
        self.absolute_index
    }

    /// Determines the column's dtype and where it comes from.
    ///
    /// A dtype found in `specified_dtypes` wins: either the one given for all columns, or the
    /// one registered under the column's absolute index, or the one registered under the
    /// column's name, in that order. Without any, the dtype is determined from `data`.
    fn dtype_info<D: CalamineDataProvider>(
        &self,
        data: &D,
        start_row: usize,
        end_row: usize,
        specified_dtypes: Option<&DTypes>,
        dtype_coercion: &DTypeCoercion,
        whitespace_as_null: bool,
    ) -> FastExcelResult<(DType, DTypeFrom)> {
        let provided = match specified_dtypes {
            None => None,
            Some(DTypes::All(dtype)) => Some((*dtype, DTypeFrom::ProvidedForAll)),
            // Look the dtype up by index first, and only then by name (done in this order
            // because copying an usize is cheaper than cloning a string)
            Some(DTypes::Map(dtypes)) => dtypes
                .get(&self.absolute_index().into())
                .map(|dtype| (*dtype, DTypeFrom::ProvidedByIndex))
                .or_else(|| {
                    dtypes
                        .get(&self.name.clone().into())
                        .map(|dtype| (*dtype, DTypeFrom::ProvidedByName))
                }),
        };

        match provided {
            Some(dtype_and_origin) => Ok(dtype_and_origin),
            // If we could not look up a dtype, guess it from the data
            None => {
                let guessed = data.dtype_for_column(
                    start_row,
                    end_row,
                    self.index,
                    dtype_coercion,
                    whitespace_as_null,
                )?;
                Ok((guessed, DTypeFrom::Guessed))
            }
        }
    }

    /// Turns this description into a complete `ColumnInfo` by determining its dtype.
    ///
    /// # Errors
    ///
    /// A failure to determine the dtype is returned with a context naming the column.
    pub(super) fn finish<D: CalamineDataProvider>(
        self,
        data: &D,
        start_row: usize,
        end_row: usize,
        specified_dtypes: Option<&DTypes>,
        dtype_coercion: &DTypeCoercion,
        whitespace_as_null: bool,
    ) -> FastExcelResult<ColumnInfo> {
        let resolved = self.dtype_info(
            data,
            start_row,
            end_row,
            specified_dtypes,
            dtype_coercion,
            whitespace_as_null,
        );
        let (dtype, dtype_from) = resolved
            .with_context(|| format!("could not determine dtype for column {}", self.name))?;

        let Self {
            name,
            index,
            absolute_index,
            column_name_from,
        } = self;
        Ok(ColumnInfo::new(
            name,
            index,
            absolute_index,
            column_name_from,
            dtype,
            dtype_from,
        ))
    }
}

/// Read access to a block of cells, as needed to name columns and determine their dtypes.
pub(crate) trait CalamineDataProvider {
    /// Number of columns in the data.
    fn width(&self) -> usize;
    /// Content of the cell at `pos` as a string, if one can be obtained.
    ///
    /// Callers in this module pass `pos` as `(row index, column index)`.
    fn get_as_string(&self, pos: (usize, usize)) -> Option<String>;
    /// Determines the dtype of column `col` from the rows between `start_row` and `end_row`.
    ///
    /// NOTE(review): whether `end_row` is inclusive is not visible from here — confirm
    /// against `get_dtype_for_column`.
    fn dtype_for_column(
        &self,
        start_row: usize,
        end_row: usize,
        col: usize,
        dtype_coercion: &DTypeCoercion,
        whitespace_as_null: bool,
    ) -> FastExcelResult<DType>;
    /// Position at which the data starts, if any. Callers in this module use the second
    /// component as the offset of the first column.
    fn start(&self) -> Option<(usize, usize)>;
}

/// Lets sheet data be used wherever a `CalamineDataProvider` is expected.
///
/// NOTE(review): every method forwards to the same-named method called on `self`. These are
/// presumably inherent methods of `ExcelSheetData` (defined outside this file) — confirm,
/// since without them the calls would resolve back to this trait.
impl CalamineDataProvider for ExcelSheetData<'_> {
    fn width(&self) -> usize {
        self.width()
    }

    fn get_as_string(&self, pos: (usize, usize)) -> Option<String> {
        self.get_as_string(pos)
    }

    fn dtype_for_column(
        &self,
        start_row: usize,
        end_row: usize,
        col: usize,
        dtype_coercion: &DTypeCoercion,
        whitespace_as_null: bool,
    ) -> FastExcelResult<DType> {
        self.dtype_for_column(start_row, end_row, col, dtype_coercion, whitespace_as_null)
    }

    fn start(&self) -> Option<(usize, usize)> {
        self.start()
    }
}

/// Lets a calamine range of cells be used wherever a `CalamineDataProvider` is expected.
impl CalamineDataProvider for calamine::Range<calamine::Data> {
    fn width(&self) -> usize {
        self.width()
    }

    /// String content of the cell at `pos`; `None` when the range has no cell there or when
    /// the cell has no string form.
    fn get_as_string(&self, pos: (usize, usize)) -> Option<String> {
        let cell = self.get(pos)?;
        cell.as_string()
    }

    fn dtype_for_column(
        &self,
        start_row: usize,
        end_row: usize,
        col: usize,
        dtype_coercion: &DTypeCoercion,
        whitespace_as_null: bool,
    ) -> FastExcelResult<DType> {
        get_dtype_for_column(
            self,
            start_row,
            end_row,
            col,
            dtype_coercion,
            whitespace_as_null,
        )
    }

    /// Start of the range, with both components widened to `usize`.
    fn start(&self) -> Option<(usize, usize)> {
        let (row, col) = self.start()?;
        Some((row as usize, col as usize))
    }
}

/// Builds one dtype-less column description per column of `data`, naming each column
/// according to `header`.
///
/// - `Header::None`: every name is generated as `__UNNAMED__{index}`.
/// - `Header::At(row)`: names are read from that row of `data`; a column whose header cell
///   yields no string gets a generated name.
/// - `Header::With(names)`: the provided names are used. With an explicit column selection,
///   the n-th name goes to the column whose absolute index is the n-th selected index, and
///   every other column gets a generated name. Otherwise the names are assigned from the
///   first column onwards and the remaining columns get generated names.
///
/// # Errors
///
/// With provided names and an explicit selection, returns an invalid-parameters error when
/// the selection and the names differ in length, or when the selection contains a name
/// rather than an index.
fn column_info_from_header<D: CalamineDataProvider>(
    data: &D,
    selected_columns: &SelectedColumns,
    header: &Header,
) -> FastExcelResult<Vec<ColumnInfoNoDtype>> {
    let width = data.width();
    // Only the column component of the start matters here; 0 when there is no start.
    let col_off = data.start().map_or(0, |(_, first_col)| first_col);

    // Description of a column for which no name is available.
    let generated = |col_idx: usize| {
        ColumnInfoNoDtype::new(
            format!("__UNNAMED__{col_idx}"),
            col_idx,
            col_off + col_idx,
            ColumnNameFrom::Generated,
        )
    };
    // Description of a column named by the user.
    let provided = |col_idx: usize, name: String| {
        ColumnInfoNoDtype::new(name, col_idx, col_off + col_idx, ColumnNameFrom::Provided)
    };

    let names = match header {
        Header::None => return Ok((0..width).map(&generated).collect()),
        Header::At(row_idx) => {
            let looked_up = (0..width).map(|col_idx| {
                match data.get_as_string((*row_idx, col_idx)) {
                    // Remove null bytes from column names to avoid CString panics in Arrow
                    // FFI.
                    //
                    // Excel strings (especially UTF-16 in .xls) may contain embedded nulls
                    // (`\0`) after conversion to Rust `String`. Arrow's C FFI uses
                    // `CString::new()`, which fails on null bytes, causing panics.
                    //
                    // This strips nulls while keeping the readable content.
                    Some(col_name) => ColumnInfoNoDtype::new(
                        col_name.replace('\0', ""),
                        col_idx,
                        col_off + col_idx,
                        ColumnNameFrom::LookedUp,
                    ),
                    None => generated(col_idx),
                }
            });
            return Ok(looked_up.collect());
        }
        Header::With(names) => names,
    };

    let SelectedColumns::Selection(column_selection) = selected_columns else {
        // No explicit selection: the names apply to the leading columns, in order.
        let named = names
            .iter()
            .enumerate()
            .map(|(col_idx, name)| provided(col_idx, name.to_owned()));
        let unnamed = (names.len()..width).map(&generated);
        return Ok(named.chain(unnamed).collect());
    };

    if column_selection.len() != names.len() {
        return Err(FastExcelErrorKind::InvalidParameters(
            "column_names and use_columns must have the same length when a header is provided"
                .to_string(),
        )
        .into());
    }

    let mut selected_indices = Vec::with_capacity(names.len());
    for idx_or_name in column_selection.iter() {
        match idx_or_name {
            IdxOrName::Idx(idx) => selected_indices.push(*idx),
            IdxOrName::Name(name) => {
                return Err(FastExcelErrorKind::InvalidParameters(format!(
                    "use_columns can only contain integers when used with columns_names, got \"{name}\""
                ))
                .into());
            }
        }
    }

    Ok((0..width)
        .map(|col_idx| {
            // The selection is expressed in absolute column indices.
            let absolute_col_idx = col_idx + col_off;
            selected_indices
                .iter()
                .position(|idx| *idx == absolute_col_idx)
                .and_then(|pos_in_names| names.get(pos_in_names).cloned())
                .map_or_else(|| generated(col_idx), |name| provided(col_idx, name))
        })
        .collect())
}

/// Loads available columns and sets aliases in case of name conflicts
pub(crate) fn build_available_columns_info<D: CalamineDataProvider>(
    data: &D,
    selected_columns: &SelectedColumns,
    header: &Header,
) -> FastExcelResult<Vec<ColumnInfoNoDtype>> {
    column_info_from_header(data, selected_columns, header).map(set_aliases_for_columns_info)
}

/// Makes column names unique, in column order: a column whose name is already used by an
/// earlier column is renamed to the alias computed by `alias_for_name`.
fn set_aliases_for_columns_info(columns_info: Vec<ColumnInfoNoDtype>) -> Vec<ColumnInfoNoDtype> {
    let mut taken_names: Vec<String> = Vec::with_capacity(columns_info.len());
    let mut aliased_columns = Vec::with_capacity(columns_info.len());

    for column in columns_info {
        let alias = alias_for_name(column.name(), &taken_names);
        // Only touch the column when its name actually has to change.
        let column = if alias == column.name() {
            column
        } else {
            column.with_name(alias.clone())
        };
        taken_names.push(alias);
        aliased_columns.push(column);
    }

    aliased_columns
}

/// Returns the first of `name`, `{name}_1`, `{name}_2`, ... that is not in `existing_names`.
///
/// `name` itself is returned when it is not taken yet.
fn alias_for_name(name: &str, existing_names: &[String]) -> String {
    let is_taken = |candidate: &str| {
        existing_names
            .iter()
            .any(|existing_name| existing_name == candidate)
    };

    if !is_taken(name) {
        return name.to_owned();
    }

    // Iterating rather than recursing keeps the stack usage constant, however many
    // suffixes are already taken.
    let mut suffix: usize = 1;
    loop {
        let alias = format!("{name}_{suffix}");
        if !is_taken(&alias) {
            return alias;
        }
        suffix += 1;
    }
}

/// Turns every `ColumnInfoNoDtype` into a `ColumnInfo`, determining the right dtype for each
/// column when needed.
///
/// # Errors
///
/// Stops at, and returns, the first error raised while finishing a column.
pub(crate) fn finalize_column_info<D: CalamineDataProvider>(
    available_columns_info: Vec<ColumnInfoNoDtype>,
    data: &D,
    start_row: usize,
    end_row: usize,
    specified_dtypes: Option<&DTypes>,
    dtype_coercion: &DTypeCoercion,
    whitespace_as_null: bool,
) -> FastExcelResult<Vec<ColumnInfo>> {
    let mut columns = Vec::with_capacity(available_columns_info.len());
    for pending_column in available_columns_info {
        let column = pending_column.finish(
            data,
            start_row,
            end_row,
            specified_dtypes,
            dtype_coercion,
            whitespace_as_null,
        )?;
        columns.push(column);
    }
    Ok(columns)
}

/// Column information that may not have been computed yet.
#[derive(Debug)]
pub(crate) enum AvailableColumns {
    /// The columns are not available; `as_loaded` reports an internal error for this state.
    Pending,
    /// The columns, dtypes included.
    Loaded(Vec<ColumnInfo>),
}

impl AvailableColumns {
    /// Returns the loaded columns.
    ///
    /// # Errors
    ///
    /// Returns an internal error when the columns are still `Pending`.
    pub(crate) fn as_loaded(&self) -> FastExcelResult<&[ColumnInfo]> {
        let AvailableColumns::Loaded(column_infos) = self else {
            return Err(FastExcelErrorKind::Internal(format!(
                "Expected available columns to be loaded, got {self:?}. \
                    This is a bug, please report it to the fastexcel repository"
            ))
            .into());
        };
        Ok(column_infos.as_slice())
    }
}


================================================
FILE: src/types/excelsheet/column_info/python.rs
================================================
use arrow_schema::Field;
use pyo3::{PyResult, pymethods};

use crate::{
    error::py_errors::IntoPyResult,
    types::excelsheet::column_info::{ColumnInfo, ColumnInfoNoDtype},
};

/// Describes a column as an arrow field carrying the column's name and dtype.
impl From<&ColumnInfo> for Field {
    fn from(col_info: &ColumnInfo) -> Self {
        let data_type = (&col_info.dtype).into();
        // Every column is declared nullable.
        let nullable = true;
        Field::new(col_info.name.as_str(), data_type, nullable)
    }
}

#[pymethods]
impl ColumnInfo {
    /// Creates a new ColumnInfo object.
    ///
    /// - `name`: `str`. The name of the column
    /// - `index`: `int`. The index of the column. Must be >=0
    /// - `absolute_index`: `int`. The absolute index of the column. Must be >=0
    /// - `column_name_from`: `fastexcel.ColumnNameFrom`. The origin of the column name
    /// - `dtype`: `fastexcel.DType`. The dtype of the column
    /// - `dtype_from`: `fastexcel.DTypeFrom`. The origin of the dtype for the column
    #[new]
    pub(crate) fn py_new(
        name: String,
        index: usize,
        absolute_index: usize,
        column_name_from: &str,
        dtype: &str,
        dtype_from: &str,
    ) -> PyResult<Self> {
        Ok(Self::new(
            name,
            index,
            absolute_index,
            column_name_from.parse().into_pyresult()?,
            dtype.parse().into_pyresult()?,
            dtype_from.parse().into_pyresult()?,
        ))
    }

    /// `fastexcel.DType`. The dtype of the column
    #[getter(dtype)]
    fn get_dtype(&self) -> String {
        self.dtype.to_string()
    }

    #[getter("name")]
    /// `str`. The name of the column
    pub fn py_name(&self) -> &str {
        &self.name
    }

    #[getter("index")]
    /// `int`. The index of the column
    pub fn py_index(&self) -> usize {
        self.index
    }

    #[getter("absolute_index")]
    /// `int`. The absolute index of the column
    pub fn py_absolute_index(&self) -> usize {
        self.absolute_index
    }

    /// `fastexcel.ColumnNameFrom`. How the name of the column was determined.
    ///
    /// One of three possible values:
    /// - `"provided"`: The column name was provided via the `column_names` parameter
    /// - `"looked_up"`: The column name was looked up from the data found in the sheet
    /// - `"generated"`: The column name was generated from the column index, either because
    ///                  `header_row` was `None`, or because it could not be looked up
    #[getter(column_name_from)]
    fn get_column_name_from(&self) -> String {
        self.column_name_from.to_string()
    }

    /// `fastexcel.DTypeFrom`. How the dtype of the column was determined.
    ///
    /// One of four possible values:
    /// - `"provided_for_all"`: The dtype was specified for all columns at once
    /// - `"provided_by_index"`: The dtype was specified via the column index
    /// - `"provided_by_name"`: The dtype was specified via the column name
    /// - `"guessed"`: The dtype was determined from the content of the column
    #[getter(dtype_from)]
    fn get_dtype_from(&self) -> String {
        self.dtype_from.to_string()
    }

    pub fn __repr__(&self) -> String {
        format!(
            "ColumnInfo(name=\"{name}\", index={index}, absolute_index={absolute_index}, dtype=\"{dtype}\", dtype_from=\"{dtype_from}\", column_name_from=\"{column_name_from}\" )",
            name = self.name,
            index = self.index,
            absolute_index = self.absolute_index,
            dtype = self.dtype,
            dtype_from = self.dtype_from,
            column_name_from = self.column_name_from
        )
    }

    /// Two column infos are equal when all of their fields are equal.
    pub fn __eq__(&self, other: &Self) -> bool {
        self == other
    }
}

#[pymethods]
impl ColumnInfoNoDtype {
    /// `str`. The name of the column
    #[getter("name")]
    pub fn py_name(&self) -> &str {
        self.name.as_str()
    }

    /// `int`. The index of the column
    #[getter("index")]
    pub fn py_index(&self) -> usize {
        let Self { index, .. } = self;
        *index
    }

    /// `int`. The absolute index of the column
    #[getter("absolute_index")]
    pub fn py_absolute_index(&self) -> usize {
        let Self { absolute_index, .. } = self;
        *absolute_index
    }
}


================================================
FILE: src/types/excelsheet/mod.rs
================================================
pub(crate) mod column_info;
#[cfg(feature = "polars")]
mod polars;
#[cfg(feature = "python")]
mod python;
pub(crate) mod table;

#[cfg(feature = "python")]
use std::sync::Arc;
use std::{cmp, collections::HashSet, fmt::Debug, str::FromStr};

use calamine::{CellType, Range, Sheet as CalamineSheet, SheetVisible as CalamineSheetVisible};
use column_info::{AvailableColumns, ColumnInfoNoDtype};
#[cfg(feature = "polars")]
use polars_core::frame::DataFrame;
#[cfg(feature = "python")]
use pyo3::{Py, PyAny, Python, pyclass};

use self::column_info::{ColumnInfo, build_available_columns_info, finalize_column_info};
use crate::utils::schema::get_schema_sample_rows;
use crate::{
    LoadSheetOrTableOptions,
    data::{ExcelSheetData, FastExcelColumn},
    error::{ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult},
    types::{dtype::DTypes, idx_or_name::IdxOrName},
};
#[cfg(feature = "python")]
pub(crate) use python::{CellError, CellErrors};

/// Describes where the column names of a sheet come from.
#[derive(Debug)]
pub(crate) enum Header {
    /// Neither a header row nor explicit column names were given.
    None,
    /// The header is the row at the given index.
    At(usize),
    /// The column names were given explicitly.
    With(Vec<String>),
}

impl Header {
    /// Builds a [`Header`] from the optional header row and the optional explicit column
    /// names. Explicit `column_names` win over `header_row` when both are present.
    pub(crate) fn new(header_row: Option<usize>, column_names: Option<Vec<String>>) -> Self {
        match (column_names, header_row) {
            (Some(names), _) => Self::With(names),
            (None, Some(row_idx)) => Self::At(row_idx),
            (None, None) => Self::None,
        }
    }

    /// Index of the first row located after the header: `row + 1` for [`Header::At`],
    /// `0` for every other variant.
    pub(crate) fn offset(&self) -> usize {
        if let Self::At(row_idx) = self {
            row_idx + 1
        } else {
            0
        }
    }
}

/// Row pagination settings of a sheet: how rows are skipped and how many rows are requested.
///
/// `PartialEq`/`Eq` are only derived when the `python` feature is disabled.
#[derive(Debug, Clone)]
#[cfg_attr(not(feature = "python"), derive(PartialEq, Eq))]
pub(crate) struct Pagination {
    /// Row-skipping strategy.
    skip_rows: SkipRows,
    /// Requested number of rows, if any.
    n_rows: Option<usize>,
}

/// How rows should be skipped.
#[derive(Debug, Default, Clone)]
#[cfg_attr(not(feature = "python"), derive(PartialEq, Eq))]
pub enum SkipRows {
    /// Skip a fixed number of rows.
    Simple(usize),
    /// Skip rows based on a list of row indices.
    List(HashSet<usize>),
    /// Skip rows based on a Python object (only available with the `python` feature).
    #[cfg(feature = "python")]
    Callable(Arc<Py<PyAny>>),
    /// Skip empty rows at the beginning of the file (default).
    #[default]
    SkipEmptyRowsAtBeginning,
}

impl SkipRows {
    /// Returns the skip count when the strategy can be expressed as a plain offset.
    ///
    /// `Simple(n)` yields `Some(n)`, `SkipEmptyRowsAtBeginning` yields `Some(0)`, and every
    /// other strategy yields `None`.
    pub(crate) fn simple_offset(&self) -> Option<usize> {
        match self {
            Self::Simple(n_skipped) => Some(*n_skipped),
            // Let calamine's FirstNonEmptyRow handle it
            Self::SkipEmptyRowsAtBeginning => Some(0),
            Self::List(_) => None,
            #[cfg(feature = "python")]
            Self::Callable(_) => None,
        }
    }
}

impl Pagination {
    /// Creates a [`Pagination`], validating `skip_rows` against the height of `range`.
    ///
    /// # Errors
    ///
    /// Returns `InvalidParameters` when `skip_rows` is `SkipRows::Simple(n)` and `n` is
    /// greater than `range.height()`. Other skip strategies are not validated here.
    pub(crate) fn try_new<CT: CellType>(
        skip_rows: SkipRows,
        n_rows: Option<usize>,
        range: &Range<CT>,
    ) -> FastExcelResult<Self> {
        let max_height = range.height();
        // Only the fixed-count strategy can be checked up front
        let skips_too_many =
            matches!(&skip_rows, SkipRows::Simple(skip_count) if *skip_count > max_height);

        if skips_too_many {
            return Err(FastExcelErrorKind::InvalidParameters(format!(
                "Too many rows skipped. Max height is {max_height}"
            ))
            .into());
        }

        Ok(Self { skip_rows, n_rows })
    }

    /// Number of rows skipped as a plain offset, or `0` when the skip strategy has no
    /// simple offset.
    pub(crate) fn offset(&self) -> usize {
        self.skip_rows.simple_offset().unwrap_or_default()
    }

    /// Requested number of rows, if any.
    pub(crate) fn n_rows(&self) -> Option<usize> {
        self.n_rows
    }

    /// Row-skipping strategy.
    pub(crate) fn skip_rows(&self) -> &SkipRows {
        &self.skip_rows
    }
}

/// Which columns of a sheet should be kept.
#[derive(Default)]
pub enum SelectedColumns {
    /// Keep every available column (default).
    #[default]
    All,
    /// Keep the listed columns, identified by index or by name.
    Selection(Vec<IdxOrName>),
    /// Keep the columns for which a Python callable returns `True`
    /// (only available with the `python` feature).
    #[cfg(feature = "python")]
    DynamicSelection(Py<PyAny>),
    /// A selection containing ranges whose bounds depend on the columns actually available;
    /// it is turned into a concrete `Selection` when columns are selected.
    DeferredSelection(Vec<DeferredColumnSelection>),
}

/// One element of a deferred column selection.
#[derive(Debug, Clone, PartialEq)]
pub enum DeferredColumnSelection {
    /// A single, already known column.
    Fixed(IdxOrName),
    /// start column index, end is determined by sheet width
    OpenEndedRange(usize),
    /// end column index, start is 0
    FromBeginningRange(usize),
}

/// Manual `Debug` implementation: the Python callable of `DynamicSelection` is rendered as
/// the address of its Rust handle rather than through a `Debug` implementation.
impl std::fmt::Debug for SelectedColumns {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::All => f.write_str("All"),
            Self::Selection(columns) => write!(f, "Selection({columns:?})"),
            #[cfg(feature = "python")]
            Self::DynamicSelection(callable) => {
                let handle_addr = callable as *const Py<PyAny> as usize;
                write!(f, "DynamicSelection({handle_addr})")
            }
            Self::DeferredSelection(ranges) => write!(f, "DeferredSelection({ranges:?})"),
        }
    }
}

/// Equality between column selections.
///
/// `All` only equals `All`; `Selection` and `DeferredSelection` compare their contents.
/// Two `DynamicSelection`s are equal when they wrap the *same Python object* (identity
/// comparison of the underlying object pointers), since callables have no structural
/// equality. Selections of different kinds are never equal.
impl PartialEq for SelectedColumns {
    fn eq(&self, other: &Self) -> bool {
        match (self, other) {
            (Self::All, Self::All) => true,
            (Self::Selection(selection), Self::Selection(other_selection)) => {
                selection == other_selection
            }
            #[cfg(feature = "python")]
            // Compare the Python object pointers, not the addresses of the Rust handles:
            // two distinct handles may refer to the very same callable.
            (Self::DynamicSelection(f1), Self::DynamicSelection(f2)) => {
                f1.as_ptr() == f2.as_ptr()
            }
            (Self::DeferredSelection(deferred1), Self::DeferredSelection(deferred2)) => {
                deferred1 == deferred2
            }
            _ => false,
        }
    }
}

/// Turns a deferred selection into a concrete list of columns.
///
/// - `Fixed` entries are copied as they are.
/// - `OpenEndedRange(start)` expands to every index in `start..=max_col_index`
///   (nothing when `start` is greater than `max_col_index`).
/// - `FromBeginningRange(end)` expands to every index from `0` to `end` inclusive, with
///   `end` clamped to `max_col_index`.
///
/// Entries are emitted in the order of `deferred_selection`; duplicates are not removed.
pub(crate) fn deferred_selection_to_concrete(
    deferred_selection: &[DeferredColumnSelection],
    max_col_index: usize,
) -> Vec<IdxOrName> {
    let mut concrete = Vec::new();

    for selection in deferred_selection {
        // Ranges are reduced to inclusive bounds; fixed entries are pushed directly
        let (first, last) = match selection {
            DeferredColumnSelection::Fixed(column) => {
                concrete.push(column.clone());
                continue;
            }
            DeferredColumnSelection::OpenEndedRange(start_idx) => (*start_idx, max_col_index),
            DeferredColumnSelection::FromBeginningRange(end_idx) => {
                (0, (*end_idx).min(max_col_index))
            }
        };
        concrete.extend((first..=last).map(IdxOrName::Idx));
    }

    concrete
}

impl SelectedColumns {
    /// Applies the selection to `available_columns` and returns the columns that were kept.
    ///
    /// - `All` returns `available_columns` untouched.
    /// - `Selection` keeps the listed columns, in the order of the selection. Indices are
    ///   matched against the columns' absolute index, names against the columns' name.
    /// - `DynamicSelection` calls the Python callable once per column and keeps the columns
    ///   for which it returns `True`.
    /// - `DeferredSelection` is first resolved against the absolute index of the last
    ///   available column, then handled like a `Selection`.
    ///
    /// # Errors
    ///
    /// - `ColumnNotFound` when a selected index or name matches no available column.
    /// - `InvalidParameters` when the Python callable fails or does not return a boolean.
    pub(super) fn select_columns(
        &self,
        available_columns: Vec<ColumnInfoNoDtype>,
    ) -> FastExcelResult<Vec<ColumnInfoNoDtype>> {
        match self {
            SelectedColumns::All => Ok(available_columns),
            SelectedColumns::Selection(selection) => {
                let selected_indices: Vec<usize> = selection
                    .iter()
                    .map(|selected_column| {
                        match selected_column {
                            IdxOrName::Idx(index) => available_columns
                                .iter()
                                // Sheets have absolute column names (A, B, C, ...)
                                .position(|col_info| &col_info.absolute_index() == index),
                            IdxOrName::Name(name) => available_columns
                                .iter()
                                .position(|col_info| col_info.name() == name.as_str()),
                        }
                        .ok_or_else(|| {
                            FastExcelErrorKind::ColumnNotFound(selected_column.clone()).into()
                        })
                        .with_context(|| format!("available columns are: {available_columns:?}"))
                    })
                    .collect::<FastExcelResult<_>>()?;

                // We need to sort `available_columns` based on the order of the provided selection.
                // First, we associate every element in the Vec with its position in the selection,
                // and we filter out unselected columns
                let mut cols: Vec<(usize, ColumnInfoNoDtype)> = available_columns
                    .into_iter()
                    .enumerate()
                    .filter_map(|(idx, elem)| {
                        selected_indices
                            .iter()
                            .position(|selected_idx| *selected_idx == idx)
                            .map(|position| (position, elem))
                    })
                    .collect();
                // Then, we sort the columns based on their position in the selection
                cols.sort_by_key(|(pos, _elem)| *pos);

                // And finally, we drop the positions
                Ok(cols.into_iter().map(|(_pos, elem)| elem).collect())
            }
            #[cfg(feature = "python")]
            SelectedColumns::DynamicSelection(use_col_func) => Python::attach(|py| {
                available_columns
                    .into_iter()
                    .filter_map(
                        |col_info| match use_col_func.call1(py, (col_info.clone(),)) {
                            Err(err) => Some(Err(FastExcelErrorKind::InvalidParameters(format!(
                                "`use_columns` callable could not be called ({err})"
                            ))
                            .into())),
                            Ok(should_use_col) => match should_use_col.extract::<bool>(py) {
                                Err(_) => Some(Err(FastExcelErrorKind::InvalidParameters(
                                    "`use_columns` callable should return a boolean".to_string(),
                                )
                                .into())),
                                Ok(true) => Some(Ok(col_info)),
                                Ok(false) => None,
                            },
                        },
                    )
                    .collect()
            }),
            SelectedColumns::DeferredSelection(deferred_selection) => {
                // Open-ended ranges stop at the absolute index of the last available column
                let max_col_index = available_columns
                    .last()
                    .map_or(0, |col| col.absolute_index());
                let concrete_selection = SelectedColumns::Selection(
                    deferred_selection_to_concrete(deferred_selection, max_col_index),
                );

                concrete_selection.select_columns(available_columns)
            }
        }
    }

    /// Letters accepted in a column name; a letter's position is its zero-based value.
    const ALPHABET: [char; 26] = [
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
        'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    ];

    /// Converts an uppercase column name (`"A"`, `"AB"`, ...) into its zero-based index.
    ///
    /// # Errors
    ///
    /// Returns `InvalidParameters` when `col` is empty, contains a character outside of
    /// `A..=Z`, or is so long that its index does not fit in a `usize`.
    fn col_idx_for_col_as_letter(col: &str) -> FastExcelResult<usize> {
        use FastExcelErrorKind::InvalidParameters;

        if col.is_empty() {
            return Err(InvalidParameters(
                "a column should have at least one character, got none".to_string(),
            )
            .into());
        }

        // The input comes from the user: an overly long name must be reported as an error
        // rather than overflowing (panic in debug builds, silent wrap-around in release)
        let overflow_error = || {
            FastExcelError::from(InvalidParameters(format!(
                "column name is too long to be converted to an index: {col}"
            )))
        };

        col.chars()
            //  iterating over all chars reversed, to have a power based on their rank
            .rev()
            .enumerate()
            //  Parses every char, checks its position and accumulates its numeric equivalent
            //  based on its rank. For example, AB becomes 27 (26 + 1)
            .try_fold(0usize, |acc, (idx, col_chr)| {
                let pos_in_alphabet = Self::ALPHABET
                    .iter()
                    .position(|chr| chr == &col_chr)
                    .ok_or_else(|| {
                        FastExcelError::from(InvalidParameters(format!(
                            "Char is not a valid column name: {col_chr}"
                        )))
                    })?;

                let rank_value = match idx {
                    // in case it's the last char, just return its position
                    0 => pos_in_alphabet,
                    // otherwise, 26^idx * (position + 1)
                    // For example, CBA is 2080:
                    // A -> 0
                    // B -> 52 (26^1 * (1 + 1))
                    // C -> 2028 (26^2 * (2 + 1))
                    _ => u32::try_from(idx)
                        .ok()
                        .and_then(|exponent| 26usize.checked_pow(exponent))
                        .and_then(|weight| weight.checked_mul(pos_in_alphabet + 1))
                        .ok_or_else(|| overflow_error())?,
                };

                acc.checked_add(rank_value).ok_or_else(|| overflow_error())
            })
    }

    /// Parses a closed letter range (`"B:E"`) into the list of column indices it covers,
    /// both bounds included.
    ///
    /// # Errors
    ///
    /// Returns `InvalidParameters` when the range does not have exactly two elements, when
    /// a bound is not a valid column name, when the end is empty (open-ended ranges are
    /// handled by `col_selection_for_letter_range`), when the end is before the start, or
    /// when start and end are equal.
    fn col_indices_for_letter_range(col_range: &str) -> FastExcelResult<Vec<usize>> {
        use FastExcelErrorKind::InvalidParameters;

        let col_elements = col_range.split(':').collect::<Vec<_>>();
        if col_elements.len() == 2 {
            let start = Self::col_idx_for_col_as_letter(col_elements[0])
                .with_context(|| format!("invalid start element for range \"{col_range}\""))?;

            // Check if this is an open-ended range (empty end element)
            if col_elements[1].is_empty() {
                // For open-ended ranges, we can't return concrete indices yet
                // This will be handled differently in the parsing logic
                return Err(InvalidParameters(format!(
                    "open-ended range detected: \"{col_range}\". This should be handled by col_selection_for_letter_range"
                ))
                .into());
            }

            let end = Self::col_idx_for_col_as_letter(col_elements[1])
                .with_context(|| format!("invalid end element for range \"{col_range}\""))?;

            match start.cmp(&end) {
                cmp::Ordering::Less => Ok((start..=end).collect()),
                cmp::Ordering::Greater => Err(InvalidParameters(format!(
                    "end of range is before start: \"{col_range}\""
                ))
                .into()),
                cmp::Ordering::Equal => {
                    Err(InvalidParameters(format!("empty range: \"{col_range}\"")).into())
                }
            }
        } else {
            Err(InvalidParameters(format!(
                "expected range to contain exactly 2 elements, got {n_elements}: \"{col_range}\"",
                n_elements = col_elements.len()
            ))
            .into())
        }
    }

    /// Parses a letter range that may be open on one side into deferred selections.
    ///
    /// - `":E"` becomes a single `FromBeginningRange`.
    /// - `"C:"` becomes a single `OpenEndedRange`.
    /// - `"B:E"` becomes one `Fixed` entry per column, both bounds included.
    ///
    /// # Errors
    ///
    /// Returns `InvalidParameters` when the range does not have exactly two elements, when
    /// both bounds are empty, when a bound is not a valid column name, when the end is
    /// before the start, or when start and end are equal.
    fn col_selection_for_letter_range(
        col_range: &str,
    ) -> FastExcelResult<Vec<DeferredColumnSelection>> {
        use FastExcelErrorKind::InvalidParameters;

        let col_elements = col_range.split(':').collect::<Vec<_>>();
        if col_elements.len() == 2 {
            // Check if this is a from-beginning range (empty start element)
            if col_elements[0].is_empty() {
                if col_elements[1].is_empty() {
                    return Err(InvalidParameters(format!(
                        "cannot have both start and end empty in range: \"{col_range}\""
                    ))
                    .into());
                }
                let end = Self::col_idx_for_col_as_letter(col_elements[1])
                    .with_context(|| format!("invalid end element for range \"{col_range}\""))?;
                return Ok(vec![DeferredColumnSelection::FromBeginningRange(end)]);
            }

            let start = Self::col_idx_for_col_as_letter(col_elements[0])
                .with_context(|| format!("invalid start element for range \"{col_range}\""))?;

            // Check if this is an open-ended range (empty end element)
            if col_elements[1].is_empty() {
                return Ok(vec![DeferredColumnSelection::OpenEndedRange(start)]);
            }

            let end = Self::col_idx_for_col_as_letter(col_elements[1])
                .with_context(|| format!("invalid end element for range \"{col_range}\""))?;

            match start.cmp(&end) {
                cmp::Ordering::Less => Ok((start..=end)
                    .map(|idx| DeferredColumnSelection::Fixed(IdxOrName::Idx(idx)))
                    .collect()),
                cmp::Ordering::Greater => Err(InvalidParameters(format!(
                    "end of range is before start: \"{col_range}\""
                ))
                .into()),
                cmp::Ordering::Equal => {
                    Err(InvalidParameters(format!("empty range: \"{col_range}\"")).into())
                }
            }
        } else {
            Err(InvalidParameters(format!(
                "expected range to contain exactly 2 elements, got {n_elements}: \"{col_range}\"",
                n_elements = col_elements.len()
            ))
            .into())
        }
    }
}

/// Parses a comma-separated list of column letters and letter ranges, such as `"A,C:E"`
/// or `"A,C:"`. Parsing is case-insensitive for ASCII letters.
///
/// - When at least one part is open on one side (`"C:"` or `":C"`), the result is a
///   `DeferredSelection` that keeps the parts in the order they were written.
/// - Otherwise the result is a `Selection` of unique column indices in ascending order.
///
/// The first invalid part makes the whole parsing fail with its error.
impl FromStr for SelectedColumns {
    type Err = FastExcelError;

    fn from_str(s: &str) -> FastExcelResult<Self> {
        // Column letters are ASCII only. A Unicode-aware uppercase would turn characters
        // such as "ß" into "SS" and silently accept them as column names.
        let uppercase_s = s.to_ascii_uppercase();
        let parts: Vec<&str> = uppercase_s.split(',').collect();
        let has_open_ended = parts
            .iter()
            .any(|part| part.starts_with(':') || part.ends_with(':'));

        if has_open_ended {
            // Use deferred selection logic
            let mut deferred_selections = Vec::new();
            for part in &parts {
                if part.contains(':') {
                    deferred_selections.extend(Self::col_selection_for_letter_range(part)?);
                } else {
                    let idx = Self::col_idx_for_col_as_letter(part)?;
                    deferred_selections.push(DeferredColumnSelection::Fixed(IdxOrName::Idx(idx)));
                }
            }
            Ok(Self::DeferredSelection(deferred_selections))
        } else {
            // Use the original immediate resolution logic for backwards compatibility
            let mut col_indices: Vec<usize> = Vec::new();
            for col_or_range in &parts {
                if col_or_range.contains(':') {
                    col_indices.extend(Self::col_indices_for_letter_range(col_or_range)?);
                } else {
                    col_indices.push(Self::col_idx_for_col_as_letter(col_or_range)?);
                }
            }
            // Overlapping parts ("A:C,B:E") must yield each column once, in ascending order
            col_indices.sort_unstable();
            col_indices.dedup();
            Ok(Self::Selection(
                col_indices.into_iter().map(IdxOrName::Idx).collect(),
            ))
        }
    }
}

/// Visibility of a sheet.
///
/// Mirrors the three variants of calamine's `SheetVisible`, from which it can be converted.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SheetVisible {
    /// Converted from calamine's `Visible`.
    Visible,
    /// Converted from calamine's `Hidden`.
    Hidden,
    /// Converted from calamine's `VeryHidden`.
    VeryHidden,
}

/// One-to-one conversion from calamine's sheet visibility.
impl From<CalamineSheetVisible> for SheetVisible {
    fn from(visibility: CalamineSheetVisible) -> Self {
        match visibility {
            CalamineSheetVisible::Visible => Self::Visible,
            CalamineSheetVisible::Hidden => Self::Hidden,
            CalamineSheetVisible::VeryHidden => Self::VeryHidden,
        }
    }
}

/// A single sheet in an Excel file.
#[derive(Debug)]
#[cfg_attr(feature = "python", pyclass(name = "_ExcelSheet"))]
pub struct ExcelSheet {
    // calamine metadata of the sheet; source of its name and visibility
    sheet_meta: CalamineSheet,
    // Where the column names come from
    header: Header,
    // Row skipping strategy and requested number of rows
    pagination: Pagination,
    // Cell data of the sheet
    data: ExcelSheetData<'static>,
    // Cached result of `height()`; `None` until first computed
    height: Option<usize>,
    // Cached result of `total_height()`; `None` until first computed
    total_height: Option<usize>,
    // Cached result of `width()`; `None` until first computed
    width: Option<usize>,
    // Upper row bound used when reading data; computed once at construction
    limit: usize,
    // Options the sheet was loaded with
    opts: LoadSheetOrTableOptions,
    // Selected columns, with their dtype resolved at construction
    selected_columns: Vec<ColumnInfo>,
    // Every available column; loaded lazily on first access
    available_columns: AvailableColumns,
}

impl ExcelSheet {
    /// Cell data of the sheet.
    pub(crate) fn data(&self) -> &ExcelSheetData<'_> {
        &self.data
    }

    /// Builds a sheet from its metadata, its data and the loading options.
    ///
    /// Columns are selected according to `opts.selected_columns`, pagination is validated
    /// against the data, the row limit is computed, and the dtype of every selected column
    /// is resolved using at most `schema_sample_rows()` rows.
    ///
    /// # Errors
    ///
    /// Propagates any error raised while building or selecting columns, validating the
    /// pagination, or finalizing the column info.
    pub(crate) fn try_new(
        sheet_meta: CalamineSheet,
        data: ExcelSheetData<'static>,
        opts: LoadSheetOrTableOptions,
    ) -> FastExcelResult<Self> {
        let header = Header::new(opts.data_header_row(), opts.column_names.clone());
        let available_columns_info =
            build_available_columns_info(&data, &opts.selected_columns, &header)?;
        let selected_columns_info = opts
            .selected_columns
            .select_columns(available_columns_info)?;

        let pagination = match &data {
            ExcelSheetData::Owned(range) => {
                Pagination::try_new(opts.skip_rows.clone(), opts.n_rows, range)?
            }
            ExcelSheetData::Ref(range) => {
                Pagination::try_new(opts.skip_rows.clone(), opts.n_rows, range)?
            }
        };

        let mut sheet = ExcelSheet {
            sheet_meta,
            header,
            pagination,
            data,
            opts,
            height: None,
            total_height: None,
            width: None,
            // Will be replaced
            limit: 0,
            available_columns: AvailableColumns::Pending,
            // Empty vec as It'll be replaced
            selected_columns: Vec::with_capacity(0),
        };
        sheet.limit = sheet.compute_limit();

        // Finalizing column info (figure out dtypes for every column)
        let row_limit = sheet.schema_sample_rows();
        let selected_columns = finalize_column_info(
            selected_columns_info,
            &sheet.data,
            sheet.offset(),
            row_limit,
            sheet.opts.dtypes.as_ref(),
            &sheet.opts.dtype_coercion,
            sheet.opts.whitespace_as_null,
        )?;

        sheet.selected_columns = selected_columns;

        Ok(sheet)
    }

    /// Loads the info of every available column if that has not been done yet.
    ///
    /// Does nothing when the columns are already loaded.
    fn ensure_available_columns_loaded(&mut self) -> FastExcelResult<()> {
        let available_columns = match &self.available_columns {
            AvailableColumns::Pending => {
                let available_columns_info = build_available_columns_info(
                    &self.data,
                    &self.opts.selected_columns,
                    &self.header,
                )?;
                // NOTE(review): the row limit used here is `limit()`, whereas `try_new`
                // uses `schema_sample_rows()` for the selected columns. Confirm that this
                // difference is intended.
                let final_info = finalize_column_info(
                    available_columns_info,
                    self.data(),
                    self.offset(),
                    self.limit(),
                    self.opts.dtypes.as_ref(),
                    &self.opts.dtype_coercion,
                    self.opts.whitespace_as_null,
                )?;
                AvailableColumns::Loaded(final_info)
            }
            AvailableColumns::Loaded(_) => return Ok(()),
        };

        self.available_columns = available_columns;
        Ok(())
    }

    /// Loads the available columns when needed and returns them.
    fn load_available_columns(&mut self) -> FastExcelResult<&[ColumnInfo]> {
        self.ensure_available_columns_loaded()?;
        self.available_columns.as_loaded()
    }

    /// Computes the upper row bound used when reading data.
    ///
    /// The bound is the data height (without trailing whitespace rows when
    /// `skip_whitespace_tail_rows` is set), lowered to `offset() + n_rows` when `n_rows`
    /// is set and that value is smaller.
    fn compute_limit(&self) -> usize {
        let upper_bound = if self.opts.skip_whitespace_tail_rows {
            self.data.height_without_tail_whitespace()
        } else {
            self.data.height()
        };

        match self.pagination.n_rows {
            // Saturating: a huge `n_rows` simply means "up to the end of the data"
            Some(n_rows) => self.offset().saturating_add(n_rows).min(upper_bound),
            None => upper_bound,
        }
    }

    /// Upper row bound used when reading data.
    pub(crate) fn limit(&self) -> usize {
        self.limit
    }

    /// Number of rows to sample for dtype inference, as computed by
    /// `get_schema_sample_rows` from the `schema_sample_rows` option, `offset()` and
    /// `limit()`.
    pub(crate) fn schema_sample_rows(&self) -> usize {
        get_schema_sample_rows(self.opts.schema_sample_rows, self.offset(), self.limit())
    }

    /// Width of the sheet's data. The value is computed once and cached.
    pub fn width(&mut self) -> usize {
        if let Some(width) = self.width {
            return width;
        }

        let width = self.data.width();
        self.width = Some(width);
        width
    }

    /// Number of rows selected for reading. The value is computed once and cached.
    ///
    /// It is the length of the row selector generated from the skip strategy; when no
    /// selector can be generated, it falls back to `limit() - offset()`, floored at `0`.
    pub fn height(&mut self) -> usize {
        use crate::data::generate_row_selector;

        if let Some(height) = self.height {
            return height;
        }

        let height =
            generate_row_selector(self.pagination.skip_rows(), self.offset(), self.limit())
                .map(|selector| selector.len())
                // Saturating: the offset may be past the limit
                .unwrap_or_else(|_| self.limit().saturating_sub(self.offset()));
        self.height = Some(height);
        height
    }

    /// Height of the data minus the header offset, floored at `0`. The value is computed
    /// once and cached.
    pub fn total_height(&mut self) -> usize {
        if let Some(total_height) = self.total_height {
            return total_height;
        }

        // Saturating: the header offset may exceed the data height (e.g. an empty sheet
        // with a header row), which must not underflow
        let total_height = self.data.height().saturating_sub(self.header.offset());
        self.total_height = Some(total_height);
        total_height
    }

    /// Row offset of the data: the header offset plus the pagination offset.
    pub fn offset(&self) -> usize {
        self.header.offset() + self.pagination.offset()
    }

    /// Columns selected for this sheet, with their resolved dtype.
    pub fn selected_columns(&self) -> &Vec<ColumnInfo> {
        &self.selected_columns
    }

    /// Every available column of the sheet, loaded on first call.
    ///
    /// # Errors
    ///
    /// Propagates any error raised while loading the available columns.
    pub fn available_columns(&mut self) -> FastExcelResult<Vec<ColumnInfo>> {
        self.load_available_columns().map(|cols| cols.to_vec())
    }

    /// Dtypes given in the loading options, if any.
    pub fn specified_dtypes(&self) -> Option<&DTypes> {
        self.opts.dtypes.as_ref()
    }

    /// Name of the sheet.
    pub fn name(&self) -> &str {
        &self.sheet_meta.name
    }

    /// Visibility of the sheet.
    pub fn visible(&self) -> SheetVisible {
        self.sheet_meta.visible.into()
    }

    /// Builds one [`FastExcelColumn`] per selected column, using the sheet's `offset()`,
    /// `limit()` and `whitespace_as_null` option.
    ///
    /// # Errors
    ///
    /// Fails with the first error raised while building a column.
    pub fn to_columns(&self) -> FastExcelResult<Vec<FastExcelColumn>> {
        let offset = self.offset();
        let limit = self.limit();
        let whitespace_as_null = self.opts.whitespace_as_null;

        self.selected_columns
            .iter()
            .map(|column_info| match self.data() {
                ExcelSheetData::Owned(range) => FastExcelColumn::try_from_column_info(
                    column_info,
                    range,
                    offset,
                    limit,
                    whitespace_as_null,
                ),
                ExcelSheetData::Ref(range) => FastExcelColumn::try_from_column_info(
                    column_info,
                    range,
                    offset,
                    limit,
                    whitespace_as_null,
                ),
            })
            .collect()
    }

    /// Converts the selected columns into a polars `DataFrame`.
    ///
    /// # Errors
    ///
    /// Propagates column building errors, and returns an `Internal` error when the
    /// `DataFrame` cannot be created.
    #[cfg(feature = "polars")]
    pub fn to_polars(&self) -> FastExcelResult<DataFrame> {
        let pl_columns = self.to_columns()?.into_iter().map(Into::into).collect();
        DataFrame::new_infer_height(pl_columns).map_err(|err| {
            FastExcelErrorKind::Internal(format!("could not create DataFrame: {err:?}")).into()
        })
    }
}

// Tests for the conversion of Python objects into `SelectedColumns`. They need an embedded
// Python interpreter, hence the dedicated `__pyo3-tests` feature.
#[cfg(feature = "__pyo3-tests")]
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;
    use pyo3::{
        prelude::PyListMethods,
        types::{PyList, PyString},
    };
    use rstest::rstest;

    // No selection at all means every column is selected
    #[test]
    fn selected_columns_from_none() {
        assert_eq!(
            TryInto::<SelectedColumns>::try_into(None).unwrap(),
            SelectedColumns::All
        )
    }

    // A list of non-negative ints becomes a selection by index
    #[test]
    fn selected_columns_from_list_of_valid_ints() {
        Python::attach(|py| {
            let py_list = PyList::new(py, vec![0, 1, 2]).expect("could not create PyList");
            assert_eq!(
                TryInto::<SelectedColumns>::try_into(Some(py_list.as_ref())).unwrap(),
                SelectedColumns::Selection([0, 1, 2].into_iter().map(IdxOrName::Idx).collect())
            )
        });
    }

    // A list of strings becomes a selection by name
    #[test]
    fn selected_columns_from_list_of_valid_strings() {
        Python::attach(|py| {
            let py_list = PyList::new(py, vec!["foo", "bar"]).expect("could not create PyList");
            assert_eq!(
                TryInto::<SelectedColumns>::try_into(Some(py_list.as_ref())).unwrap(),
                SelectedColumns::Selection(
                    ["foo", "bar"]
                        .iter()
                        .map(ToString::to_string)
                        .map(IdxOrName::Name)
                        .collect()
                )
            )
        });
    }

    // Names and indices can be mixed, and the order of the list is preserved
    #[test]
    fn selected_columns_from_list_of_valid_strings_and_ints() {
        Python::attach(|py| {
            let py_list = PyList::new(py, vec!["foo", "bar"]).expect("could not create PyList");
            py_list.append(42).unwrap();
            py_list.append(5).unwrap();
            assert_eq!(
                TryInto::<SelectedColumns>::try_into(Some(py_list.as_ref())).unwrap(),
                SelectedColumns::Selection(vec![
                    IdxOrName::Name("foo".to_string()),
                    IdxOrName::Name("bar".to_string()),
                    IdxOrName::Idx(42),
                    IdxOrName::Idx(5)
                ])
            )
        });
    }

    // Negative indices are rejected
    #[test]
    fn selected_columns_from_invalid_ints() {
        Python::attach(|py| {
            let py_list = PyList::new(py, vec![0, 2, -1]).expect("could not create PyList");
            let err = TryInto::<SelectedColumns>::try_into(Some(py_list.as_ref())).unwrap_err();

            assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_)));
        });
    }

    // An empty list is rejected
    #[test]
    fn selected_columns_from_empty_int_list() {
        Python::attach(|py| {
            let py_list = PyList::new(py, Vec::<usize>::new()).expect("could not create PyList");
            let err = TryInto::<SelectedColumns>::try_into(Some(py_list.as_ref())).unwrap_err();

            assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_)));
        });
    }

    // An empty list is rejected, whatever the element type it was built from
    #[test]
    fn selected_columns_from_empty_string_list() {
        Python::attach(|py| {
            let py_list = PyList::new(py, Vec::<String>::new()).expect("could not create PyList");
            let err = TryInto::<SelectedColumns>::try_into(Some(py_list.as_ref())).unwrap_err();

            assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_)));
        });
    }

    // Closed letter selections resolve immediately to sorted, unique column indices
    #[rstest]
    // Standard unique columns
    #[case("A,B,D", vec![0, 1, 3])]
    // Standard unique columns + range
    #[case("A,B:E,Y", vec![0, 1, 2, 3, 4, 24])]
    // Standard unique column + ranges with mixed case
    #[case("A:c,b:E,w,Y:z", vec![0, 1, 2, 3, 4, 22, 24, 25])]
    // Ranges beyond Z
    #[case("A,y:AB", vec![0, 24, 25, 26, 27])]
    #[case("BB:BE,DDC:DDF", vec![53, 54, 55, 56, 2810, 2811, 2812, 2813])]
    fn selected_columns_from_valid_ranges(#[case] raw: &str, #[case] expected_indices: Vec<usize>) {
        Python::attach(|py| {
            let expected_range = SelectedColumns::Selection(
                expected_indices.into_iter().map(IdxOrName::Idx).collect(),
            );
            let input = PyString::new(py, raw);

            let range = TryInto::<SelectedColumns>::try_into(Some(input.as_ref()))
                .expect("expected a valid column selection");

            assert_eq!(range, expected_range)
        })
    }

    // Selections with a range open on one side are kept as deferred selections
    #[rstest]
    #[case("B:")]
    #[case("A,C:")]
    #[case("A:")]
    #[case(":E")]
    #[case(":C")]
    #[case(":A")]
    #[case(":C,E:")]
    fn selected_columns_from_valid_open_ended_ranges(#[case] raw: &str) {
        Python::attach(|py| {
            let input = PyString::new(py, raw);

            let range = TryInto::<SelectedColumns>::try_into(Some(input.as_ref()))
                .expect("expected a valid column selection");

            assert!(matches!(range, SelectedColumns::DeferredSelection(_)));
        })
    }

    // Invalid selections fail with `InvalidParameters` and a message naming the problem
    #[rstest]
    // empty selection string
    #[case("", "at least one character")]
    // empty range
    #[case("a:a,b:d,e", "empty range")]
    // end before start
    #[case("b:a", "end of range is before start")]
    // both start and end empty
    #[case(":", "cannot have both start and end empty")]
    // too many elements
    #[case("a:b:e", "exactly 2 elements, got 3")]
    fn selected_columns_from_invalid_ranges(#[case] raw: &str, #[case] message: &str) {
        Python::attach(|py| {
            let input = PyString::new(py, raw);

            let err = TryInto::<SelectedColumns>::try_into(Some(input.as_ref()))
                .expect_err("expected an error");

            match err.kind {
                FastExcelErrorKind::InvalidParameters(detail) => {
                    if !detail.contains(message) {
                        panic!("expected \"{detail}\" to contain \"{message}\"")
                    }
                }
                _ => panic!("Expected error to be InvalidParameters, got {err:?}"),
            }
        })
    }
}


================================================
FILE: src/types/excelsheet/polars.rs
================================================
use crate::{FastExcelColumn, FastExcelSeries};
use polars_core::{
    frame::column::{Column as PolarsColumn, ScalarColumn},
    prelude::DataType,
    scalar::Scalar,
};

/// Conversion of a fastexcel column into a polars column carrying the same name.
impl From<FastExcelColumn> for PolarsColumn {
    /// Typed series are handed to `PolarsColumn::new` together with their values. A `Null` series
    /// has no values to hand over, so it becomes a scalar column holding a null of
    /// `DataType::Null`, with the length reported by `column.len()`.
    fn from(column: FastExcelColumn) -> Self {
        let column_name = column.name().into();
        match column.data {
            FastExcelSeries::Bool(values) => Self::new(column_name, values),
            FastExcelSeries::String(values) => Self::new(column_name, values),
            FastExcelSeries::Int(values) => Self::new(column_name, values),
            FastExcelSeries::Float(values) => Self::new(column_name, values),
            FastExcelSeries::Datetime(values) => Self::new(column_name, values),
            FastExcelSeries::Date(values) => Self::new(column_name, values),
            FastExcelSeries::Duration(values) => Self::new(column_name, values),
            FastExcelSeries::Null => {
                let null_scalar = Scalar::null(DataType::Null);
                Self::Scalar(ScalarColumn::new(column_name, null_scalar, column.len()))
            }
        }
    }
}


================================================
FILE: src/types/excelsheet/python.rs
================================================
use std::{collections::HashSet, sync::Arc};

use arrow_array::{RecordBatch, StructArray};
use arrow_schema::Field;
use pyo3::{
    Borrowed, Bound, FromPyObject, IntoPyObject, Py, PyAny, PyErr, PyResult, Python, pyclass,
    pymethods,
    types::{PyAnyMethods, PyCapsule, PyList, PyListMethods, PyString, PyTuple},
};
use pyo3_arrow::ffi::{to_array_pycapsules, to_schema_pycapsule};

use crate::{
    ExcelSheet,
    data::{
        ExcelSheetData, record_batch_from_data_and_columns_with_skip_rows,
        selected_columns_to_schema,
    },
    error::{
        ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, py_errors::IntoPyResult,
    },
    types::{
        dtype::DTypes,
        excelsheet::{SelectedColumns, SheetVisible, SkipRows, column_info::ColumnInfo},
        idx_or_name::IdxOrName,
    },
};

impl TryFrom<&Bound<'_, PyList>> for SelectedColumns {
    type Error = FastExcelError;

    fn try_from(py_list: &Bound<'_, PyList>) -> FastExcelResult<Self> {
        use FastExcelErrorKind::InvalidParameters;

        if py_list.is_empty() {
            Err(InvalidParameters("list of selected columns is empty".to_string()).into())
        } else if let Ok(selection) = py_list.extract::<Vec<IdxOrName>>() {
            Ok(Self::Selection(selection))
        } else {
            Err(
                InvalidParameters(format!("expected list[int] | list[str], got {py_list:?}"))
                    .into(),
            )
        }
    }
}

/// Builds a column selection from an optional, arbitrary Python object.
impl TryFrom<Option<&Bound<'_, PyAny>>> for SelectedColumns {
    type Error = FastExcelError;

    /// Resolution order:
    ///
    /// * `None` selects all columns
    /// * a `str` is parsed through the `FromStr` implementation of `SelectedColumns`
    /// * a `list` goes through the `TryFrom<&Bound<PyList>>` implementation
    /// * any callable object becomes a `DynamicSelection`
    ///
    /// # Errors
    ///
    /// `InvalidParameters` for any other object, or any error raised while parsing the string or
    /// list. Errors are wrapped with a context mentioning the provided object.
    fn try_from(py_any_opt: Option<&Bound<'_, PyAny>>) -> FastExcelResult<Self> {
        // Not trying to downcast to PyNone here as we assume that this would result in
        // py_any_opt being None
        let Some(py_any) = py_any_opt else {
            return Ok(Self::All);
        };

        let selection: FastExcelResult<Self> = if let Ok(py_str) = py_any.extract::<String>() {
            py_str.parse()
        } else if let Ok(py_list) = py_any.cast::<PyList>() {
            py_list.try_into()
        } else if py_any.is_callable() {
            // Extracting a `Py<PyAny>` succeeds for every Python object, which would make the
            // error branch below unreachable and defer the failure to the first call. Checking
            // for callability upfront rejects unsupported objects right away.
            Ok(Self::DynamicSelection(py_any.clone().unbind()))
        } else {
            Err(FastExcelErrorKind::InvalidParameters(format!(
                "unsupported object type {object_type}",
                object_type = py_any.get_type()
            ))
            .into())
        };

        selection.with_context(|| {
            format!("could not determine selected columns from provided object: {py_any}")
        })
    }
}

/// Exposes a sheet visibility to Python as a `str`.
impl<'py> IntoPyObject<'py> for &SheetVisible {
    type Target = PyString;

    type Output = Bound<'py, Self::Target>;

    type Error = FastExcelError;

    /// Maps each variant to its lowercase label (`"visible"`, `"hidden"` or `"veryhidden"`).
    /// This implementation always returns `Ok`.
    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
        let label = match self {
            SheetVisible::VeryHidden => "veryhidden",
            SheetVisible::Hidden => "hidden",
            SheetVisible::Visible => "visible",
        };
        Ok(PyString::new(py, label))
    }
}

impl SkipRows {
    /// Tells whether the row at `row_idx` must be skipped.
    ///
    /// * `Simple(offset)`: rows located before `offset` are skipped
    /// * `List(set)`: rows whose index belongs to the set are skipped
    /// * `SkipEmptyRowsAtBeginning`: nothing is skipped here
    /// * `Callable(func)`: `func(row_idx)` is called and its result is extracted as a `bool`
    ///
    /// # Errors
    ///
    /// `InvalidParameters` if the callable raises, or if its result cannot be extracted as a
    /// `bool`.
    pub(crate) fn should_skip_row(&self, row_idx: usize, py: Python) -> FastExcelResult<bool> {
        // Every variant but `Callable` can be answered without calling into Python
        let callable = match self {
            SkipRows::Simple(offset) => return Ok(row_idx < *offset),
            SkipRows::List(skip_set) => return Ok(skip_set.contains(&row_idx)),
            // This is handled by calamine's FirstNonEmptyRow in the header logic
            // For array creation, we don't need additional filtering
            SkipRows::SkipEmptyRowsAtBeginning => return Ok(false),
            SkipRows::Callable(func) => func,
        };

        let verdict = callable.call1(py, (row_idx,)).map_err(|e| {
            FastExcelErrorKind::InvalidParameters(format!(
                "Error calling skip_rows function for row {row_idx}: {e}"
            ))
        })?;

        verdict.extract::<bool>(py).map_err(|e| {
            FastExcelErrorKind::InvalidParameters(format!(
                "skip_rows callable must return bool, got error: {e}"
            ))
            .into()
        })
    }
}

// A single cell-level error record. It is exposed to Python through `#[pyclass]`, and each of
// its fields is readable from Python thanks to `#[pyo3(get)]`.
#[derive(Debug, Clone)]
#[pyclass(skip_from_py_object)]
pub(crate) struct CellError {
    /// `(int, int)`. The original row and column of the error
    #[pyo3(get)]
    pub position: (usize, usize),
    /// `int`. The row offset
    #[pyo3(get)]
    pub row_offset: usize,
    /// `str`. The error message
    #[pyo3(get)]
    pub detail: String,
}

#[pymethods]
impl CellError {
    // `position` with `row_offset` subtracted from its row. The column is left untouched.
    #[getter]
    pub fn offset_position(&self) -> (usize, usize) {
        (self.position.0 - self.row_offset, self.position.1)
    }

    // Renders every attribute of the error, including the derived `offset_position`.
    pub fn __repr__(&self) -> String {
        let Self {
            position: (row, col),
            row_offset,
            detail,
        } = self;
        let (offset_row, offset_col) = self.offset_position();
        format!(
            "CellError(position=({row}, {col}), offset_position=({offset_row}, {offset_col}), row_offset={row_offset}, detail={detail:?})"
        )
    }
}

// A list of `CellError`s, exposed to Python through `#[pyclass]`.
#[pyclass]
pub(crate) struct CellErrors {
    // The collected cell errors
    pub errors: Vec<CellError>,
}

#[pymethods]
impl CellErrors {
    #[getter]
    pub fn errors<'p>(&'p self, _py: Python<'p>) -> Vec<CellError> {
        self.errors.clone()
    }

    pub fn __repr__(&self) -> String {
        let errors_repr: Vec<String> = self.errors.iter().map(|e| e.__repr__()).collect();
        format!("CellErrors(errors=[{}])", errors_repr.join(", "))
    }
}

/// Extraction of the `skip_rows` parameter from a Python object.
impl<'a, 'py> FromPyObject<'a, 'py> for SkipRows {
    type Error = PyErr;

    /// The candidates are tried in the following order, the first match wins:
    ///
    /// 1. `None` -> `SkipEmptyRowsAtBeginning`
    /// 2. an integer -> `Simple`
    /// 3. a sequence of integers -> `List` (stored as a set)
    /// 4. an object with a `__call__` attribute -> `Callable`
    ///
    /// # Errors
    ///
    /// An `InvalidParameters` error converted to a `PyErr` if nothing matches.
    fn extract(obj: Borrowed<'a, 'py, PyAny>) -> Result<Self, Self::Error> {
        if obj.is_none() {
            Ok(SkipRows::SkipEmptyRowsAtBeginning)
        } else if let Ok(skip_count) = obj.extract::<usize>() {
            Ok(SkipRows::Simple(skip_count))
        } else if let Ok(skip_list) = obj.extract::<Vec<usize>>() {
            Ok(SkipRows::List(skip_list.into_iter().collect::<HashSet<usize>>()))
        } else if obj.hasattr("__call__").unwrap_or(false) {
            // A failing attribute lookup is treated like a missing attribute
            Ok(SkipRows::Callable(Arc::new(obj.to_owned().into())))
        } else {
            Err(FastExcelErrorKind::InvalidParameters(
                "skip_rows must be int, list of int, callable, or None".to_string(),
            )
            .into())
            .into_pyresult()
        }
    }
}

/// Conversion of a sheet into an arrow `RecordBatch`.
impl TryFrom<&ExcelSheet> for RecordBatch {
    type Error = FastExcelError;

    /// Builds the batch from the selected columns of the sheet, between its offset and limit,
    /// honouring its `skip_rows` and `whitespace_as_null` settings.
    ///
    /// # Errors
    ///
    /// Any error of the underlying builder, with a context mentioning the sheet name.
    fn try_from(sheet: &ExcelSheet) -> FastExcelResult<Self> {
        let columns = &sheet.selected_columns;
        let (offset, limit) = (sheet.offset(), sheet.limit());
        let skip_rows = sheet.pagination.skip_rows();
        let whitespace_as_null = sheet.opts.whitespace_as_null;

        // Both variants need their own call: the two arms bind `range` separately, so they
        // cannot be merged into a single one
        let batch = match &sheet.data {
            ExcelSheetData::Owned(range) => record_batch_from_data_and_columns_with_skip_rows(
                columns,
                range,
                skip_rows,
                offset,
                limit,
                whitespace_as_null,
            ),
            ExcelSheetData::Ref(range) => record_batch_from_data_and_columns_with_skip_rows(
                columns,
                range,
                skip_rows,
                offset,
                limit,
                whitespace_as_null,
            ),
        };

        batch.with_context(|| format!("could not convert sheet {} to RecordBatch", sheet.name()))
    }
}

// NOTE: These proxy Python implementations are required because `#[getter]` does not play well
// with `cfg_attr`:
// * https://github.com/PyO3/pyo3/issues/1003
// * https://github.com/PyO3/pyo3/issues/780
#[pymethods]
impl ExcelSheet {
    // Python `width` property, forwards to `ExcelSheet::width`.
    #[getter("width")]
    pub fn py_width(&mut self) -> usize {
        self.width()
    }

    // Python `height` property, forwards to `ExcelSheet::height`.
    #[getter("height")]
    pub fn py_height(&mut self) -> usize {
        self.height()
    }

    // Python `total_height` property, forwards to `ExcelSheet::total_height`.
    #[getter("total_height")]
    pub fn py_total_height(&mut self) -> usize {
        self.total_height()
    }

    // Python `offset` property, forwards to `ExcelSheet::offset`.
    #[getter("offset")]
    pub fn py_offset(&self) -> usize {
        self.offset()
    }

    // Python `selected_columns` property. Returns an owned copy of the selected columns.
    #[getter("selected_columns")]
    pub fn py_selected_columns(&self) -> Vec<ColumnInfo> {
        self.selected_columns().to_owned()
    }

    // Python `available_columns()` method (not a property), forwards to
    // `ExcelSheet::available_columns`, which may fail.
    #[pyo3(name = "available_columns")]
    pub fn py_available_columns(&mut self) -> FastExcelResult<Vec<ColumnInfo>> {
        self.available_columns()
    }

    // Python `specified_dtypes` property, `None` when `ExcelSheet::specified_dtypes` is `None`.
    #[getter("specified_dtypes")]
    pub fn py_specified_dtypes(&self) -> Option<&DTypes> {
        self.specified_dtypes()
    }

    // Python `name` property, forwards to `ExcelSheet::name`.
    #[getter("name")]
    pub fn py_name(&self) -> &str {
        self.name()
    }

    // Python `visible` property: the sheet visibility converted to a Python `str` through the
    // `IntoPyObject` implementation of `&SheetVisible`.
    #[getter("visible")]
    pub fn py_visible<'py>(&'py self, py: Python<'py>) -> FastExcelResult<Bound<'py, PyString>> {
        let visible: SheetVisible = self.visible();
        (&visible).into_pyobject(py)
    }

    // Converts the sheet to a pyarrow object. Only compiled with the `pyarrow` feature.
    //
    // The `RecordBatch` is built inside `py.detach`, then converted with `to_pyarrow`.
    #[cfg(feature = "pyarrow")]
    pub fn to_arrow<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
        use pyo3::IntoPyObjectExt;

        use crate::error::py_errors::IntoPyResult;

        py.detach(|| RecordBatch::try_from(self))
            .with_context(|| {
                format!(
                    "could not create RecordBatch from sheet \"{}\"",
                    self.name()
                )
            })
            .and_then(|rb| {
                use arrow_pyarrow::ToPyArrow;

                rb.to_pyarrow(py)
                    .map_err(|err| FastExcelErrorKind::ArrowError(err.to_string()).into())
            })
            // NOTE(review): this context sits after `and_then`, so it is also attached to
            // errors coming from the `RecordBatch` creation step above
            .with_context(|| {
                format!(
                    "could not convert RecordBatch to pyarrow for sheet \"{}\"",
                    self.name()
                )
            })
            .into_pyresult()
            .and_then(|obj| obj.into_bound_py_any(py))
    }

    // Converts the sheet to a Python tuple made of a pyarrow object and the errors returned by
    // `record_batch_from_data_and_columns_with_errors`. Only compiled with the `pyarrow` feature.
    //
    // NOTE(review): unlike `RecordBatch::try_from(&ExcelSheet)`, no `skip_rows` is passed to the
    // builder here -- confirm that this is intended
    #[cfg(feature = "pyarrow")]
    pub fn to_arrow_with_errors<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
        use arrow_pyarrow::IntoPyArrow;
        use pyo3::IntoPyObjectExt;

        use crate::data::record_batch_from_data_and_columns_with_errors;

        let offset = self.offset();
        let limit = self.limit();

        // The batch is built inside `py.detach`, like in `to_arrow`
        let (rb, errors) = py
            .detach(|| {
                record_batch_from_data_and_columns_with_errors(
                    &self.selected_columns,
                    self.data(),
                    offset,
                    limit,
                    self.opts.whitespace_as_null,
                )
            })
            .with_context(|| {
                format!(
                    "could not create RecordBatch from sheet \"{}\"",
                    self.name()
                )
            })?;

        let rb = rb
            .into_pyarrow(py)
            .map_err(|err| FastExcelErrorKind::ArrowError(err.to_string()).into())
            .with_context(|| {
                format!(
                    "could not convert RecordBatch to pyarrow for sheet \"{}\"",
                    self.name()
                )
            })?;
        (rb, errors).into_bound_py_any(py)
    }

    /// Export the schema as an [`ArrowSchema`] [`PyCapsule`].
    ///
    /// <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export>
    ///
    /// [`ArrowSchema`]: arrow_array::ffi::FFI_ArrowSchema
    /// [`PyCapsule`]: pyo3::types::PyCapsule
    pub fn __arrow_c_schema__<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyCapsule>> {
        // The schema only depends on the selected columns, no data is converted here
        let schema = selected_columns_to_schema(&self.selected_columns);
        Ok(to_schema_pycapsule(py, &schema)?)
    }

    /// Export the schema and data as a pair of [`ArrowSchema`] and [`ArrowArray`] [`PyCapsules`]
    ///
    /// The optional `requested_schema` parameter allows for potential schema conversion.
    ///
    /// <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export>
    ///
    /// [`ArrowSchema`]: arrow_array::ffi::FFI_ArrowSchema
    /// [`ArrowArray`]: arrow_array::ffi::FFI_ArrowArray
    /// [`PyCapsules`]: pyo3::types::PyCapsule
    pub fn __arrow_c_array__<'py>(
        &self,
        py: Python<'py>,
        requested_schema: Option<Bound<'py, PyCapsule>>,
    ) -> PyResult<Bound<'py, PyTuple>> {
        let record_batch = RecordBatch::try_from(self)
            .with_context(|| {
                format!(
                    "could not create RecordBatch from sheet \"{}\"",
                    self.name()
                )
            })
            .into_pyresult()?;

        // The whole batch is exported as a single struct array, described by an unnamed,
        // non-nullable struct field made of the batch's fields
        let field = Field::new_struct("", record_batch.schema_ref().fields().clone(), false);
        let array = Arc::new(StructArray::from(record_batch));
        Ok(to_array_pycapsules(
            py,
            field.into(),
            array.as_ref(),
            requested_schema,
        )?)
    }

    // Renders as `ExcelSheet<name>`.
    pub fn __repr__(&self) -> String {
        format!("ExcelSheet<{}>", self.name())
    }
}


================================================
FILE: src/types/excelsheet/table.rs
================================================
use crate::error::{FastExcelErrorKind, FastExcelResult};
use calamine::{Data, Sheets, Table};
use std::io::{Read, Seek};

/// Lists the names of the tables found in a workbook.
///
/// When `sheet_name` is provided, only the tables of that sheet are returned. Otherwise, all
/// table names are returned.
///
/// # Errors
///
/// * An `Internal` error if the workbook is not an XLSX file
/// * Any error raised while loading the tables
pub(crate) fn extract_table_names<'a, RS: Read + Seek>(
    sheets: &'a mut Sheets<RS>,
    sheet_name: Option<&str>,
) -> FastExcelResult<Vec<&'a String>> {
    let Sheets::Xlsx(xlsx) = sheets else {
        return Err(FastExcelErrorKind::Internal(
            "Currently only XLSX files are supported for tables".to_string(),
        )
        .into());
    };

    // Internally checks if tables already loaded; is fast
    xlsx.load_tables()?;

    let names = match sheet_name {
        Some(sn) => xlsx.table_names_in_sheet(sn),
        None => xlsx.table_names(),
    };
    Ok(names)
}

/// Loads the table called `name` from a workbook.
///
/// # Errors
///
/// * An `Internal` error if the workbook is not an XLSX file
/// * Any error raised while loading the tables or while looking up the table by its name
pub(crate) fn extract_table_range<RS: Read + Seek>(
    name: &str,
    sheets: &mut Sheets<RS>,
) -> FastExcelResult<Table<Data>> {
    let Sheets::Xlsx(xlsx) = sheets else {
        return Err(FastExcelErrorKind::Internal(
            "Currently only XLSX files are supported for tables".to_string(),
        )
        .into());
    };

    // Internally checks if tables already loaded; is fast
    xlsx.load_tables()?;

    Ok(xlsx.table_by_name(name)?)
}


================================================
FILE: src/types/exceltable/mod.rs
================================================
#[cfg(feature = "python")]
mod python;

use calamine::{Data, Range, Table};
#[cfg(feature = "polars")]
use polars_core::frame::DataFrame;
#[cfg(feature = "python")]
use pyo3::pyclass;

use crate::{
    FastExcelColumn, FastExcelErrorKind, IdxOrName, LoadSheetOrTableOptions, SelectedColumns,
    data::height_without_tail_whitespace,
    error::{ErrorContext, FastExcelResult},
    types::{
        dtype::DTypes,
        excelsheet::{
            Header, Pagination,
            column_info::{
                AvailableColumns, ColumnInfo, build_available_columns_info, finalize_column_info,
            },
            deferred_selection_to_concrete,
        },
    },
    utils::schema::get_schema_sample_rows,
};

/// A single table in an Excel file.
#[derive(Debug)]
#[cfg_attr(feature = "python", pyclass(name = "_ExcelTable"))]
pub struct ExcelTable {
    // Name of the table, copied from `table.name()` on creation
    name: String,
    // Name of the sheet containing the table, copied from `table.sheet_name()` on creation
    sheet_name: String,
    // Finalized info for the selected columns, computed in `try_new`
    selected_columns: Vec<ColumnInfo>,
    // Info for all available columns. Starts as `Pending` and is only loaded on first access
    available_columns: AvailableColumns,
    // The underlying calamine table, owning the data
    table: Table<Data>,
    // Where the column names come from (a row of the data or a provided list of names)
    header: Header,
    // Built in `try_new` from the `skip_rows` and `n_rows` options
    pagination: Pagination,
    // The options the table was loaded with. The column selection might have been rewritten
    // to an index-based one while building the header
    opts: LoadSheetOrTableOptions,
    // Cached result of `height()`, `None` until the first call
    height: Option<usize>,
    // Cached result of `total_height()`, `None` until the first call
    total_height: Option<usize>,
    // Cached result of `width()`, `None` until the first call
    width: Option<usize>,
    // Exclusive upper row bound used when reading data, computed once in `try_new`
    limit: usize,
}

impl ExcelTable {
    /// Converts a column selection to an index-based one and extracts the names of the selected
    /// table columns.
    ///
    /// Names are looked up in the table's columns, and the index found is shifted by the column
    /// at which the table's data range starts (`0` if the range has no start).
    ///
    /// NOTE(review): index-based elements are used as provided while name-based elements are
    /// shifted by the column offset -- confirm that provided indices are expected to be absolute.
    ///
    /// Returns the names of the table columns whose shifted index is part of the selection (in
    /// table order), along with the index-based selection (in selection order).
    ///
    /// # Errors
    ///
    /// `ColumnNotFound` if a name does not match any table column. The error context lists the
    /// available columns.
    fn extract_selected_columns_and_table_columns(
        table: &Table<Data>,
        selected_columns: &[IdxOrName],
    ) -> FastExcelResult<(Vec<String>, Vec<IdxOrName>)> {
        let table_columns: Vec<String> = table.columns().into();
        let column_offset = table.data().start().map_or(0, |(_row, col)| col as usize);
        let selected_column_indices = selected_columns
            .iter()
            .map(|idx_or_name| match idx_or_name {
                IdxOrName::Idx(idx) => Ok(*idx),
                IdxOrName::Name(name) => table_columns
                    .iter()
                    .enumerate()
                    .find_map(|(idx, col_name)| {
                        (col_name.as_str() == name.as_str()).then_some(idx + column_offset)
                    })
                    .ok_or_else(|| FastExcelErrorKind::ColumnNotFound(name.clone().into()).into())
                    .with_context(|| format!("available columns are: {table_columns:?}")),
            })
            .collect::<FastExcelResult<Vec<usize>>>()?;

        // Only keeping the names of the columns that are part of the selection
        let table_columns = table_columns
            .into_iter()
            .enumerate()
            .filter_map(|(idx, col_name)| {
                selected_column_indices
                    .contains(&(idx + column_offset))
                    .then_some(col_name)
            })
            .collect();

        let selected_columns = selected_column_indices
            .into_iter()
            .map(Into::into)
            .collect();

        Ok((table_columns, selected_columns))
    }

    /// Builds a `Header` for a table. This might update the column selection, if provided
    ///
    /// * Without custom column names nor header row, the header is made of the table's own
    ///   column names. An explicit or deferred selection is rewritten to an index-based one and
    ///   the header is restricted to the selected columns.
    /// * With a header row only, the header is read at that row.
    /// * With custom column names, these names are the header, whatever the header row is.
    fn build_header_and_update_selection(
        table: &Table<Data>,
        opts: LoadSheetOrTableOptions,
    ) -> FastExcelResult<(Header, LoadSheetOrTableOptions)> {
        Ok(match (&opts.column_names, opts.header_row) {
            (None, None) => {
                // If there is a column selection, we need to convert all elements to column
                // indices. This is required because we will be providing the header, and it
                // is required to use an index-based selection when custom column names are
                // provided
                match &opts.selected_columns {
                    SelectedColumns::Selection(selected_columns) => {
                        let (table_columns, selected_columns) =
                            Self::extract_selected_columns_and_table_columns(
                                table,
                                selected_columns,
                            )?;
                        let opts =
                            opts.selected_columns(SelectedColumns::Selection(selected_columns));
                        (Header::With(table_columns), opts)
                    }
                    SelectedColumns::DeferredSelection(deferred_selection) => {
                        // The deferred selection is resolved against the last column of the
                        // table's data range (`0` if the range has no end)
                        let concrete_columns = deferred_selection_to_concrete(
                            deferred_selection,
                            table.data().end().map_or(0, |(_row, col)| col as usize),
                        );
                        let (table_columns, selected_columns) =
                            Self::extract_selected_columns_and_table_columns(
                                table,
                                &concrete_columns,
                            )?;
                        let opts =
                            opts.selected_columns(SelectedColumns::Selection(selected_columns));
                        (Header::With(table_columns), opts)
                    }
                    _ => (Header::With(table.columns().into()), opts),
                }
            }
            (None, Some(row)) => (Header::At(row), opts),
            (Some(column_names), _) => (Header::With(column_names.clone()), opts),
        })
    }

    /// Creates an `ExcelTable` from a calamine table and loading options.
    ///
    /// This builds the pagination and header, resolves the column selection, computes the row
    /// limit and finalizes the info of the selected columns, based on a sample of rows
    /// determined by `get_schema_sample_rows`.
    ///
    /// # Errors
    ///
    /// Any error raised while building the pagination, the header, or the column info.
    pub(crate) fn try_new(
        table: Table<Data>,
        opts: LoadSheetOrTableOptions,
    ) -> FastExcelResult<Self> {
        let pagination = Pagination::try_new(opts.skip_rows.clone(), opts.n_rows, table.data())?;

        let (header, opts) = Self::build_header_and_update_selection(&table, opts)?;

        let available_columns_info =
            build_available_columns_info(table.data(), &opts.selected_columns, &header)?;
        let selected_columns_info = opts
            .selected_columns
            .select_columns(available_columns_info)?;

        let mut excel_table = ExcelTable {
            name: table.name().to_owned(),
            sheet_name: table.sheet_name().to_owned(),
            available_columns: AvailableColumns::Pending,
            // Empty vec as it'll be replaced
            selected_columns: Vec::with_capacity(0),
            table,
            header,
            pagination,
            opts,
            height: None,
            total_height: None,
            width: None,
            // Will be replaced
            limit: 0,
        };
        // The limit depends on the header, pagination and options, which are easier to access
        // once the struct exists
        excel_table.limit = excel_table.compute_limit();

        let row_limit = get_schema_sample_rows(
            excel_table.opts.schema_sample_rows,
            excel_table.offset(),
            excel_table.limit(),
        );

        // Finalizing column info: figuring out the dtype of every selected column
        let selected_columns = finalize_column_info(
            selected_columns_info,
            excel_table.data(),
            excel_table.offset(),
            row_limit,
            excel_table.opts.dtypes.as_ref(),
            &excel_table.opts.dtype_coercion,
            excel_table.opts.whitespace_as_null,
        )?;

        excel_table.selected_columns = selected_columns;

        Ok(excel_table)
    }

    /// The data range of the underlying table.
    pub(crate) fn data(&self) -> &Range<Data> {
        self.table.data()
    }

    /// Loads the info of all available columns if it has not been loaded yet. This is a no-op
    /// once the columns are loaded.
    fn ensure_available_columns_loaded(&mut self) -> FastExcelResult<()> {
        let available_columns = match &self.available_columns {
            AvailableColumns::Pending => {
                let available_columns_info = build_available_columns_info(
                    self.table.data(),
                    &self.opts.selected_columns,
                    &self.header,
                )?;
                let final_info = finalize_column_info(
                    available_columns_info,
                    self.data(),
                    self.offset(),
                    self.limit(),
                    self.opts.dtypes.as_ref(),
                    &self.opts.dtype_coercion,
                    self.opts.whitespace_as_null,
                )?;
                AvailableColumns::Loaded(final_info)
            }
            AvailableColumns::Loaded(_) => return Ok(()),
        };

        self.available_columns = available_columns;
        Ok(())
    }

    /// Loads the available columns if needed and returns them as a slice.
    fn load_available_columns(&mut self) -> FastExcelResult<&[ColumnInfo]> {
        self.ensure_available_columns_loaded()?;
        self.available_columns.as_loaded()
    }

    /// Index of the first data row: the header offset plus the pagination offset.
    pub fn offset(&self) -> usize {
        self.header.offset() + self.pagination.offset()
    }

    /// Computes the exclusive upper row bound for reading data.
    ///
    /// The bound is the height of the data, or the height without trailing whitespace rows if
    /// `skip_whitespace_tail_rows` is set. When `n_rows` is defined, `offset + n_rows` is used
    /// instead if it is lower than that bound.
    fn compute_limit(&self) -> usize {
        let upper_bound = if self.opts.skip_whitespace_tail_rows {
            height_without_tail_whitespace(self.data()).unwrap_or_else(|| self.data().height())
        } else {
            self.data().height()
        };
        if let Some(n_rows) = self.pagination.n_rows() {
            let limit = self.offset() + n_rows;
            if limit < upper_bound {
                return limit;
            }
        }
        upper_bound
    }

    /// The exclusive upper row bound computed on creation.
    pub fn limit(&self) -> usize {
        self.limit
    }

    /// A copy of the info of the selected columns.
    pub fn selected_columns(&self) -> Vec<ColumnInfo> {
        self.selected_columns.clone()
    }

    /// A copy of the info of all available columns. They are loaded on the first call.
    pub fn available_columns(&mut self) -> FastExcelResult<Vec<ColumnInfo>> {
        self.load_available_columns().map(|cols| cols.to_vec())
    }

    /// The dtypes specified in the loading options, if any.
    pub fn specified_dtypes(&self) -> Option<&DTypes> {
        self.opts.dtypes.as_ref()
    }

    /// Width of the table's data range. Computed on the first call, then cached.
    pub fn width(&mut self) -> usize {
        self.width.unwrap_or_else(|| {
            let width = self.data().width();
            self.width = Some(width);
            width
        })
    }

    /// Number of rows between the offset and the limit. Computed on the first call, then cached.
    ///
    /// The height is `0` if the offset is located at or beyond the limit.
    pub fn height(&mut self) -> usize {
        self.height.unwrap_or_else(|| {
            // The limit is capped by the data height whereas the offset is not, so a plain
            // subtraction could underflow (and wrap around in release builds)
            let height = self.limit().saturating_sub(self.offset());
            self.height = Some(height);
            height
        })
    }

    /// Number of rows of the data range located after the header. Computed on the first call,
    /// then cached.
    ///
    /// The total height is `0` if the header offset is beyond the data height.
    pub fn total_height(&mut self) -> usize {
        self.total_height.unwrap_or_else(|| {
            // Saturating for the same reason as in `height`
            let total_height = self.data().height().saturating_sub(self.header.offset());
            self.total_height = Some(total_height);
            total_height
        })
    }

    /// Name of the table.
    pub fn name(&self) -> &str {
        &self.name
    }

    /// Name of the sheet containing the table.
    pub fn sheet_name(&self) -> &str {
        &self.sheet_name
    }

    /// Reads every selected column between the offset and the limit.
    ///
    /// # Errors
    ///
    /// The first error encountered while building a column.
    pub fn to_columns(&self) -> FastExcelResult<Vec<FastExcelColumn>> {
        self.selected_columns
            .iter()
            .map(|column_info| {
                FastExcelColumn::try_from_column_info(
                    column_info,
                    self.table.data(),
                    self.offset(),
                    self.limit(),
                    self.opts.whitespace_as_null,
                )
            })
            .collect()
    }

    /// Converts the selected columns to a polars `DataFrame`.
    ///
    /// # Errors
    ///
    /// Any error of `to_columns`, or an `Internal` error if the `DataFrame` cannot be created.
    #[cfg(feature = "polars")]
    pub fn to_polars(&self) -> FastExcelResult<DataFrame> {
        use crate::error::FastExcelErrorKind;

        let pl_columns = self.to_columns()?.into_iter().map(Into::into).collect();
        DataFrame::new_infer_height(pl_columns).map_err(|err| {
            FastExcelErrorKind::Internal(format!("could not create DataFrame: {err:?}")).into()
        })
    }
}


================================================
FILE: src/types/exceltable/python.rs
================================================
use std::sync::Arc;

use arrow_array::{RecordBatch, StructArray};
use arrow_schema::Field;
#[cfg(feature = "pyarrow")]
use pyo3::PyAny;
use pyo3::{
    Bound, PyResult, Python, pymethods,
    types::{PyCapsule, PyTuple},
};
use pyo3_arrow::ffi::{to_array_pycapsules, to_schema_pycapsule};

use crate::{
    ExcelTable,
    data::{record_batch_from_data_and_columns_with_skip_rows, selected_columns_to_schema},
    error::{ErrorContext, FastExcelError, FastExcelResult, py_errors::IntoPyResult},
    types::{dtype::DTypes, excelsheet::column_info::ColumnInfo},
};

/// Conversion of a table into an arrow `RecordBatch`.
impl TryFrom<&ExcelTable> for RecordBatch {
    type Error = FastExcelError;

    /// Builds the batch from the selected columns of the table, between its offset and limit,
    /// honouring its `skip_rows` and `whitespace_as_null` settings.
    ///
    /// # Errors
    ///
    /// Any error of the underlying builder, with a context mentioning the table and sheet names.
    fn try_from(table: &ExcelTable) -> FastExcelResult<Self> {
        let batch = record_batch_from_data_and_columns_with_skip_rows(
            &table.selected_columns,
            table.data(),
            table.pagination.skip_rows(),
            table.offset(),
            table.limit(),
            table.opts.whitespace_as_null,
        );

        batch.with_context(|| {
            format!(
                "could not convert table {table} in sheet {sheet} to RecordBatch",
                table = table.name(),
                sheet = table.sheet_name()
            )
        })
    }
}

// NOTE: These proxy Python implementations are required because `#[getter]` does not play well
// with `cfg_attr`:
// * https://github.com/PyO3/pyo3/issues/1003
// * https://github.com/PyO3/pyo3/issues/780
#[pymethods]
impl ExcelTable {
    // Python `name` property: the name of the table.
    #[getter("name")]
    pub fn py_name(&self) -> &str {
        &self.name
    }

    // Python `sheet_name` property: the name of the sheet containing the table.
    #[getter("sheet_name")]
    pub fn py_sheet_name(&self) -> &str {
        &self.sheet_name
    }

    // Python `offset` property, forwards to `ExcelTable::offset`.
    #[getter("offset")]
    pub fn py_offset(&self) -> usize {
        self.offset()
    }

    // Python `limit` property, forwards to `ExcelTable::limit`.
    #[getter("limit")]
    pub fn py_limit(&self) -> usize {
        self.limit()
    }

    // Python `selected_columns` property, forwards to `ExcelTable::selected_columns`.
    #[getter("selected_columns")]
    pub fn py_selected_columns(&self) -> Vec<ColumnInfo> {
        self.selected_columns()
    }

    // Python `available_columns()` method (not a property), forwards to
    // `ExcelTable::available_columns`, which may fail.
    #[pyo3(name = "available_columns")]
    pub fn py_available_columns(&mut self) -> FastExcelResult<Vec<ColumnInfo>> {
        self.available_columns()
    }

    // Python `specified_dtypes` property, `None` when no dtypes were specified.
    #[getter("specified_dtypes")]
    pub fn py_specified_dtypes(&self) -> Option<&DTypes> {
        self.specified_dtypes()
    }

    // Python `width` property, forwards to `ExcelTable::width`.
    #[getter("width")]
    pub fn py_width(&mut self) -> usize {
        self.width()
    }

    // Python `height` property, forwards to `ExcelTable::height`.
    #[getter("height")]
    pub fn py_height(&mut self) -> usize {
        self.height()
    }

    // Python `total_height` property, forwards to `ExcelTable::total_height`.
    #[getter("total_height")]
    pub fn py_total_height(&mut self) -> usize {
        self.total_height()
    }

    // Converts the table to a pyarrow object. Only compiled with the `pyarrow` feature.
    //
    // The table is first converted to a `RecordBatch`, which is then converted with
    // `to_pyarrow`.
    #[cfg(feature = "pyarrow")]
    pub fn to_arrow<'py>(&self, py: Python<'py>) -> FastExcelResult<Bound<'py, PyAny>> {
        RecordBatch::try_from(self)
            .with_context(|| {
                // `self.name` is the name of the table, not of a sheet: the message says
                // "table", like the one used in `__arrow_c_array__`
                format!(
                    "could not create RecordBatch from table \"{}\"",
                    self.name
                )
            })
            .and_then(|rb| {
                use arrow_pyarrow::ToPyArrow;

                use crate::error::FastExcelErrorKind;

                rb.to_pyarrow(py)
                    .map_err(|err| FastExcelErrorKind::ArrowError(err.to_string()).into())
            })
            .with_context(|| {
                format!(
                    "could not convert RecordBatch to pyarrow for table \"{table}\" in sheet \"{sheet}\"",
                    table = self.name, sheet = self.sheet_name
                )
            })
    }

    /// Export the schema as an [`ArrowSchema`] [`PyCapsule`].
    ///
    /// <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export>
    ///
    /// [`ArrowSchema`]: arrow_array::ffi::FFI_ArrowSchema
    /// [`PyCapsule`]: pyo3::types::PyCapsule
    pub fn __arrow_c_schema__<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyCapsule>> {
        // The schema only depends on the selected columns, no data is converted here
        let schema = selected_columns_to_schema(&self.selected_columns);
        Ok(to_schema_pycapsule(py, &schema)?)
    }

    /// Export the schema and data as a pair of [`ArrowSchema`] and [`ArrowArray`] [`PyCapsules`]
    ///
    /// The optional `requested_schema` parameter allows for potential schema conversion.
    ///
    /// <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export>
    ///
    /// [`ArrowSchema`]: arrow_array::ffi::FFI_ArrowSchema
    /// [`ArrowArray`]: arrow_array::ffi::FFI_ArrowArray
    /// [`PyCapsules`]: pyo3::types::PyCapsule
    pub fn __arrow_c_array__<'py>(
        &self,
        py: Python<'py>,
        requested_schema: Option<Bound<'py, PyCapsule>>,
    ) -> PyResult<Bound<'py, PyTuple>> {
        let record_batch = RecordBatch::try_from(self)
            .with_context(|| format!("could not create RecordBatch from table \"{}\"", self.name))
            .into_pyresult()?;

        // The whole batch is exported as a single struct array, described by an unnamed,
        // non-nullable struct field made of the batch's fields
        let field = Field::new_struct("", record_batch.schema_ref().fields().clone(), false);
        let array = Arc::new(StructArray::from(record_batch));
        Ok(to_array_pycapsules(
            py,
            field.into(),
            array.as_ref(),
            requested_schema,
        )?)
    }

    // Renders as `ExcelTable<sheet_name/table_name>`.
    pub fn __repr__(&self) -> String {
        format!(
            "ExcelTable<{sheet}/{name}>",
            sheet = self.sheet_name,
            name = self.name
        )
    }
}


================================================
FILE: src/types/idx_or_name/mod.rs
================================================
#[cfg(feature = "python")]
mod python;

/// A column index or name.
///
/// The `Eq` and `Hash` derives make it usable as a map key.
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub enum IdxOrName {
    /// Designates the target by its index.
    Idx(usize),
    /// Designates the target by its name.
    Name(String),
}

impl IdxOrName {
    pub(crate) fn format_message(&self) -> String {
        match self {
            Self::Idx(idx) => format!("at index {idx}"),
            Self::Name(name) => format!("with name \"{name}\""),
        }
    }
}

impl From<usize> for IdxOrName {
    fn from(index: usize) -> Self {
        Self::Idx(index)
    }
}

impl From<String> for IdxOrName {
    fn from(name: String) -> Self {
        Self::Name(name)
    }
}

impl From<&str> for IdxOrName {
    fn from(name: &str) -> Self {
        Self::Name(name.to_owned())
    }
}


================================================
FILE: src/types/idx_or_name/python.rs
================================================
use pyo3::{
    Borrowed, Bound, FromPyObject, IntoPyObject, IntoPyObjectExt, PyAny, PyErr, Python,
    types::PyAnyMethods,
};

use crate::{
    error::{FastExcelError, FastExcelErrorKind, FastExcelResult, py_errors::IntoPyResult},
    types::idx_or_name::IdxOrName,
};

impl TryFrom<&Bound<'_, PyAny>> for IdxOrName {
    type Error = FastExcelError;

    /// Tries to read the Python object as an index first, then as a name; anything else is
    /// rejected with an `InvalidParameters` error.
    fn try_from(value: &Bound<'_, PyAny>) -> FastExcelResult<Self> {
        // The index extraction is attempted before the name extraction.
        if let Ok(position) = value.extract::<usize>() {
            return Ok(Self::Idx(position));
        }
        if let Ok(label) = value.extract::<String>() {
            return Ok(Self::Name(label));
        }

        let message = format!("cannot create IdxOrName from {value:?}");
        Err(FastExcelErrorKind::InvalidParameters(message).into())
    }
}

impl<'a, 'py> FromPyObject<'a, 'py> for IdxOrName {
    type Error = PyErr;
    fn extract(ob: Borrowed<'a, 'py, PyAny>) -> Result<Self, Self::Error> {
        (&*ob).try_into().into_pyresult()
    }
}

impl<'py> IntoPyObject<'py> for IdxOrName {
    type Target = PyAny;

    type Output = Bound<'py, Self::Target>;

    type Error = pyo3::PyErr;

    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
        match self {
            IdxOrName::Idx(idx) => idx.into_bound_py_any(py),
            IdxOrName::Name(name) => name.into_bound_py_any(py),
        }
    }
}

impl<'py> IntoPyObject<'py> for &IdxOrName {
    type Target = PyAny;

    type Output = Bound<'py, Self::Target>;

    type Error = pyo3::PyErr;

    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
        match self {
            IdxOrName::Idx(idx) => idx.into_bound_py_any(py),
            IdxOrName::Name(name) => name.into_bound_py_any(py),
        }
    }
}


================================================
FILE: src/types/mod.rs
================================================
pub(crate) mod dtype;
pub(crate) mod excelreader;
pub(crate) mod excelsheet;
pub(crate) mod exceltable;
pub(crate) mod idx_or_name;

pub use dtype::{DType, DTypeCoercion, DTypes};
pub use excelreader::{DefinedName, ExcelReader, LoadSheetOrTableOptions};
pub use excelsheet::{
    ExcelSheet, SelectedColumns, SheetVisible, SkipRows,
    column_info::{ColumnInfo, ColumnNameFrom, DTypeFrom},
};
pub use exceltable::ExcelTable;
pub use idx_or_name::IdxOrName;


================================================
FILE: src/utils/mod.rs
================================================
pub(crate) mod schema;


================================================
FILE: src/utils/schema.rs
================================================
use std::cmp::min;

/// Determines how many rows should be used for schema sampling, based on the provided parameter,
/// and the sheet's offset and limit.
///
/// The result is `offset + sample_rows` (or `offset + limit` when `sample_rows` is `None`),
/// capped at `limit`. The addition saturates, so arbitrarily large inputs resolve to `limit`
/// instead of overflowing.
///
/// Note that here, the limit should be retrieved from the sheet's `limit()` method, and must not
/// be out of the sheet's bounds.
pub(crate) fn get_schema_sample_rows(
    sample_rows: Option<usize>,
    offset: usize,
    limit: usize,
) -> usize {
    // Checking how many rows we want to use to determine the dtype for a column. If sample_rows is
    // not provided, we sample limit rows, i.e. the entire column.
    //
    // A plain `+` would panic in debug builds and wrap around in release builds for very large
    // values (e.g. `Some(usize::MAX)`), producing a bogus, tiny sample size.
    let sample_rows = offset.saturating_add(sample_rows.unwrap_or(limit));
    // If sample_rows is higher than the sheet's limit, use the limit instead
    min(sample_rows, limit)
}

#[cfg(feature = "__pyo3-tests")]
#[cfg(test)]
mod tests {
    use super::get_schema_sample_rows;
    use pretty_assertions::assert_eq;
    use rstest::rstest;

    // Each case is (sample_rows, offset, limit, expected number of sampled rows).
    #[rstest]
    // 1000 sampling size, 50 rows sheet: capped at 50
    #[case(Some(1000), 0, 50, 50)]
    // 1000 sampling size, 5000 rows sheet: 1000
    #[case(Some(1000), 0, 5000, 1000)]
    // 1000 sampling size, 1500 rows sheet, offset of 1000: capped at 1500
    #[case(Some(1000), 1000, 1500, 1500)]
    // 100 sampling size, 1500 rows sheet, offset of 1000: 1100
    #[case(Some(100), 1000, 1500, 1100)]
    // no sampling size, 50 rows sheet: 50
    #[case(None, 0, 50, 50)]
    // no sampling size, 5000 rows sheet: 5000
    #[case(None, 0, 5000, 5000)]
    // no sampling size, 1500 rows sheet, offset of 1000: capped at 1500
    #[case(None, 1000, 1500, 1500)]
    fn test_get_schema_sample_rows_return_values(
        #[case] sample_rows: Option<usize>,
        #[case] offset: usize,
        #[case] limit: usize,
        #[case] expected: usize,
    ) {
        let actual = get_schema_sample_rows(sample_rows, offset, limit);
        assert_eq!(actual, expected);
    }
}


================================================
FILE: test.py
================================================
#!/usr/bin/env python3
import argparse

import fastexcel


def get_args() -> argparse.Namespace:
    """Parse the command-line arguments of this script.

    Returns:
        The parsed namespace, with the attributes ``file``, ``column``, ``eager``,
        ``iterations``, ``table`` and ``print_tables``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("file")
    parser.add_argument("-c", "--column", type=str, nargs="+", help="the columns to use")
    parser.add_argument(
        "--eager", action="store_true", help="whether the sheet should be loaded eagerly"
    )
    parser.add_argument(
        "-i", "--iterations", type=int, help="the number of iterations to do", default=1
    )
    parser.add_argument("-t", "--table", type=str, help="the name of the table to load")
    parser.add_argument(
        "--print-tables", action="store_true", help="whether to print the tables in the file"
    )

    return parser.parse_args()


def main():
    """Open the requested file and load either one table or every sheet, repeatedly."""
    cli = get_args()
    reader = fastexcel.read_excel(cli.file)
    # An absent or empty column list means "no column filter".
    columns = cli.column if cli.column else None

    if cli.print_tables:
        names = reader.table_names()
        if len(names) == 0:
            print("No tables found")
        else:
            print(f"Available tables are {', '.join(names)}")

    for _ in range(cli.iterations):
        if not cli.table:
            # No table requested: load every sheet, eagerly or not depending on the flag.
            for sheet_name in reader.sheet_names:
                if cli.eager:
                    reader.load_sheet_eager(sheet_name, use_columns=columns)
                else:
                    reader.load_sheet(sheet_name, use_columns=columns).to_arrow()
            continue

        tbl = reader.load_table(cli.table)
        print(f"Found table {cli.table}:")
        print(tbl.to_polars())

# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: tests/column_selection.rs
================================================
use anyhow::{Context, Result};
use fastexcel::{DType, DTypes, IdxOrName, LoadSheetOrTableOptions, SelectedColumns};
use pretty_assertions::assert_eq;
use rstest::{fixture, rstest};
use std::collections::HashMap;

use crate::utils::path_for_fixture;

#[macro_use]
mod utils;

/// Reader over the `sheet-with-tables.xlsx` fixture.
#[fixture]
fn reader() -> fastexcel::ExcelReader {
    let fixture_path = path_for_fixture("sheet-with-tables.xlsx");
    fastexcel::read_excel(fixture_path).expect("could not read excel file")
}

#[rstest]
fn test_use_columns_with_table(mut reader: fastexcel::ExcelReader) -> Result<()> {
    let selected_columns = SelectedColumns::Selection(vec![
        IdxOrName::Name("User Id".to_string()),
        IdxOrName::Name("FirstName".to_string()),
    ]);

    let opts = LoadSheetOrTableOptions::new_for_table().selected_columns(selected_columns);

    let mut table = reader
        .load_table("users", opts)
        .context("Failed to load table")?;

    assert_eq!(table.name(), "users");
    assert_eq!(table.width(), 4);
    assert_eq!(table.height(), 3);

    let available_columns = table
        .available_columns()
        .context("could not obtain available columns for table")?;
    let expected_available_columns = vec![
        fastexcel::ColumnInfo {
            name: "User Id".into(),
            index: 0,
            absolute_index: 0,
            dtype: fastexcel::DType::Float,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "FirstName".into(),
            index: 1,
            absolute_index: 1,
            dtype: fastexcel::DType::String,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "__UNNAMED__2".into(),
            index: 2,
            absolute_index: 2,
            dtype: fastexcel::DType::String,
            column_name_from: fastexcel::ColumnNameFrom::Generated,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "__UNNAMED__3".into(),
            index: 3,
            absolute_index: 3,
            dtype: fastexcel::DType::DateTime,
            column_name_from: fastexcel::ColumnNameFrom::Generated,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
    ];
    assert_eq!(available_columns, expected_available_columns);

    let selected_columns_info = table.selected_columns();
    let expected_selected_columns = vec![
        fastexcel::ColumnInfo {
            name: "User Id".into(),
            index: 0,
            absolute_index: 0,
            dtype: fastexcel::DType::Float,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "FirstName".into(),
            index: 1,
            absolute_index: 1,
            dtype: fastexcel::DType::String,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
    ];
    assert_eq!(selected_columns_info, expected_selected_columns);

    let expected_columns = fe_columns!(
        "User Id" => [1.0, 2.0, 5.0],
        "FirstName" => ["Peter", "John", "Hans"],
    );

    let table_columns = table
        .to_columns()
        .context("could not convert table to columns")?;
    assert_eq!(table_columns, expected_columns);

    #[cfg(feature = "polars")]
    {
        use polars_core::df;

        let expected_df = df!(
            "User Id" => [1.0, 2.0, 5.0],
            "FirstName" => ["Peter", "John", "Hans"],
        )?;

        let df = table
            .to_polars()
            .context("could not convert table to polars dataframe")?;
        assert!(df.equals_missing(&expected_df))
    }

    Ok(())
}

#[rstest]
fn test_use_columns_with_table_and_provided_columns(
    mut reader: fastexcel::ExcelReader,
) -> Result<()> {
    let selected_columns = SelectedColumns::Selection(vec![0.into(), 2.into()]);

    let opts = LoadSheetOrTableOptions::new_for_table()
        .column_names(vec!["user_id", "last_name"])
        .selected_columns(selected_columns);

    let mut table = reader
        .load_table("users", opts)
        .context("Failed to load table")?;

    assert_eq!(table.name(), "users");
    assert_eq!(table.width(), 4);
    assert_eq!(table.height(), 3);

    let available_columns = table
        .available_columns()
        .context("could not obtain available columns for table")?;
    let expected_available_columns = vec![
        fastexcel::ColumnInfo {
            name: "user_id".into(),
            index: 0,
            absolute_index: 0,
            dtype: fastexcel::DType::Float,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "__UNNAMED__1".into(),
            index: 1,
            absolute_index: 1,
            dtype: fastexcel::DType::String,
            column_name_from: fastexcel::ColumnNameFrom::Generated,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "last_name".into(),
            index: 2,
            absolute_index: 2,
            dtype: fastexcel::DType::String,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "__UNNAMED__3".into(),
            index: 3,
            absolute_index: 3,
            dtype: fastexcel::DType::DateTime,
            column_name_from: fastexcel::ColumnNameFrom::Generated,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
    ];
    assert_eq!(available_columns, expected_available_columns);

    let selected_columns_info = table.selected_columns();
    let expected_selected_columns = vec![
        fastexcel::ColumnInfo {
            name: "user_id".into(),
            index: 0,
            absolute_index: 0,
            dtype: fastexcel::DType::Float,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "last_name".into(),
            index: 2,
            absolute_index: 2,
            dtype: fastexcel::DType::String,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
    ];
    assert_eq!(selected_columns_info, expected_selected_columns);

    let expected_columns = fe_columns!(
        "user_id" => [1.0, 2.0, 5.0],
        "last_name" => ["Müller", "Meier", "Fricker"],
    );

    let table_columns = table
        .to_columns()
        .context("could not convert table to columns")?;
    assert_eq!(table_columns, expected_columns);

    #[cfg(feature = "polars")]
    {
        use polars_core::df;

        let expected_df = df!(
            "user_id" => [1.0, 2.0, 5.0],
            "last_name" => ["Müller", "Meier", "Fricker"],
        )?;

        let df = table
            .to_polars()
            .context("could not convert table to polars dataframe")?;
        assert!(df.equals_missing(&expected_df))
    }

    Ok(())
}

/// Reader over the `sheet-and-table-with-offset.xlsx` fixture.
#[fixture]
fn reader_with_offset() -> fastexcel::ExcelReader {
    let fixture_path = path_for_fixture("sheet-and-table-with-offset.xlsx");
    fastexcel::read_excel(fixture_path).expect("could not read excel file")
}

#[rstest]
fn test_use_column_range_with_offset_with_table_and_specified_dtypes(
    mut reader_with_offset: fastexcel::ExcelReader,
) -> Result<()> {
    let dtypes_map: HashMap<IdxOrName, DType> = [
        (IdxOrName::Idx(3), DType::Int),
        (IdxOrName::Name("Column at E5".to_owned()), DType::String),
    ]
    .into_iter()
    .collect();

    let selected_columns_closed = "D:E"
        .parse::<SelectedColumns>()
        .context("could not parse column selection")?;

    let opts_closed_range = LoadSheetOrTableOptions::new_for_table()
        .selected_columns(selected_columns_closed)
        .with_dtypes(DTypes::Map(dtypes_map.clone()));

    let table_closed = reader_with_offset
        .load_table("TableAtD5", opts_closed_range)
        .context("Failed to load table with closed range")?;

    let selected_columns_open_ended = "D:"
        .parse::<SelectedColumns>()
        .context("could not parse column selection")?;

    let opts_open_ended_range = LoadSheetOrTableOptions::new_for_table()
        .selected_columns(selected_columns_open_ended)
        .with_dtypes(DTypes::Map(dtypes_map.clone()));

    let table_open_ended = reader_with_offset
        .load_table("TableAtD5", opts_open_ended_range)
        .context("Failed to load table with open-ended range")?;

    assert_eq!(table_closed.name(), "TableAtD5");
    assert_eq!(table_open_ended.name(), "TableAtD5");

    let expected_selected_columns = vec![
        fastexcel::ColumnInfo {
            name: "Column at D5".to_owned(),
            index: 0,
            absolute_index: 3,
            dtype: fastexcel::DType::Int,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::ProvidedByIndex,
        },
        fastexcel::ColumnInfo {
            name: "Column at E5".to_owned(),
            index: 1,
            absolute_index: 4,
            dtype: fastexcel::DType::String,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::ProvidedByName,
        },
    ];
    assert_eq!(table_closed.selected_columns(), expected_selected_columns);
    assert_eq!(
        table_open_ended.selected_columns(),
        expected_selected_columns
    );

    let expected_columns = fe_columns!(
        "Column at D5" => [1_i64, 2, 3, 4],
        "Column at E5" => ["4", "5", "6", "8"],
    );

    assert_eq!(
        table_closed
            .to_columns()
            .context("could not convert table to columns")?,
        expected_columns
    );

    assert_eq!(
        table_open_ended
            .to_columns()
            .context("could not convert table to columns")?,
        expected_columns
    );

    #[cfg(feature = "polars")]
    {
        use polars_core::df;

        let expected_df = df!(
            "Column at D5" => [1_i64, 2, 3, 4],
            "Column at E5" => ["4", "5", "6", "8"],
        )?;

        let df_closed = table_closed
            .to_polars()
            .context("could not convert table to polars dataframe")?;
        let df_open_ended = table_open_ended
            .to_polars()
            .context("could not convert table to polars dataframe")?;

        assert!(df_closed.equals_missing(&expected_df));
        assert!(df_open_ended.equals_missing(&expected_df));
    }

    Ok(())
}

/// This test ensures that index-based selection is correctly resolved when used with an offset
/// table: the selected indices should be absolute, and it should be able to handle both index-based
/// and name-based selection.
#[rstest]
fn test_use_column_names_with_offset_table_by_index_and_name(
    mut reader_with_offset: fastexcel::ExcelReader,
) -> Result<()> {
    let selected_columns = SelectedColumns::Selection(vec![
        IdxOrName::Name("Column at D5".to_string()),
        IdxOrName::Idx(4),
    ]);

    let opts = LoadSheetOrTableOptions::new_for_table().selected_columns(selected_columns);

    let table = reader_with_offset
        .load_table("TableAtD5", opts)
        .context("Failed to load table")?;

    assert_eq!(table.name(), "TableAtD5");

    let expected_selected_columns = vec![
        fastexcel::ColumnInfo {
            name: "Column at D5".to_owned(),
            index: 0,
            absolute_index: 3,
            dtype: fastexcel::DType::Float,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "Column at E5".to_owned(),
            index: 1,
            absolute_index: 4,
            dtype: fastexcel::DType::Float,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
    ];

    let selected_columns_info = table.selected_columns();
    assert_eq!(selected_columns_info, expected_selected_columns);

    let expected_columns = fe_columns!(
        "Column at D5" => [1.0, 2.0, 3.0, 4.0],
        "Column at E5" => [4.0, 5.0, 6.0, 8.0],
    );

    let table_columns = table
        .to_columns()
        .context("could not convert table to columns")?;
    assert_eq!(table_columns, expected_columns);

    #[cfg(feature = "polars")]
    {
        use polars_core::df;

        let expected_df = df!(
            "Column at D5" => [1.0, 2.0, 3.0, 4.0],
            "Column at E5" => [4.0, 5.0, 6.0, 8.0],
        )?;

        let df = table
            .to_polars()
            .context("could not convert table to polars dataframe")?;
        assert!(df.equals_missing(&expected_df))
    }

    Ok(())
}

#[rstest]
fn test_use_column_range_with_offset_with_sheet_and_specified_dtypes(
    mut reader_with_offset: fastexcel::ExcelReader,
) -> Result<()> {
    // Create dtypes map: {7: "int", "Column at I10": "string"}
    // Note: Column H is at index 7, Column I is at index 8, Column K is at index 10
    let dtypes_map: HashMap<IdxOrName, DType> = [
        (IdxOrName::Idx(7), DType::Int),
        (IdxOrName::Name("Column at I10".to_owned()), DType::String),
    ]
    .into_iter()
    .collect();

    let selected_columns_closed = "H:K"
        .parse::<SelectedColumns>()
        .context("could not parse column selection")?;

    let opts_closed_range = LoadSheetOrTableOptions::new_for_sheet()
        .header_row(9)
        .selected_columns(selected_columns_closed)
        .with_dtypes(DTypes::Map(dtypes_map.clone()));

    let sheet_closed = reader_with_offset
        .load_sheet("without-table".into(), opts_closed_range)
        .context("Failed to load sheet with closed range")?;

    let selected_columns_open_ended = "H:"
        .parse::<SelectedColumns>()
        .context("could not parse column selection")?;

    let opts_open_ended_range = LoadSheetOrTableOptions::new_for_sheet()
        .header_row(9)
        .selected_columns(selected_columns_open_ended)
        .with_dtypes(DTypes::Map(dtypes_map.clone()));

    let sheet_open_ended = reader_with_offset
        .load_sheet("without-table".into(), opts_open_ended_range)
        .context("Failed to load sheet with open-ended range")?;

    assert_eq!(sheet_closed.name(), "without-table");
    assert_eq!(sheet_open_ended.name(), "without-table");

    let expected_selected_columns = vec![
        fastexcel::ColumnInfo {
            name: "Column at H10".to_owned(),
            index: 0,
            absolute_index: 7,
            dtype: fastexcel::DType::Int,
            column_name_from: fastexcel::ColumnNameFrom::LookedUp,
            dtype_from: fastexcel::DTypeFrom::ProvidedByIndex,
        },
        fastexcel::ColumnInfo {
            name: "Column at I10".to_owned(),
            index: 1,
            absolute_index: 8,
            dtype: fastexcel::DType::String,
            column_name_from: fastexcel::ColumnNameFrom::LookedUp,
            dtype_from: fastexcel::DTypeFrom::ProvidedByName,
        },
        fastexcel::ColumnInfo {
            name: "__UNNAMED__2".to_owned(),
            index: 2,
            absolute_index: 9,
            dtype: fastexcel::DType::String,
            column_name_from: fastexcel::ColumnNameFrom::Generated,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "Column at K10".to_owned(),
            index: 3,
            absolute_index: 10,
            dtype: fastexcel::DType::Float,
            column_name_from: fastexcel::ColumnNameFrom::LookedUp,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
    ];
    assert_eq!(sheet_closed.selected_columns(), &expected_selected_columns);
    assert_eq!(
        sheet_open_ended.selected_columns(),
        &expected_selected_columns
    );

    let expected_columns = fe_columns!(
        "Column at H10" => [1_i64, 2, 3],
        "Column at I10" => ["4", "5", "6"],
        "__UNNAMED__2" => [Option::<&str>::None, None, None],
        "Column at K10" => [7.0, 8.0, 9.0],
    );

    assert_eq!(
        sheet_closed
            .to_columns()
            .context("could not convert sheet to columns")?,
        expected_columns
    );

    assert_eq!(
        sheet_open_ended
            .to_columns()
            .context("could not convert sheet to columns")?,
        expected_columns
    );

    #[cfg(feature = "polars")]
    {
        use polars_core::df;

        let expected_df = df!(
            "Column at H10" => [1_i64, 2, 3],
            "Column at I10" => ["4", "5", "6"],
            "__UNNAMED__2" => [Option::<&str>::None, None, None],
            "Column at K10" => [7.0, 8.0, 9.0],
        )?;

        let df_closed = sheet_closed
            .to_polars()
            .context("could not convert sheet to polars dataframe")?;
        let df_open_ended = sheet_open_ended
            .to_polars()
            .context("could not convert sheet to polars dataframe")?;

        assert!(df_closed.equals_missing(&expected_df));
        assert!(df_open_ended.equals_missing(&expected_df));
    }

    Ok(())
}


================================================
FILE: tests/fastexcel.rs
================================================
#[macro_use]
mod utils;

use anyhow::{Context, Result};
use chrono::NaiveDate;
use fastexcel::{FastExcelColumn, LoadSheetOrTableOptions, SkipRows};
#[cfg(feature = "polars")]
use polars_core::{df, frame::DataFrame};
use pretty_assertions::assert_eq;
use rstest::rstest;
use utils::path_for_fixture;

#[test]
fn test_single_sheet() -> Result<()> {
    let fixture_path = path_for_fixture("fixture-single-sheet.xlsx");
    let mut reader = fastexcel::read_excel(fixture_path).context("could not read excel file")?;

    assert_eq!(reader.sheet_names(), vec!["January"]);

    // The same sheet is loaded twice: once designated by name, once by index.
    let mut by_name = reader
        .load_sheet("January".into(), LoadSheetOrTableOptions::new_for_sheet())
        .context("could not load sheet by name")?;
    let mut by_idx = reader
        .load_sheet(0.into(), LoadSheetOrTableOptions::new_for_sheet())
        .context("could not load sheet by index")?;

    assert_eq!(by_name.name(), by_idx.name());
    assert_eq!(by_name.name(), "January");

    assert_eq!(by_name.height(), by_idx.height());
    assert_eq!(by_name.height(), 2);

    assert_eq!(by_name.width(), by_idx.width());
    assert_eq!(by_name.width(), 2);

    let columns_by_name = by_name
        .to_columns()
        .context("could not convert sheet by name to columns")?;
    let columns_by_idx = by_idx
        .to_columns()
        .context("could not convert sheet by index to columns")?;

    // Both loads must agree with each other and with the expected content.
    let expected_columns = fe_columns!(
        "Month" => [1.0, 2.0],
        "Year" => [2019.0, 2020.0],
    );
    assert_eq!(&columns_by_name, &columns_by_idx);
    assert_eq!(&columns_by_name, &expected_columns);

    #[cfg(feature = "polars")]
    {
        let expected_df = df!(
            "Month" => [1.0, 2.0],
            "Year" => [2019.0, 2020.0]
        )
        .context("could not create expected DataFrame")?;

        let df_by_name = by_name
            .to_polars()
            .context("could not convert sheet by name to DataFrame")?;
        let df_by_idx = by_idx
            .to_polars()
            .context("could not convert sheet by index to DataFrame")?;

        assert_eq!(&df_by_name, &df_by_idx);
        assert!(df_by_name.equals_missing(&expected_df));
    }

    Ok(())
}

#[test]
fn test_single_sheet_bytes() -> Result<()> {
    // Read the whole fixture in memory and build the reader from the raw bytes.
    let fixture_path = path_for_fixture("fixture-single-sheet.xlsx");
    let bytes = std::fs::read(fixture_path)?;

    let mut reader = fastexcel::ExcelReader::try_from(&bytes[..])
        .context("could not create reader from bytes")?;

    assert_eq!(reader.sheet_names(), vec!["January"]);

    // The same sheet is loaded twice: once designated by name, once by index.
    let mut by_name = reader
        .load_sheet("January".into(), LoadSheetOrTableOptions::new_for_sheet())
        .context("could not load sheet by name")?;
    let mut by_idx = reader
        .load_sheet(0.into(), LoadSheetOrTableOptions::new_for_sheet())
        .context("could not load sheet by index")?;

    assert_eq!(by_name.name(), by_idx.name());
    assert_eq!(by_name.name(), "January");

    assert_eq!(by_name.height(), by_idx.height());
    assert_eq!(by_name.height(), 2);

    assert_eq!(by_name.width(), by_idx.width());
    assert_eq!(by_name.width(), 2);

    let columns_by_name = by_name
        .to_columns()
        .context("could not convert sheet by name to columns")?;
    let columns_by_idx = by_idx
        .to_columns()
        .context("could not convert sheet by index to columns")?;

    // Both loads must agree with each other and with the expected content.
    let expected_columns = fe_columns!(
        "Month" => [1.0, 2.0],
        "Year" => [2019.0, 2020.0]
    );
    assert_eq!(&columns_by_name, &columns_by_idx);
    assert_eq!(&columns_by_name, &expected_columns);

    #[cfg(feature = "polars")]
    {
        let expected_df = df!(
            "Month" => [1.0, 2.0],
            "Year" => [2019.0, 2020.0]
        )
        .context("could not create expected DataFrame")?;

        let df_by_name = by_name
            .to_polars()
            .context("could not convert sheet by name to DataFrame")?;
        let df_by_idx = by_idx
            .to_polars()
            .context("could not convert sheet by index to DataFrame")?;

        assert_eq!(&df_by_name, &df_by_idx);
        assert!(df_by_name.equals_missing(&expected_df));
    }

    Ok(())
}

#[test]
fn test_single_sheet_with_types() -> Result<()> {
    let fixture_path = path_for_fixture("fixture-single-sheet-with-types.xlsx");
    let mut excel_reader =
        fastexcel::read_excel(fixture_path).context("could not read excel file")?;

    let mut sheet = excel_reader
        .load_sheet(0.into(), LoadSheetOrTableOptions::new_for_sheet())
        .context("could not load sheet")?;

    assert_eq!(sheet.name(), "Sheet1");
    assert_eq!(sheet.height(), sheet.total_height());
    assert_eq!(sheet.height(), 3);
    assert_eq!(sheet.width(), 4);

    let columns = sheet
        .to_columns()
        .context("could not convert sheet by name to columns")?;

    // The "dates" column is expected to hold this same date and time on every row.
    let timestamp = NaiveDate::from_ymd_opt(2022, 3, 2)
        .and_then(|date| date.and_hms_opt(5, 43, 4))
        .unwrap();

    let expected_columns = fe_columns!(
        "__UNNAMED__0" => [0.0, 1.0, 2.0],
        "bools" => [true, false, true],
        "dates" => [timestamp; 3],
        "floats" => [12.35, 42.69, 1234567.0],
    );
    assert_eq!(&columns, &expected_columns);

    #[cfg(feature = "polars")]
    {
        let expected_df = df!(
            "__UNNAMED__0" => [0.0, 1.0, 2.0],
            "bools" => [true, false, true],
            "dates" => [timestamp; 3],
            "floats" => [12.35, 42.69, 1234567.0],
        )
        .context("could not create expected DataFrame")?;

        let actual_df = sheet
            .to_polars()
            .context("could not convert sheet to DataFrame")?;

        assert!(actual_df.equals_missing(&expected_df));
    }

    Ok(())
}

// Loads three sheets of the multi-sheet fixture (two by index, one by name) and
// checks the columns produced for each of them, including the generated
// `__UNNAMED__<n>` names of the sheet with unnamed columns.
#[test]
fn test_multiple_sheets() -> Result<()> {
    let mut excel_reader = fastexcel::read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))
        .context("could not read excel file")?;

    let sheet_0 = excel_reader
        .load_sheet(0.into(), LoadSheetOrTableOptions::new_for_sheet())
        .context("could not load sheet 0 by idx")?;
    let expected_columns_sheet_0 = fe_columns!("Month" => [1.0], "Year" => [2019.0]);
    let sheet_0_columns = sheet_0
        .to_columns()
        .context("could not convert sheet 0 to columns")?;
    assert_eq!(sheet_0_columns, expected_columns_sheet_0);

    let sheet_1 = excel_reader
        .load_sheet(1.into(), LoadSheetOrTableOptions::new_for_sheet())
        .context("could not load sheet 1 by idx")?;
    let expected_columns_sheet_1 =
        fe_columns!("Month" => [2.0, 3.0, 4.0], "Year" => [2019.0, 2021.0, 2022.0]);
    let sheet_1_columns = sheet_1
        .to_columns()
        .context("could not convert sheet 1 to columns")?;
    assert_eq!(sheet_1_columns, expected_columns_sheet_1);

    // This sheet is selected by its name rather than by its index, so the error
    // message reports a lookup by name.
    let sheet_unnamed_columns = excel_reader
        .load_sheet(
            "With unnamed columns".into(),
            LoadSheetOrTableOptions::new_for_sheet(),
        )
        .context("could not load sheet \"With unnamed columns\" by name")?;
    let expected_columns_sheet_unnamed_columns = fe_columns!(
        "col1" => [2.0, 3.0],
        "__UNNAMED__1" => [1.5, 2.5],
        "col3" => ["hello", "world"],
        "__UNNAMED__3" => [-5.0, -6.0],
        "col5" => ["a", "b"],
    );
    let sheet_unnamed_columns_columns = sheet_unnamed_columns
        .to_columns()
        .context("could not convert sheet \"With unnamed columns\" to columns")?;

    assert_eq!(
        sheet_unnamed_columns_columns,
        expected_columns_sheet_unnamed_columns
    );

    // Same expectations, expressed as polars DataFrames.
    #[cfg(feature = "polars")]
    {
        let expected_df_sheet_0 = df!("Month" => [1.0], "Year" => [2019.0])?;
        let df_sheet_0 = sheet_0
            .to_polars()
            .context("could not convert sheet 0 to DataFrame")?;
        assert!(expected_df_sheet_0.equals_missing(&df_sheet_0));

        let expected_df_sheet_1 =
            df!("Month" => [2.0, 3.0, 4.0], "Year" => [2019.0, 2021.0, 2022.0])?;
        let df_sheet_1 = sheet_1
            .to_polars()
            .context("could not convert sheet 1 to DataFrame")?;
        assert!(expected_df_sheet_1.equals_missing(&df_sheet_1));

        let expected_df_sheet_unnamed_columns = df!(
            "col1" => [2.0, 3.0],
            "__UNNAMED__1" => [1.5, 2.5],
            "col3" => ["hello", "world"],
            "__UNNAMED__3" => [-5.0, -6.0],
            "col5" => ["a", "b"],
        )?;
        let df_sheet_unnamed_columns = sheet_unnamed_columns
            .to_polars()
            .context("could not convert sheet \"With unnamed columns\" to DataFrame")?;
        assert!(expected_df_sheet_unnamed_columns.equals_missing(&df_sheet_unnamed_columns));
    }

    Ok(())
}

// Reads "Sheet1" with `header_row(1)`, once selected by name and once by index,
// then checks that both loads agree with each other and with the expected data.
#[test]
fn test_sheet_with_header_row_diff_from_zero() -> Result<()> {
    let fixture = path_for_fixture("fixture-changing-header-location.xlsx");
    let mut workbook = fastexcel::read_excel(fixture).context("could not read excel file")?;

    assert_eq!(workbook.sheet_names(), vec!["Sheet1", "Sheet2", "Sheet3"]);

    let mut by_name = workbook
        .load_sheet(
            "Sheet1".into(),
            LoadSheetOrTableOptions::new_for_sheet().header_row(1),
        )
        .context("could not load sheet \"Sheet1\" by name")?;

    let mut by_idx = workbook
        .load_sheet(
            0.into(),
            LoadSheetOrTableOptions::new_for_sheet().header_row(1),
        )
        .context("could not load sheet 0 by index")?;

    // Metadata: both handles must describe the same 2x2 sheet.
    assert_eq!(by_name.name(), by_idx.name());
    assert_eq!(by_name.name(), "Sheet1");

    assert_eq!(by_name.height(), by_idx.height());
    assert_eq!(by_name.height(), 2);

    assert_eq!(by_name.width(), by_idx.width());
    assert_eq!(by_name.width(), 2);

    let want = fe_columns!(
        "Month" => [1.0, 2.0],
        "Year" => [2019.0, 2020.0]
    );

    let got_by_name = by_name
        .to_columns()
        .context("could not convert sheet \"Sheet1\" to columns")?;
    let got_by_idx = by_idx
        .to_columns()
        .context("could not convert sheet 0 to columns")?;
    assert_eq!(&got_by_name, &got_by_idx);
    assert_eq!(&got_by_name, &want);

    #[cfg(feature = "polars")]
    {
        let frame_by_name = by_name
            .to_polars()
            .context("could not convert sheet \"Sheet1\" to DataFrame")?;
        let frame_by_idx = by_idx
            .to_polars()
            .context("could not convert sheet 0 to DataFrame")?;
        let want_frame = df!(
            "Month" => [1.0, 2.0],
            "Year" => [2019.0, 2020.0]
        )?;

        assert!(frame_by_name.equals_missing(&frame_by_idx));
        assert!(want_frame.equals_missing(&frame_by_name));
    }

    Ok(())
}

// Loads a single row of the typed fixture with no header row, one skipped row and
// caller-provided column names, then checks the resulting dimensions and values.
#[test]
fn test_sheet_with_pagination_and_without_headers() -> Result<()> {
    let fixture = path_for_fixture("fixture-single-sheet-with-types.xlsx");
    let mut workbook = fastexcel::read_excel(fixture).context("could not read excel file")?;

    let mut sheet = workbook
        .load_sheet(
            0.into(),
            LoadSheetOrTableOptions::new_for_sheet()
                .n_rows(1)
                .skip_rows(SkipRows::Simple(1))
                .no_header_row()
                .column_names(["This", "Is", "Amazing", "Stuff"]),
        )
        .context("could not load sheet 0")?;

    assert_eq!(sheet.name(), "Sheet1");
    assert_eq!(sheet.height(), 1);
    assert_eq!(sheet.width(), 4);

    // Datetime expected in the single loaded row of the third column.
    let expected_dt = NaiveDate::from_ymd_opt(2022, 3, 2)
        .unwrap()
        .and_hms_opt(5, 43, 4)
        .unwrap();

    let want = fe_columns!(
        "This" => [0.0],
        "Is" => [true],
        "Amazing" => [expected_dt],
        "Stuff" => [12.35],
    );

    let got = sheet
        .to_columns()
        .context("could not convert sheet to columns")?;
    assert_eq!(&got, &want);

    #[cfg(feature = "polars")]
    {
        let got_frame = sheet
            .to_polars()
            .context("could not convert sheet to DataFrame")?;
        let want_frame = df!(
            "This" => [0.0],
            "Is" => [true],
            "Amazing" => [expected_dt],
            "Stuff" => [12.35],
        )?;

        assert!(got_frame.equals_missing(&want_frame));
    }

    Ok(())
}

// Parameterized test covering combinations of the `header_row` and `skip_rows`
// options on the "no-header.xlsx" fixture.
//
// Each case is `(header_row, skip_rows, expected columns)`. The expected columns are
// compared against the result of `to_columns()` on the first sheet.
#[rstest]
// Header at row 0 with leading empty rows skipped: names `a` / `0`, data `b..f`.
#[case(Some(0), SkipRows::SkipEmptyRowsAtBeginning, fe_columns!("a" => ["b", "c", "d", "e", "f"], "0" => [1.0, 2.0, 3.0, 4.0, 5.0]))]
// No header and nothing skipped: generated names, and the expected data starts with
// two null rows in both columns.
#[case(
    None,
    SkipRows::Simple(0),
    fe_columns!(
        "__UNNAMED__0" => [None, None, Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [None, None, Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )
)]
// No header, leading empty rows skipped: same generated names, no leading nulls.
#[case(
    None,
    SkipRows::SkipEmptyRowsAtBeginning,
    fe_columns!(
        "__UNNAMED__0" => ["a", "b", "c", "d", "e", "f"],
        "__UNNAMED__1" => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
    )
)]
// Header at row 0, nothing skipped: generated names and a single leading null row.
#[case(
    Some(0),
    SkipRows::Simple(0),
    fe_columns!(
        "__UNNAMED__0" => [None, Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [None, Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )
)]
// Header at row 0 and one row skipped: the leading null row is gone.
#[case(
    Some(0),
    SkipRows::Simple(1),
    fe_columns!(
        "__UNNAMED__0" => [Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )
)]
// No header and two rows skipped: data starts at `a` / `0.0`.
#[case(
    None,
    SkipRows::Simple(2),
    fe_columns!(
        "__UNNAMED__0" => [Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )
)]
// No header and three rows skipped: data starts at `b` / `1.0`.
#[case(
    None,
    SkipRows::Simple(3),
    fe_columns!(
        "__UNNAMED__0" => [Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )
)]
// Header at row 1, nothing skipped: generated names, data `a..f`.
#[case(
    Some(1),
    SkipRows::Simple(0),
    fe_columns!("__UNNAMED__0" => ["a", "b", "c", "d", "e", "f"], "__UNNAMED__1" => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0])
)]
// Header at row 2: names `a` / `0`, data `b..f`, for both skip modes below.
#[case(Some(2), SkipRows::Simple(0), fe_columns!("a" => ["b", "c", "d", "e", "f"], "0" => [1.0, 2.0, 3.0, 4.0, 5.0]))]
#[case(
    Some(2),
    SkipRows::SkipEmptyRowsAtBeginning,
    fe_columns!("a" => ["b", "c", "d", "e", "f"], "0" => [1.0, 2.0, 3.0, 4.0, 5.0])
)]
fn test_header_row_and_skip_rows(
    #[case] header_row: Option<usize>,
    #[case] skip_rows: SkipRows,
    #[case] expected: Vec<FastExcelColumn>,
) -> Result<()> {
    let mut excel_reader = fastexcel::read_excel(path_for_fixture("no-header.xlsx"))
        .context("could not read excel file")?;

    // Start from the default sheet options and override only the two fields under test.
    let mut opts = LoadSheetOrTableOptions::new_for_sheet();
    opts.header_row = header_row;
    opts.skip_rows = skip_rows;
    let sheet = excel_reader
        .load_sheet(0.into(), opts)
        .context("could not load sheet 0")?;

    let sheet_columns = sheet
        .to_columns()
        .context("could not convert sheet to columns")?;
    assert_eq!(&sheet_columns, &expected);
    Ok(())
}

// Polars counterpart of `test_header_row_and_skip_rows`, only compiled when the
// `polars` feature is enabled.
//
// Each case is `(header_row, skip_rows, expected DataFrame)`. The expected frame is
// compared with `equals_missing` against the result of `to_polars()` on the first
// sheet of the "no-header.xlsx" fixture.
#[cfg(feature = "polars")]
#[rstest]
// Header at row 0 with leading empty rows skipped: names `a` / `0`, data `b..f`.
#[case(Some(0), SkipRows::SkipEmptyRowsAtBeginning, df!("a" => ["b", "c", "d", "e", "f"], "0" => [1.0, 2.0, 3.0, 4.0, 5.0])?)]
// No header and nothing skipped: generated names, and the expected data starts with
// two null rows in both columns.
#[case(
    None,
    SkipRows::Simple(0),
    df!(
        "__UNNAMED__0" => [None, None, Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [None, None, Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )?
)]
// No header, leading empty rows skipped: same generated names, no leading nulls.
#[case(
    None,
    SkipRows::SkipEmptyRowsAtBeginning,
    df!(
        "__UNNAMED__0" => ["a", "b", "c", "d", "e", "f"],
        "__UNNAMED__1" => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
    )?
)]
// Header at row 0, nothing skipped: generated names and a single leading null row.
#[case(
    Some(0),
    SkipRows::Simple(0),
    df!(
        "__UNNAMED__0" => [None, Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [None, Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )?
)]
// Header at row 0 and one row skipped: the leading null row is gone.
#[case(
    Some(0),
    SkipRows::Simple(1),
    df!(
        "__UNNAMED__0" => [Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )?
)]
// No header and two rows skipped: data starts at `a` / `0.0`.
#[case(
    None,
    SkipRows::Simple(2),
    df!(
        "__UNNAMED__0" => [Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )?
)]
// No header and three rows skipped: data starts at `b` / `1.0`.
#[case(
    None,
    SkipRows::Simple(3),
    df!(
        "__UNNAMED__0" => [Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )?
)]
// Header at row 1, nothing skipped: generated names, data `a..f`.
#[case(
    Some(1),
    SkipRows::Simple(0),
    df!("__UNNAMED__0" => ["a", "b", "c", "d", "e", "f"], "__UNNAMED__1" => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0])?
)]
// Header at row 2: names `a` / `0`, data `b..f`, for both skip modes below.
#[case(Some(2), SkipRows::Simple(0), df!("a" => ["b", "c", "d", "e", "f"], "0" => [1.0, 2.0, 3.0, 4.0, 5.0])?)]
#[case(
    Some(2),
    SkipRows::SkipEmptyRowsAtBeginning,
    df!("a" => ["b", "c", "d", "e", "f"], "0" => [1.0, 2.0, 3.0, 4.0, 5.0])?
)]
fn test_header_row_and_skip_rows_polars(
    #[case] header_row: Option<usize>,
    #[case] skip_rows: SkipRows,
    #[case] expected: DataFrame,
) -> Result<()> {
    let mut excel_reader = fastexcel::read_excel(path_for_fixture("no-header.xlsx"))
        .context("could not read excel file")?;

    // Start from the default sheet options and override only the two fields under test.
    let mut opts = LoadSheetOrTableOptions::new_for_sheet();
    opts.header_row = header_row;
    opts.skip_rows = skip_rows;

    let sheet = excel_reader
        .load_sheet(0.into(), opts)
        .context("could not load sheet 0")?;

    let df = sheet
        .to_polars()
        .context("could not convert sheet to DataFrame")?;

    // `equals_missing` is used for the comparison because several cases expect nulls.
    assert!(df.equals_missing(&expected));

    Ok(())
}


================================================
FILE: tests/sheet_visibility.rs
================================================
#[allow(unused_macros)]
mod utils;

use anyhow::{Context, Result};
use fastexcel::{LoadSheetOrTableOptions, SheetVisible};
use pretty_assertions::assert_matches;

use crate::utils::path_for_fixture;

// Checks that the three sheets of the visibility fixture report, in order,
// `Visible`, `Hidden` and `VeryHidden`.
#[test]
fn sheet_visibility() -> Result<()> {
    let fixture = path_for_fixture("fixture-sheets-different-visibilities.xlsx");
    let mut workbook = fastexcel::read_excel(fixture).context("could not read excel file")?;

    // All three sheets are loaded before any assertion runs.
    let first = workbook.load_sheet(0.into(), LoadSheetOrTableOptions::new_for_sheet())?;
    let second = workbook.load_sheet(1.into(), LoadSheetOrTableOptions::new_for_sheet())?;
    let third = workbook.load_sheet(2.into(), LoadSheetOrTableOptions::new_for_sheet())?;

    assert_matches!(first.visible(), SheetVisible::Visible);
    assert_matches!(second.visible(), SheetVisible::Hidden);
    assert_matches!(third.visible(), SheetVisible::VeryHidden);

    Ok(())
}


================================================
FILE: tests/shifted_data.rs
================================================
#[allow(unused_macros)]
mod utils;

use anyhow::{Context, Result};
use fastexcel::LoadSheetOrTableOptions;
use pretty_assertions::assert_eq;
use utils::path_for_fixture;

// Loads the "without-table" sheet, whose data does not start at the top-left cell,
// and checks the column metadata: relative `index`, `absolute_index`, dtype and
// the origin of each column name.
#[test]
fn test_sheet_with_offset() -> Result<()> {
    use fastexcel::{ColumnInfo, ColumnNameFrom, DType, DTypeFrom};

    let fixture = path_for_fixture("sheet-and-table-with-offset.xlsx");
    let mut workbook = fastexcel::read_excel(fixture).context("could not read the excel file")?;
    let mut sheet = workbook
        .load_sheet(
            "without-table".into(),
            LoadSheetOrTableOptions::new_for_sheet(),
        )
        .context("could not load sheet \"without-table\"")?;

    let actual = sheet
        .available_columns()
        .context("could not obtain available columns for sheet")?;

    let expected = vec![
        ColumnInfo {
            name: "Column at H10".into(),
            index: 0,
            absolute_index: 7,
            dtype: DType::Float,
            dtype_from: DTypeFrom::Guessed,
            column_name_from: ColumnNameFrom::LookedUp,
        },
        ColumnInfo {
            name: "Column at I10".into(),
            index: 1,
            absolute_index: 8,
            dtype: DType::Float,
            dtype_from: DTypeFrom::Guessed,
            column_name_from: ColumnNameFrom::LookedUp,
        },
        // The third column is expected to carry a generated name.
        ColumnInfo {
            name: "__UNNAMED__2".into(),
            index: 2,
            absolute_index: 9,
            dtype: DType::String,
            dtype_from: DTypeFrom::Guessed,
            column_name_from: ColumnNameFrom::Generated,
        },
        ColumnInfo {
            name: "Column at K10".into(),
            index: 3,
            absolute_index: 10,
            dtype: DType::Float,
            dtype_from: DTypeFrom::Guessed,
            column_name_from: ColumnNameFrom::LookedUp,
        },
    ];
    assert_eq!(actual, expected);

    Ok(())
}

// Loads the table "TableAtD5" and checks the column metadata: relative `index`,
// `absolute_index`, dtype and the origin of each column name.
#[test]
fn test_table_with_offset() -> Result<()> {
    use fastexcel::{ColumnInfo, ColumnNameFrom, DType, DTypeFrom};

    let fixture = path_for_fixture("sheet-and-table-with-offset.xlsx");
    let mut workbook = fastexcel::read_excel(fixture).context("could not read the excel file")?;
    let mut table = workbook
        .load_table("TableAtD5", LoadSheetOrTableOptions::new_for_table())
        .context("could not load table \"TableAtD5\"")?;

    let actual = table
        .available_columns()
        .context("could not obtain available columns for table")?;

    let expected = vec![
        ColumnInfo {
            name: "Column at D5".into(),
            index: 0,
            absolute_index: 3,
            dtype: DType::Float,
            dtype_from: DTypeFrom::Guessed,
            column_name_from: ColumnNameFrom::Provided,
        },
        ColumnInfo {
            name: "Column at E5".into(),
            index: 1,
            absolute_index: 4,
            dtype: DType::Float,
            dtype_from: DTypeFrom::Guessed,
            column_name_from: ColumnNameFrom::Provided,
        },
    ];
    assert_eq!(actual, expected);

    Ok(())
}


================================================
FILE: tests/tables.rs
================================================
use anyhow::{Context, Result};
use chrono::NaiveDate;
use fastexcel::LoadSheetOrTableOptions;
use pretty_assertions::assert_eq;
use rstest::{fixture, rstest};

use crate::utils::path_for_fixture;

#[macro_use]
mod utils;

// rstest fixture: opens the "sheet-with-tables.xlsx" workbook, panicking if it
// cannot be read.
#[fixture]
fn reader() -> fastexcel::ExcelReader {
    let fixture_path = path_for_fixture("sheet-with-tables.xlsx");
    fastexcel::read_excel(fixture_path).expect("could not read excel file")
}

// Checks the table names reported by the reader, either for the whole workbook
// (`None`) or restricted to a single sheet (`Some(sheet_name)`).
#[rstest]
#[case::all_sheets(None, vec!["users"])]
#[case::sheet_with_tables(Some("sheet1"), vec!["users"])]
#[case::sheet_without_tables(Some("sheet2"), vec![])]
fn test_table_names(
    mut reader: fastexcel::ExcelReader,
    #[case] sheet_name: Option<&str>,
    #[case] expected: Vec<&str>,
) -> Result<()> {
    let actual = reader
        .table_names(sheet_name)
        .context("Failed to get table names")?;

    assert_eq!(actual, expected);

    Ok(())
}

#[rstest]
fn test_load_table(mut reader: fastexcel::ExcelReader) -> Result<()> {
    let mut table = reader
        .load_table("users", LoadSheetOrTableOptions::new_for_table())
        .context("Failed to load table")?;

    assert_eq!(table.name(), "users");
    assert_eq!(table.sheet_name(), "sheet1");
    assert!(table.specified_dtypes().is_none());
    assert_eq!(table.total_height(), 3);
    assert_eq!(table.offset(), 0);
    assert_eq!(table.height(), 3);
    assert_eq!(table.width(), 4);
    let available_columns = table
        .available_columns()
        .context("could not obtain available columns for table")?;
    let expected_column_info = vec![
        fastexcel::ColumnInfo {
            name: "User Id".into(),
            index: 0,
            absolute_index: 0,
            dtype: fastexcel::DType::Float,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "FirstName".into(),
            index: 1,
            absolute_index: 1,
            dtype: fastexcel::DType::String,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "LastName".into(),
            index: 2,
            absolute_index: 2,
            dtype: fastexcel::DType::String,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "Date".into(),
            index: 3,
            absolute_index: 3,
            dtype: fastexcel::DType::DateTime,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
    ];
    assert_eq!(available_columns, expected_column_info);

    let dates = [
        NaiveDate::from_ymd_opt(2020, 1, 1)
            .unwrap()
            .and_hms_opt(0, 0, 0)
            .unwrap(),
        NaiveDate::from_ymd_opt(2024, 5, 4)
            .unwrap()
            .and_hms_opt(0, 0, 0)
            .unwrap(),
        NaiveDate::from_ymd_opt(2025, 2, 1)
            .unwrap()
            .and_hms_opt(0, 0, 0)
            .unwrap(),
    ];

    let expected_columns = fe_columns!(
        "User Id" => [1.0, 2.0, 5.0],
        "FirstName" => ["Peter", "John", "Hans"],
        "LastName" => ["Müller", "Meier", "Fricker"],
        "Date" => dates.as_slice(),
    );

    let table_columns = table
        .to_columns()
        .context("could not convert table to columns")?;
    assert_eq!(table_columns, expected_columns);

    #[cfg(feature = "polars")]
    {
        use polars_core::df;

        let expected_df = df!(
            "User Id" => [1.0, 2.0, 5.0],
            "FirstName" => ["Peter", "John", "Hans"],
            "LastName" => ["Müller", "Meier", "Fricker"],
            "Date" => dates.as_slice(),
        )?;

        let df = table
            .to_polars()
            .context("could not convert table to polars dataframe")?;
        assert!(df.equals_missing(&expected_df))
    }

    Ok(())
}


================================================
FILE: tests/utils/mod.rs
================================================
/// Returns the path of `fixture_file` inside the crate's `tests/fixtures` directory.
///
/// The path is built from the `CARGO_MANIFEST_DIR` value captured at compile time,
/// with `/` as the separator.
pub fn path_for_fixture(fixture_file: &str) -> String {
    [
        env!("CARGO_MANIFEST_DIR"),
        "tests",
        "fixtures",
        fixture_file,
    ]
    .join("/")
}

// Builds a single `fastexcel::FastExcelColumn` from a name and a collection of values.
//
// Expands to a `Result` carrying the "Failed to create column" context; the call site
// must have a `context` method in scope for the value returned by `try_new`.
macro_rules! fe_column {
    ($col_name:expr, $values:expr) => {
        fastexcel::FastExcelColumn::try_new($col_name.into(), $values.into(), None)
            .context("Failed to create column")
    };
}

// Builds a `Vec` of columns from one or more `name => values` pairs.
//
// Each pair goes through `fe_column!` and is unwrapped with `?`, so this macro can
// only be used where `?` is allowed.
macro_rules! fe_columns {
    // At least one `name => values` pair, comma-separated, with an optional trailing comma
    ($($col_name:expr => $values:expr),+ $(,)?) => {
        vec![$(fe_column!($col_name, $values)?),+]
    };
}


================================================
FILE: tests/whitespace.rs
================================================
#[macro_use]
mod utils;

use anyhow::{Context, Result};
use chrono::{NaiveDate, NaiveDateTime};
use fastexcel::{ExcelReader, LoadSheetOrTableOptions};
use pretty_assertions::assert_eq;
use rstest::{fixture, rstest};

use crate::utils::path_for_fixture;

// rstest fixture: opens the "sheet-and-table-with-whitespace.xlsx" workbook,
// panicking if it cannot be read.
#[fixture]
fn reader() -> ExcelReader {
    let fixture_path = path_for_fixture("sheet-and-table-with-whitespace.xlsx");
    fastexcel::read_excel(fixture_path).expect("could not read fixture")
}

// Builds a datetime in November 2025 from a day and a time of day.
// Panics if the components do not form a valid date or time.
const fn november_2025(day: u32, hour: u32, min: u32, sec: u32) -> NaiveDateTime {
    NaiveDate::from_ymd_opt(2025, 11, day)
        .unwrap()
        .and_hms_opt(hour, min, sec)
        .unwrap()
}

// Expected values of "Column Three": ten rows, with nulls at index 3 and in the
// last four rows.
const DATES: &[Option<NaiveDateTime>] = &[
    Some(november_2025(19, 14, 34, 2)),
    Some(november_2025(20, 14, 56, 34)),
    Some(november_2025(21, 15, 19, 6)),
    None,
    Some(november_2025(22, 15, 41, 38)),
    Some(november_2025(23, 16, 4, 10)),
    None,
    None,
    None,
    None,
];

// Checks the effect of `skip_whitespace_tail_rows` on both the "Without Table" sheet
// and the "Table_with_whitespace" table: with default options all ten rows are
// expected, with the option enabled only the first six.
#[rstest]
fn test_skip_tail_rows_behavior(mut reader: ExcelReader) -> Result<()> {
    const SHEET_NAME: &str = "Without Table";
    const TABLE_NAME: &str = "Table_with_whitespace";

    let want_full = fe_columns!(
        // String because the last row contains a space
        "Column One" => [Some("1"), Some("2"), Some("3"), None, Some("5"), None, None, None, None, Some(" ")],
        "Column Two" => [Some("one"), Some("two"), None, Some("four"), Some("five"), None, None, Some(""), None, None],
        "Column Three" => DATES,
    );
    let want_trimmed = fe_columns!(
        // Not string rows -> float
        "Column One" => [Some(1.0), Some(2.0), Some(3.0), None, Some(5.0), None],
        "Column Two" => [Some("one"), Some("two"), None, Some("four"), Some("five"), None],
        "Column Three" => &DATES[0..6],
    );

    // Default options: whitespace-only tail rows are kept.
    let full_sheet = reader
        .load_sheet(SHEET_NAME.into(), LoadSheetOrTableOptions::new_for_sheet())
        .context(r#"could not load sheet "Without Table""#)?;
    let full_sheet_columns = full_sheet
        .to_columns()
        .context("could not convert sheet to columns")?;
    assert_eq!(full_sheet_columns, want_full);

    let full_table = reader
        .load_table(TABLE_NAME, LoadSheetOrTableOptions::new_for_table())
        .context(r#"could not load table "Table_with_whitespace""#)?;
    let full_table_columns = full_table
        .to_columns()
        .context("could not convert table to columns")?;
    assert_eq!(full_table_columns, want_full);

    // With `skip_whitespace_tail_rows(true)`: only the first six rows are expected.
    let trimmed_sheet = reader
        .load_sheet(
            SHEET_NAME.into(),
            LoadSheetOrTableOptions::new_for_sheet().skip_whitespace_tail_rows(true),
        )
        .context(r#"could not load sheet "Without Table""#)?;
    let trimmed_sheet_columns = trimmed_sheet
        .to_columns()
        .context("could not convert sheet to columns")?;
    assert_eq!(trimmed_sheet_columns, want_trimmed);

    let trimmed_table = reader
        .load_table(
            TABLE_NAME,
            LoadSheetOrTableOptions::new_for_table().skip_whitespace_tail_rows(true),
        )
        .context(r#"could not load table "Table_with_whitespace""#)?;
    let trimmed_table_columns = trimmed_table
        .to_columns()
        .context("could not convert table to columns")?;
    assert_eq!(trimmed_table_columns, want_trimmed);

    Ok(())
}

// Checks `whitespace_as_null`, alone and combined with `skip_whitespace_tail_rows`,
// on both the "Without Table" sheet and the "Table_with_whitespace" table.
#[rstest]
fn test_skip_tail_rows_and_whitespace_as_null_behavior(mut reader: ExcelReader) -> Result<()> {
    const SHEET_NAME: &str = "Without Table";
    const TABLE_NAME: &str = "Table_with_whitespace";

    let want_nulled = fe_columns!(
        // All rows should be taken into account but the space in the last row should be considered null
        "Column One" => [Some(1.0), Some(2.0), Some(3.0), None, Some(5.0), None, None, None, None, None],
        // All rows should be taken into account but the empty string in 8th row should be considered null
        "Column Two" => [Some("one"), Some("two"), None, Some("four"), Some("five"), None, None, None, None, None],
        "Column Three" => DATES,
    );
    let want_trimmed = fe_columns!(
        "Column One" => [Some(1.0), Some(2.0), Some(3.0), None, Some(5.0), None],
        "Column Two" => [Some("one"), Some("two"), None, Some("four"), Some("five"), None],
        "Column Three" => &DATES[0..6],
    );

    // `whitespace_as_null(true)` only: all ten rows are expected.
    let nulled_sheet = reader
        .load_sheet(
            SHEET_NAME.into(),
            LoadSheetOrTableOptions::new_for_sheet().whitespace_as_null(true),
        )
        .context(r#"could not load sheet "Without Table""#)?;
    let nulled_sheet_columns = nulled_sheet
        .to_columns()
        .context("could not convert sheet to columns")?;
    assert_eq!(nulled_sheet_columns, want_nulled);

    let nulled_table = reader
        .load_table(
            TABLE_NAME,
            LoadSheetOrTableOptions::new_for_table().whitespace_as_null(true),
        )
        .context(r#"could not load table "Table_with_whitespace""#)?;
    let nulled_table_columns = nulled_table
        .to_columns()
        .context("could not convert table to columns")?;
    assert_eq!(nulled_table_columns, want_nulled);

    // Both options enabled: only the first six rows are expected.
    let trimmed_sheet = reader
        .load_sheet(
            SHEET_NAME.into(),
            LoadSheetOrTableOptions::new_for_sheet()
                .whitespace_as_null(true)
                .skip_whitespace_tail_rows(true),
        )
        .context(r#"could not load sheet "Without Table""#)?;
    let trimmed_sheet_columns = trimmed_sheet
        .to_columns()
        .context("could not convert sheet to columns")?;
    assert_eq!(trimmed_sheet_columns, want_trimmed);

    let trimmed_table = reader
        .load_table(
            TABLE_NAME,
            LoadSheetOrTableOptions::new_for_table()
                .whitespace_as_null(true)
                .skip_whitespace_tail_rows(true),
        )
        .context(r#"could not load table "Table_with_whitespace""#)?;
    let trimmed_table_columns = trimmed_table
        .to_columns()
        .context("could not convert table to columns")?;
    assert_eq!(trimmed_table_columns, want_trimmed);

    Ok(())
}