Repository: ToucanToco/fastexcel Branch: main Commit: 98bf33293c85 Files: 99 Total size: 46.9 MB Directory structure: gitextract_ze2pys5u/ ├── .clippy.toml ├── .github/ │ ├── dependabot.yml │ └── workflows/ │ ├── CI.yml │ ├── docs.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Cargo.toml ├── LICENSE ├── Makefile ├── README.md ├── doc-templates/ │ └── module.html.jinja2 ├── pyproject.toml ├── python/ │ ├── fastexcel/ │ │ ├── __init__.py │ │ ├── _fastexcel.pyi │ │ └── py.typed │ └── tests/ │ ├── __init__.py │ ├── benchmarks/ │ │ ├── README.md │ │ ├── fixtures/ │ │ │ ├── formulas.xlsx │ │ │ ├── plain_data.xls │ │ │ └── plain_data.xlsx │ │ ├── memory.py │ │ ├── readers.py │ │ └── speed.py │ ├── conftest.py │ ├── test_alias_generation.py │ ├── test_column_selection.py │ ├── test_defined_names.py │ ├── test_dtypes.py │ ├── test_durations.py │ ├── test_eagerness.py │ ├── test_empty.py │ ├── test_errors.py │ ├── test_fastexcel.py │ ├── test_pycapsule.py │ ├── test_sheet_visibility.py │ ├── test_shifted_data.py │ ├── test_tables.py │ ├── test_whitespace.py │ └── utils.py ├── scripts/ │ └── update_versions.py ├── src/ │ ├── data/ │ │ ├── cell_extractors.rs │ │ ├── mod.rs │ │ ├── python.rs │ │ └── rust.rs │ ├── error.rs │ ├── lib.rs │ ├── types/ │ │ ├── dtype/ │ │ │ ├── mod.rs │ │ │ └── python.rs │ │ ├── excelreader/ │ │ │ ├── mod.rs │ │ │ └── python.rs │ │ ├── excelsheet/ │ │ │ ├── column_info/ │ │ │ │ ├── mod.rs │ │ │ │ └── python.rs │ │ │ ├── mod.rs │ │ │ ├── polars.rs │ │ │ ├── python.rs │ │ │ └── table.rs │ │ ├── exceltable/ │ │ │ ├── mod.rs │ │ │ └── python.rs │ │ ├── idx_or_name/ │ │ │ ├── mod.rs │ │ │ └── python.rs │ │ └── mod.rs │ └── utils/ │ ├── mod.rs │ └── schema.rs ├── test.py └── tests/ ├── column_selection.rs ├── fastexcel.rs ├── fixtures/ │ ├── dates.ods │ ├── decimal-numbers.xlsx │ ├── div0.xlsx │ ├── empty.ods │ ├── empty.xlsx │ ├── fixture-changing-header-location.xlsx │ ├── fixture-invalid-cell-value-num.xlsx │ ├── 
fixture-invalid-cell-value.xlsx │ ├── fixture-multi-dtypes-columns.xlsx │ ├── fixture-multi-sheet.xlsx │ ├── fixture-sheets-different-visibilities.xlsx │ ├── fixture-single-sheet-duplicated-columns.xlsx │ ├── fixture-single-sheet-with-types.xlsx │ ├── fixture-single-sheet.xlsx │ ├── fixture-type-errors.xlsx │ ├── infer-dtypes-fallback.xlsx │ ├── no-header.xlsx │ ├── null-bytes-in-columns-names.xls │ ├── null-column.xlsx │ ├── sheet-and-table-with-offset.xlsx │ ├── sheet-and-table-with-whitespace.xlsx │ ├── sheet-null-strings-empty.xlsx │ ├── sheet-null-strings.xlsx │ ├── sheet-with-defined-names.xlsx │ ├── sheet-with-na.xlsx │ ├── sheet-with-tables.xlsx │ └── single-sheet-skip-rows-durations.xlsx ├── sheet_visibility.rs ├── shifted_data.rs ├── tables.rs ├── utils/ │ └── mod.rs └── whitespace.rs ================================================ FILE CONTENTS ================================================ ================================================ FILE: .clippy.toml ================================================ disallowed-macros = [ { path = "std::assert_ne", reason = "use `pretty_assertions::assert_ne` instead" }, { path = "std::assert_eq", reason = "use `pretty_assertions::assert_eq` instead" }, { path = "std::assert_matches", reason = "use `pretty_assertions::assert_matches` instead" }, ] ================================================ FILE: .github/dependabot.yml ================================================ version: 2 updates: # python - package-ecosystem: "pip" directory: "/" schedule: interval: "daily" labels: - "dependencies" - ":snake: python :snake:" # rust - package-ecosystem: "cargo" directory: "/" schedule: interval: "daily" groups: prod-deps: dependency-type: "production" dev-deps: dependency-type: "development" labels: - "dependencies" - ":crab: rust :crab:" # actions - package-ecosystem: "github-actions" directory: "/" schedule: interval: "daily" ================================================ FILE: .github/workflows/CI.yml 
================================================ name: CI on: push: branches: - main pull_request: types: [opened, synchronize, reopened] env: MIN_PYTHON_VERSION: "3.10" defaults: run: # Prevents windows runners from running on powershell shell: bash jobs: lint: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Set up Python uses: actions/setup-python@v6 with: python-version: "${{ env.MIN_PYTHON_VERSION }}" - name: Set up rust toolchain uses: dtolnay/rust-toolchain@stable with: components: rustfmt, clippy - name: Set up rustfmt run: rustup component add rustfmt - name: install uv uses: astral-sh/setup-uv@v7 - name: Install dependencies and lint run: | make install make lint check-docs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Set up Python uses: actions/setup-python@v6 with: python-version: "3.11" - name: Set up rust toolchain uses: dtolnay/rust-toolchain@stable - name: install uv uses: astral-sh/setup-uv@v7 - name: Check documentation run: | make install make doc test: runs-on: ${{ matrix.os }} strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13", "3.14", "3.14t"] os: - "ubuntu-latest" - "ubuntu-24.04-arm" - "macos-14" - "windows-latest" # windows-11-arm excluded: pyarrow is not available for Windows ARM64 # https://github.com/apache/arrow/issues/47195 steps: - uses: actions/checkout@v6 - name: Set up Python uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Set up rust toolchain uses: dtolnay/rust-toolchain@stable - name: install uv uses: astral-sh/setup-uv@v7 - name: Install dependencies and test run: | make install make test - name: Test with pandas<3 run: | uv pip install "pandas<3" make test-python check-wheel-build: runs-on: ${{ matrix.os }} strategy: matrix: # Only testing the build on the smallest supported Python version for abi3 wheels python-version: ["3.10", "3.14t"] os: ["ubuntu-latest", "macos-14", "windows-latest"] architecture: [x86-64, aarch64] exclude: # 
Cross-compiling x86_64 → aarch64 on Windows doesn't work; use windows-11-arm instead - os: windows-latest architecture: aarch64 include: # Windows ARM64: build natively on windows-11-arm (3.11 is the minimum available) - os: windows-11-arm python-version: "3.11" architecture: aarch64 # TODO: re-enable once setup-python supports windows-11-arm + python 3.14t # (setup-python is currently broken with that combination) # - os: windows-11-arm # python-version: "3.14t" # architecture: aarch64 steps: - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable - name: Set Rust target id: target run: | TARGET=${{ (matrix.os == 'macos-14' && (matrix.architecture == 'aarch64' && 'aarch64-apple-darwin' || 'x86_64-apple-darwin')) || (matrix.os == 'ubuntu-latest' && (matrix.architecture == 'aarch64' && 'aarch64-unknown-linux-gnu' || 'x86_64-unknown-linux-gnu')) || (matrix.os == 'windows-latest' && 'x86_64-pc-windows-msvc') || (matrix.os == 'windows-11-arm' && 'aarch64-pc-windows-msvc') }} echo "target=$TARGET" >> $GITHUB_OUTPUT - name: Set up Python uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: build (fast) uses: PyO3/maturin-action@v1 with: manylinux: auto command: build args: "-o dist -i python${{ matrix.python-version }}" target: ${{ steps.target.outputs.target }} - name: Upload wheels uses: actions/upload-artifact@v7 with: name: "wheels-${{ matrix.os }}-python-${{ matrix.python-version }}-${{ matrix.architecture }}" path: dist check-wheel-build-musllinux: runs-on: ubuntu-latest strategy: matrix: python-version: ["3.10", "3.14t"] architecture: [x86-64, aarch64] steps: - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable - name: Set up Python uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: build (fast) uses: PyO3/maturin-action@v1 with: manylinux: musllinux_1_2 command: build args: "-o dist -i python${{ matrix.python-version }}" target: ${{ matrix.architecture == 'aarch64' && 
'aarch64-unknown-linux-musl' || 'x86_64-unknown-linux-musl' }} - name: Upload wheels uses: actions/upload-artifact@v7 with: name: "wheels-linux-musl-python-${{ matrix.python-version }}-${{ matrix.architecture }}" path: dist check-sdist-build: runs-on: "ubuntu-latest" steps: - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable - name: build sdist uses: PyO3/maturin-action@v1 with: manylinux: auto command: sdist args: "-o dist" - name: upload sdist uses: actions/upload-artifact@v7 with: name: sdist path: dist ================================================ FILE: .github/workflows/docs.yml ================================================ name: Docs on: push: branches: - main tags: - 'v*' workflow_dispatch: inputs: version_tag: description: 'Tag to build docs for (e.g. v0.18.0). Checks out the tag before building.' required: true mark_as_stable: description: 'Mark this version as the stable default (updates root redirect)' type: boolean default: false jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 with: fetch-depth: 0 - name: Checkout tag (workflow_dispatch) if: github.event_name == 'workflow_dispatch' env: VERSION_TAG: ${{ github.event.inputs.version_tag }} run: git checkout "refs/tags/$VERSION_TAG" - name: Set up Python uses: actions/setup-python@v6 with: python-version: "3.11" - name: Set up rust toolchain uses: dtolnay/rust-toolchain@stable - name: install uv uses: astral-sh/setup-uv@v7 - name: Determine version id: version env: INPUT_VERSION_TAG: ${{ github.event.inputs.version_tag }} INPUT_MARK_AS_STABLE: ${{ github.event.inputs.mark_as_stable }} run: | if [[ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]]; then echo "version=$INPUT_VERSION_TAG" >> "$GITHUB_OUTPUT" echo "is_stable=$INPUT_MARK_AS_STABLE" >> "$GITHUB_OUTPUT" elif [[ "${GITHUB_REF}" == refs/tags/v* ]]; then echo "version=${GITHUB_REF#refs/tags/}" >> "$GITHUB_OUTPUT" echo "is_stable=true" >> "$GITHUB_OUTPUT" else echo "version=latest" >> "$GITHUB_OUTPUT" echo 
"is_stable=false" >> "$GITHUB_OUTPUT" fi - name: Build docs env: VERSION: ${{ steps.version.outputs.version }} run: | make install make doc-versioned - name: Deploy to gh-pages env: VERSION: ${{ steps.version.outputs.version }} IS_STABLE: ${{ steps.version.outputs.is_stable }} run: | git config user.name github-actions git config user.email github-actions@github.com # Stash built docs cp -r "docs/$VERSION" /tmp/docs-build # Switch to gh-pages (gh-pages exists) git checkout gh-pages git merge -m 'Merge main' origin/main # Place versioned docs rm -rf "docs/$VERSION" mv /tmp/docs-build "docs/$VERSION" # Update versions.json and root redirect STABLE_FLAG="" if [[ "$IS_STABLE" == "true" ]]; then STABLE_FLAG="--stable" fi ./scripts/update_versions.py \ --version "$VERSION" \ --docs-dir docs \ $STABLE_FLAG git add -f docs git commit -m "Update docs ($VERSION)" --allow-empty git push origin gh-pages ================================================ FILE: .github/workflows/release.yml ================================================ name: Release on: push: # Sequence of patterns matched against refs/tags tags: - 'v*' # Push events to matching v*, i.e. 
v1.0, v20.15.10 jobs: linux: runs-on: ubuntu-latest strategy: matrix: python-version: ["3.10", "3.14t"] architecture: [x86-64, aarch64] steps: - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: build (release) uses: PyO3/maturin-action@v1 with: manylinux: auto command: build args: "--release -o dist -i python${{ matrix.python-version }}" target: ${{ matrix.architecture == 'aarch64' && 'aarch64-unknown-linux-gnu' || null }} - name: Upload wheels uses: actions/upload-artifact@v7 with: name: "wheels-linux-python-${{ matrix.python-version }}-${{ matrix.architecture }}" path: dist linux-musl: runs-on: ubuntu-latest strategy: matrix: python-version: ["3.10", "3.14t"] architecture: [x86-64, aarch64] steps: - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: build (release) uses: PyO3/maturin-action@v1 with: manylinux: musllinux_1_2 command: build args: "--release -o dist -i python${{ matrix.python-version }}" target: ${{ matrix.architecture == 'aarch64' && 'aarch64-unknown-linux-musl' || 'x86_64-unknown-linux-musl' }} - name: Upload wheels uses: actions/upload-artifact@v7 with: name: "wheels-linux-musl-python-${{ matrix.python-version }}-${{ matrix.architecture }}" path: dist macos: runs-on: macos-14 strategy: matrix: python-version: ["3.10", "3.14t"] architecture: [x86-64, aarch64] steps: - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: build (release) uses: PyO3/maturin-action@v1 with: command: build args: "--release -o dist -i python${{ matrix.python-version }}" target: ${{ matrix.architecture == 'aarch64' && 'aarch64-apple-darwin' || 'x86_64-apple-darwin' }} - name: Upload wheels uses: actions/upload-artifact@v7 with: name: "wheels-macos-python-${{ 
matrix.python-version }}-${{ matrix.architecture }}" path: dist windows: runs-on: ${{ matrix.os }} strategy: matrix: python-version: ["3.10", "3.14t"] os: [windows-latest] architecture: [x86-64] include: # Windows ARM64: build natively on windows-11-arm (3.11 is the minimum available) - os: windows-11-arm python-version: "3.11" architecture: aarch64 # TODO: re-enable once setup-python supports windows-11-arm + python 3.14t # (setup-python is currently broken with that combination) # - os: windows-11-arm # python-version: "3.14t" # architecture: aarch64 steps: - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: build (release) uses: PyO3/maturin-action@v1 with: command: build args: "--release -o dist -i python${{ matrix.python-version }}" target: ${{ matrix.architecture == 'aarch64' && 'aarch64-pc-windows-msvc' || 'x86_64-pc-windows-msvc' }} - name: Upload wheels uses: actions/upload-artifact@v7 with: name: "wheels-windows-python-${{ matrix.python-version }}-${{ matrix.architecture }}" path: dist sdist: runs-on: "ubuntu-latest" steps: - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable - uses: actions/setup-python@v6 with: python-version: "3.10" - name: build (sdist) uses: PyO3/maturin-action@v1 with: manylinux: auto command: sdist args: "-o dist" - name: Upload sdist uses: actions/upload-artifact@v7 with: name: sdist path: dist # NOTE: Cannot use a matrix here, as we only want a single release release: name: Release runs-on: ubuntu-latest needs: [linux, linux-musl, macos, windows, sdist] permissions: id-token: write # Required for OIDC token exchange with crates.io contents: write # Required to be able to create a GitHub release steps: - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable - uses: rust-lang/crates-io-auth-action@v1 id: auth - name: Download Linux wheels uses: actions/download-artifact@v8 with: pattern: "wheels-linux-*" 
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
  # Generic whitespace / end-of-file hygiene hooks
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.2.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
  # Rust compile check. Use HTTPS so the hook source is never fetched over an
  # unencrypted connection, consistent with the other remote hook repository.
  - repo: https://github.com/doublify/pre-commit-rust
    rev: v1.0
    hooks:
      - id: cargo-check
  # Project-local hooks that delegate to the Makefile targets
  - repo: local
    hooks:
      - id: lint
        name: Lint
        entry: make lint
        types_or: [python, rust]
        language: system
        pass_filenames: false
      - id: format
        name: Format
        entry: make format
        types_or: [python, rust]
        language: system
        pass_filenames: false
================================================ [package] name = "fastexcel" version = "0.20.2" description = "A fast excel reader for Rust and Python" rust-version = "1.85.0" edition = "2024" license = "MIT" homepage = "https://github.com/ToucanToco/fastexcel" repository = "https://github.com/ToucanToco/fastexcel.git" readme = "README.md" include = [ "/pyproject.toml", "/README.md", "/LICENSE", "/Makefile", "/src", "/python/fastexcel", "!__pycache__", "!*.pyc", "!*.so", ] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] name = "fastexcel" crate-type = ["cdylib", "rlib"] [dependencies] arrow-array = { version = "^58", features = ["ffi"], optional = true } arrow-pyarrow = { version = "^58", optional = true } arrow-schema = { version = "^58", optional = true } calamine = { version = "^0.35.0", features = ["chrono"] } chrono = { version = "^0.4.40", default-features = false } log = "^0.4" polars-core = { version = ">=0.53", features = [ "dtype-date", "dtype-datetime", "dtype-duration", ], optional = true } pyo3 = { version = "^0.28", features = ["abi3-py310"], optional = true } pyo3-arrow = { version = "^0.17", default-features = false, optional = true } pyo3-log = { version = "^0.13.3", optional = true } [dev-dependencies] anyhow = "1.0.102" pretty_assertions = { version = "^1.4.1", features = ["unstable"] } rstest = { version = "^0.26.1", default-features = false } # NOTE: This is a hack to bypass pyo3 limitations when testing: # https://pyo3.rs/v0.22.3/faq.html#i-cant-run-cargo-test-or-i-cant-build-in-a-cargo-workspace-im-having-linker-issues-like-symbol-not-found-or-undefined-reference-to-_pyexc_systemerror [features] default = [] __arrow = ["dep:arrow-schema", "dep:arrow-array"] python = ["__arrow", "dep:pyo3", "dep:pyo3-log", "dep:pyo3-arrow"] extension-module = ["pyo3/extension-module"] polars = ["dep:polars-core"] pyarrow = ["dep:arrow-pyarrow", "python"] # Private features for internal usage, should not 
be used directly as they may # change without notice __pyo3-tests = [ # feature for tests only. This makes Python::with_gil auto-initialize Python # interpreters, which allows us to instantiate Python objects in tests # (see https://pyo3.rs/v0.22.3/features#auto-initialize) "pyo3/auto-initialize", "pyarrow", ] __rust-tests-standalone = [] __rust-tests-polars = ["polars"] # Private feature for maturin usage, should not be used directly __maturin = ["extension-module", "pyarrow"] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024 ToucanToco Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: Makefile ================================================ .DEFAULT_GOAL := all sources = python/fastexcel python/tests export CARGO_TERM_COLOR=$(shell (test -t 0 && echo always) || echo auto) .PHONY: .uv ## Check that uv is installed .uv: @uv -V || echo 'Please install uv: https://docs.astral.sh/uv/getting-started/installation/' .PHONY: install ## Install the package & dependencies with debug build install: .uv uv sync --frozen --group all uv run maturin develop --uv -E pyarrow,pandas,polars .PHONY: install-prod ## Install the package & dependencies with release build install-prod: .uv uv sync --frozen --group all uv run maturin develop --uv --release -E pyarrow,pandas,polars .PHONY: setup-dev ## First-time setup: install + pre-commit hooks setup-dev: install uv run pre-commit install --install-hooks .PHONY: rebuild-lockfiles ## Rebuild lockfiles from scratch, updating all dependencies rebuild-lockfiles: .uv uv lock --upgrade cargo update .PHONY: build-dev ## Build the development version of the package build-dev: uv run maturin build .PHONY: build-wheel ## Build production wheel and install it build-wheel: @rm -rf target/wheels/ uv run maturin build --release @wheel=$$(ls target/wheels/*.whl); uv pip install --force-reinstall "$$wheel[pandas,polars]" .PHONY: lint-python ## Lint python source files lint-python: uv run ruff check $(sources) uv run ruff format --check $(sources) uv run mypy $(sources) .PHONY: lint-rust ## Lint rust source files lint-rust: cargo fmt --all -- --check # Rust cargo clippy --tests -- -D warnings # Python-related code cargo clippy --features __maturin,__pyo3-tests --tests -- -D warnings # Rust+polars cargo clippy --features polars --tests -- -D warnings .PHONY: lint ## Lint rust and python source files lint: lint-python lint-rust .PHONY: format-python ## Auto-format python source files format-python: uv run ruff check --fix $(sources) uv run ruff format $(sources) .PHONY: format-rust 
## Auto-format rust source files format-rust: cargo fmt --all cargo clippy --all-features --tests --fix --lib -p fastexcel --allow-dirty --allow-staged .PHONY: format ## Auto-format python and rust source files format: format-rust format-python .PHONY: test-python ## Run python tests test-python: install uv run pytest .PHONY: test-rust-pyo3 ## Run PyO3 rust tests test-rust-pyo3: # --lib to skip integration tests cargo test --no-default-features --features __pyo3-tests --lib .PHONY: test-rust-standalone ## Run standalone rust tests test-rust-standalone: cargo test --no-default-features --features __rust-tests-standalone .PHONY: test-rust-polars ## Run polars rust tests test-rust-polars: cargo test --no-default-features --features __rust-tests-polars .PHONY: test-rust ## Run rust tests test-rust: test-rust-pyo3 test-rust-standalone test-rust-polars .PHONY: test ## Run all tests test: test-rust test-python .PHONY: doc-serve ## Serve documentation with live reload doc-serve: build-dev uv run pdoc --template-directory doc-templates python/fastexcel .PHONY: doc ## Build documentation doc: build-dev uv run pdoc --template-directory doc-templates -o docs/latest python/fastexcel uv run scripts/update_versions.py --version latest --docs-dir docs cargo doc --no-deps --lib -p fastexcel --features polars .PHONY: doc-versioned ## Build versioned documentation (CI usage: VERSION=v0.19.0 [STABLE=1] make doc-versioned) doc-versioned: build-dev @test -n "$(VERSION)" || (echo "ERROR: VERSION is not set. 
Usage: VERSION=v0.19.0 [STABLE=1] make doc-versioned" && exit 1) uv run pdoc --template-directory doc-templates -o docs/$(VERSION) python/fastexcel uv run scripts/update_versions.py --version $(VERSION) --docs-dir docs $(if $(filter 1,$(STABLE)),--stable,) .PHONY: all ## Run the standard set of checks performed in CI all: format build-dev lint test .PHONY: benchmarks ## Run benchmarks benchmarks: build-wheel uv run pytest ./python/tests/benchmarks/speed.py .PHONY: clean ## Clear local caches and build artifacts clean: rm -rf `find . -name __pycache__` rm -f `find . -type f -name '*.py[co]' ` rm -f `find . -type f -name '*~' ` rm -f `find . -type f -name '.*~' ` rm -rf .cache rm -rf htmlcov rm -rf .pytest_cache rm -rf *.egg-info rm -f .coverage rm -f .coverage.* rm -rf build rm -rf perf.data* rm -rf python/fastexcel/*.so .PHONY: help ## Display this message help: @grep -E \ '^.PHONY: .*?## .*$$' $(MAKEFILE_LIST) | \ sort | \ awk 'BEGIN {FS = ".PHONY: |## "}; {printf "\033[36m%-19s\033[0m %s\n", $$2, $$3}' ================================================ FILE: README.md ================================================ # `fastexcel` A fast excel file reader for Python and Rust. Docs: * [Python](https://fastexcel.toucantoco.dev/). * [Rust](https://docs.rs/fastexcel). ## Stability The Python library is considered production-ready. The API is mostly stable, and we avoid breaking changes as much as possible. v1.0.0 will be released once the [milestone](https://github.com/ToucanToco/fastexcel/milestone/2) is reached. > ⚠️ The free-threaded build is still considered experimental The Rust crate is still experimental, and breaking changes are to be expected. 
FastExcel supports the [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) for zero-copy data exchange with any Arrow-compatible library (such as Polars), without requiring pyarrow as a dependency.
This runs a full development cycle: formatting, building, linting, and testing.
types * [`python/tests/`](./python/tests) - Comprehensive usage examples ## Benchmarking For benchmarking, use `make benchmarks` which automatically builds an optimised wheel. This is required for profiling, as dev mode builds are much slower. ### Speed benchmarks ```bash make benchmarks ``` ### Memory profiling ```bash mprof run -T 0.01 python python/tests/benchmarks/memory.py python/tests/benchmarks/fixtures/plain_data.xls ``` ## Creating a release 1. Create a PR containing a commit that only updates the version in `Cargo.toml`. 2. Once it is approved, squash and merge it into main. 3. Tag the squashed commit, and push it. 4. The `release` GitHub action will take care of the rest. ## Dev tips * Use `cargo check` to verify that your rust code compiles, no need to go through `maturin` every time * `cargo clippy` = 💖 * Careful with arrow constructors, they tend to allocate a lot * [`mprof`](https://github.com/pythonprofilers/memory_profiler) and `time` go a long way for perf checks, no need to go fancy right from the start ================================================ FILE: doc-templates/module.html.jinja2 ================================================ {% extends "default/module.html.jinja2" %} {% block nav_title %} {{ super() }}
[project.urls]
"Source Code" = "https://github.com/ToucanToco/fastexcel"
# Link to the issue tracker itself rather than the repository landing page
Issues = "https://github.com/ToucanToco/fastexcel/issues"
import importlib.util

# Detect whether the optional `pyarrow` dependency is installed without importing it.
#
# `importlib.util.find_spec` reports a missing top-level package by returning
# `None`; it does not raise `ImportError` in that case. The result therefore has
# to be compared against `None`, otherwise the flag would always end up `True`.
try:
    _PYARROW_AVAILABLE = importlib.util.find_spec("pyarrow") is not None
except (ImportError, ValueError):
    # `ValueError` is raised when `pyarrow` is already in `sys.modules` with
    # `__spec__` set to `None`; `ImportError` is kept for backward compatibility.
    _PYARROW_AVAILABLE = False
def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]:
    """Convert the sheet to a pyarrow `RecordBatch` and report cell errors.

    The second element of the returned tuple holds the positions of values that
    could not be parsed as the specified type and were therefore converted to
    None. It is `None` when the underlying sheet reported no errors.

    Requires the `pyarrow` extra to be installed.
    """
    if _PYARROW_AVAILABLE:
        record_batch, errors = self._sheet.to_arrow_with_errors()
        # Collapse an empty error container to None.
        return (record_batch, errors if errors.errors else None)
    raise ImportError(
        "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'"  # noqa: E501
    )
""" # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__ # which we don't implement. Using pyarrow conversion for now. # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html) return self.to_arrow().to_pandas() def to_polars(self) -> pl.DataFrame: """Converts the sheet to a Polars `DataFrame`. Uses the Arrow PyCapsule Interface for zero-copy data exchange. Requires the `polars` extra to be installed. """ import polars as pl return pl.DataFrame(self) def __arrow_c_schema__(self) -> object: """Export the schema as an `ArrowSchema` `PyCapsule`. https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export The Arrow PyCapsule Interface enables zero-copy data exchange with Arrow-compatible libraries without requiring PyArrow as a dependency. """ return self._sheet.__arrow_c_schema__() def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]: """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. The optional `requested_schema` parameter allows for potential schema conversion. https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export The Arrow PyCapsule Interface enables zero-copy data exchange with Arrow-compatible libraries without requiring PyArrow as a dependency. 
class ExcelTable:
    """A class representing a single table in an Excel file"""

    def __init__(self, table: _ExcelTable) -> None:
        # Every accessor below delegates to this wrapped native table object.
        self._table = table

    @property
    def name(self) -> str:
        """The name of the table"""
        return self._table.name

    @property
    def sheet_name(self) -> str:
        """The name of the sheet this table belongs to"""
        return self._table.sheet_name

    @property
    def width(self) -> int:
        """The table's width"""
        return self._table.width

    @property
    def height(self) -> int:
        """The table's height"""
        return self._table.height

    @property
    def total_height(self) -> int:
        """The table's total height"""
        return self._table.total_height

    @property
    def offset(self) -> int:
        """The table's offset before data starts"""
        return self._table.offset

    @property
    def selected_columns(self) -> list[ColumnInfo]:
        """The table's selected columns"""
        return self._table.selected_columns

    def available_columns(self) -> list[ColumnInfo]:
        """The columns available for the given table"""
        return self._table.available_columns()

    @property
    def specified_dtypes(self) -> DTypeMap | None:
        """The dtypes specified for the table"""
        return self._table.specified_dtypes

    def to_arrow(self) -> pa.RecordBatch:
        """Converts the table to a pyarrow `RecordBatch`

        Requires the `pyarrow` extra to be installed.

        :raises ImportError: if `pyarrow` is not available.
        """
        if not _PYARROW_AVAILABLE:
            raise ImportError(
                "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'"
            )
        return self._table.to_arrow()

    def to_pandas(self) -> pd.DataFrame:
        """Converts the table to a Pandas `DataFrame`.

        Requires the `pandas` extra to be installed.
        """
        # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__
        # which we don't implement. Using pyarrow conversion for now.
        # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html)
        return self.to_arrow().to_pandas()

    def to_polars(self) -> pl.DataFrame:
        """Converts the table to a Polars `DataFrame`.

        Uses the Arrow PyCapsule Interface for zero-copy data exchange.
        Requires the `polars` extra to be installed.
        """
        # Imported lazily so that polars stays an optional dependency.
        import polars as pl

        return pl.DataFrame(self)

    def __arrow_c_schema__(self) -> object:
        """Export the schema as an `ArrowSchema` `PyCapsule`.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export

        The Arrow PyCapsule Interface enables zero-copy data exchange with
        Arrow-compatible libraries without requiring PyArrow as a dependency.
        """
        return self._table.__arrow_c_schema__()

    def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
        """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.

        The optional `requested_schema` parameter allows for potential schema conversion.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export

        The Arrow PyCapsule Interface enables zero-copy data exchange with
        Arrow-compatible libraries without requiring PyArrow as a dependency.
        """
        return self._table.__arrow_c_array__(requested_schema)

    def __repr__(self) -> str:
        # Delegate to the wrapped object, as the sheet and reader wrappers do.
        return self._table.__repr__()
def load_sheet( self, idx_or_name: int | str, *, header_row: int | None = 0, column_names: list[str] | None = None, skip_rows: int | list[int] | Callable[[int], bool] | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfoNoDtype], bool] | None = None, dtypes: DType | DTypeMap | None = None, eager: bool = False, skip_whitespace_tail_rows: bool = False, whitespace_as_null: bool = False, ) -> ExcelSheet | pa.RecordBatch: """Loads a sheet by index or name. :param idx_or_name: The index (starting at 0) or the name of the sheet to load. :param header_row: The index of the row containing the column labels, default index is 0. If `None`, the sheet does not have any column labels. Any rows before the `header_row` will be automatically skipped. :param column_names: Overrides headers found in the document. If `column_names` is used, `header_row` will be ignored. :param n_rows: Specifies how many rows should be loaded. If `None`, all rows are loaded :param skip_rows: Specifies which rows should be skipped after the `header_row`. Any rows before the `header_row` are automatically skipped. It means row indices are relative to data rows, not the sheet! Can be one of: - `int`: Skip this many rows after the header row - `list[int]`: Skip specific row indices (0-based relative to data rows) - `Callable[[int], bool]`: Function that receives row index (0-based relative to data rows) and returns True to skip the row - `None`: If `header_row` is None, skips empty rows at beginning :param schema_sample_rows: Specifies how many rows should be used to determine the dtype of a column. Cannot be 0. A specific dtype can be enforced for some or all columns through the `dtypes` parameter. If `None`, all rows will be used. :param dtype_coercion: Specifies how type coercion should behave. 
`coerce` (the default) will try to coerce different dtypes in a column to the same one, whereas `strict` will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified via `dtypes`. :param use_columns: Specifies the columns to use. Can either be: - `None` to select all columns - A list of strings and ints, the column names and/or indices (starting at 0) - A string, a comma separated list of Excel column letters and column ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges (e.g. `"B:"` to select all columns from B onwards) and from-beginning ranges (e.g. `":C"` to select columns from A to C). These can be combined for "except" patterns (e.g. `":C,E:"` to select everything except column D) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used :param dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names. :param eager: Specifies whether the sheet should be loaded eagerly. `False` (default) will load the sheet lazily using the `PyCapsule` interface, whereas `True` will load it eagerly via `pyarrow`. Eager loading requires the `pyarrow` extra to be installed. :param skip_whitespace_tail_rows: Skip rows at the end of the sheet containing only whitespace and null values. :param whitespace_as_null: Consider cells containing only whitespace as null values. 
""" sheet_or_rb = self._reader.load_sheet( idx_or_name=idx_or_name, header_row=header_row, column_names=column_names, skip_rows=skip_rows, n_rows=n_rows, schema_sample_rows=schema_sample_rows, dtype_coercion=dtype_coercion, use_columns=use_columns, dtypes=dtypes, eager=eager, skip_whitespace_tail_rows=skip_whitespace_tail_rows, whitespace_as_null=whitespace_as_null, ) return sheet_or_rb if eager else ExcelSheet(sheet_or_rb) def table_names(self, sheet_name: str | None = None) -> list[str]: """The list of table names. Will return an empty list if no tables are found. :param sheet_name: If given, will limit the list to the given sheet, will be faster too. """ return self._reader.table_names(sheet_name) def defined_names(self) -> list[DefinedName]: """The list of defined names (named ranges) in the workbook. Returns a list of DefinedName objects with 'name' and 'formula' attributes. The formula is a string representation of the range or expression. Will return an empty list if no defined names are found. """ return self._reader.defined_names() @typing.overload def load_table( self, name: str, *, header_row: int | None = None, column_names: list[str] | None = None, skip_rows: int | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfoNoDtype], bool] | None = None, dtypes: DType | DTypeMap | None = None, eager: Literal[False] = ..., skip_whitespace_tail_rows: bool = False, whitespace_as_null: bool = False, ) -> ExcelTable: ... 
@typing.overload def load_table( self, name: str, *, header_row: int | None = None, column_names: list[str] | None = None, skip_rows: int | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfoNoDtype], bool] | None = None, dtypes: DType | DTypeMap | None = None, eager: Literal[True] = ..., skip_whitespace_tail_rows: bool = False, whitespace_as_null: bool = False, ) -> pa.RecordBatch: ... def load_table( self, name: str, *, header_row: int | None = None, column_names: list[str] | None = None, skip_rows: int | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfoNoDtype], bool] | None = None, dtypes: DType | DTypeMap | None = None, eager: bool = False, skip_whitespace_tail_rows: bool = False, whitespace_as_null: bool = False, ) -> ExcelTable | pa.RecordBatch: """Loads a table by name. :param name: The name of the table to load. :param header_row: The index of the row containing the column labels. If `None`, the table's column names will be used. Any rows before the `header_row` will be automatically skipped. :param column_names: Overrides headers found in the document. If `column_names` is used, `header_row` will be ignored. :param n_rows: Specifies how many rows should be loaded. If `None`, all rows are loaded :param skip_rows: Specifies how many rows should be skipped after the `header_row`. Any rows before the `header_row` are automatically skipped. If `header_row` is `None`, it skips the number of rows from the start of the sheet. :param schema_sample_rows: Specifies how many rows should be used to determine the dtype of a column. Cannot be 0. A specific dtype can be enforced for some or all columns through the `dtypes` parameter. If `None`, all rows will be used. 
:param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default) will try to coerce different dtypes in a column to the same one, whereas `strict` will raise an error in case a column contains several dtypes. Note that this only applies to columns whose dtype is guessed, i.e. not specified via `dtypes`. :param use_columns: Specifies the columns to use. Can either be: - `None` to select all columns - A list of strings and ints, the column names and/or indices (starting at 0) - A string, a comma separated list of Excel column letters and column ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges (e.g. `"B:"` to select all columns from B onwards) and from-beginning ranges (e.g. `":C"` to select columns from A to C). These can be combined for "except" patterns (e.g. `":C,E:"` to select everything except column D) - A callable, a function that takes a column and returns a boolean indicating whether the column should be used :param dtypes: An optional dtype (for all columns) or dict of dtypes with keys as column indices or names. :param eager: Specifies whether the table should be loaded eagerly. `False` (default) will load the table lazily using the `PyCapsule` interface, whereas `True` will load it eagerly via `pyarrow`. Eager loading requires the `pyarrow` extra to be installed. :param skip_whitespace_tail_rows: Skip rows at the end of the table containing only whitespace and null values. :param whitespace_as_null: Consider cells containing only whitespace as null values. 
def load_sheet_eager(
    self,
    idx_or_name: int | str,
    *,
    header_row: int | None = 0,
    column_names: list[str] | None = None,
    skip_rows: int | list[int] | Callable[[int], bool] | None = None,
    n_rows: int | None = None,
    schema_sample_rows: int | None = 1_000,
    dtype_coercion: Literal["coerce", "strict"] = "coerce",
    use_columns: list[str] | list[int] | str | None = None,
    dtypes: DType | DTypeMap | None = None,
    skip_whitespace_tail_rows: bool = False,
    whitespace_as_null: bool = False,
) -> pa.RecordBatch:
    """Loads a sheet eagerly by index or name.

    For xlsx files, this will be faster and more memory-efficient, as it will use
    `worksheet_range_ref` under the hood, which returns borrowed types.

    Refer to `load_sheet` for parameter documentation. `skip_whitespace_tail_rows`
    and `whitespace_as_null` are forwarded unchanged and default to `False`, the
    same defaults `load_sheet` uses.

    Requires the `pyarrow` extra to be installed.

    :return: The value returned by the underlying reader's `load_sheet` called
             with `eager=True`.
    """
    return self._reader.load_sheet(
        idx_or_name=idx_or_name,
        header_row=header_row,
        column_names=column_names,
        skip_rows=skip_rows,
        n_rows=n_rows,
        schema_sample_rows=schema_sample_rows,
        dtype_coercion=dtype_coercion,
        use_columns=use_columns,
        dtypes=dtypes,
        # This entry point always takes the eager path.
        eager=True,
        skip_whitespace_tail_rows=skip_whitespace_tail_rows,
        whitespace_as_null=whitespace_as_null,
    )
:param source: The path to a file or its content as bytes """ if isinstance(source, str | Path): source = expanduser(source) return ExcelReader(_read_excel(source)) __all__ = ( # version "__version__", # main entrypoint "read_excel", # Python types "DType", "DTypeMap", # Excel reader "ExcelReader", # Excel sheet "ExcelSheet", # Excel table "ExcelTable", # Column metadata "DTypeFrom", "ColumnNameFrom", "ColumnInfo", # Defined names "DefinedName", # Parse error information "CellError", "CellErrors", # Exceptions "FastExcelError", "CannotRetrieveCellDataError", "CalamineCellError", "CalamineError", "SheetNotFoundError", "ColumnNotFoundError", "ArrowError", "InvalidParametersError", "UnsupportedColumnTypeCombinationError", ) ================================================ FILE: python/fastexcel/_fastexcel.pyi ================================================ from __future__ import annotations import typing from collections.abc import Callable from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: import pyarrow as pa DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"] DTypeMap = dict[str | int, DType] ColumnNameFrom = Literal["provided", "looked_up", "generated"] DTypeFrom = Literal["provided_for_all", "provided_by_index", "provided_by_name", "guessed"] SheetVisible = Literal["visible", "hidden", "veryhidden"] class ColumnInfoNoDtype: def __init__( self, *, name: str, index: int, absolute_index: int, column_name_from: ColumnNameFrom, ) -> None: ... @property def name(self) -> str: ... @property def index(self) -> int: ... @property def absolute_index(self) -> int: ... @property def column_name_from(self) -> ColumnNameFrom: ... class ColumnInfo: def __init__( self, *, name: str, index: int, absolute_index: int, column_name_from: ColumnNameFrom, dtype: DType, dtype_from: DTypeFrom, ) -> None: ... @property def name(self) -> str: ... @property def index(self) -> int: ... @property def absolute_index(self) -> int: ... 
@property def dtype(self) -> DType: ... @property def column_name_from(self) -> ColumnNameFrom: ... @property def dtype_from(self) -> DTypeFrom: ... class DefinedName: def __init__( self, *, name: str, formula: str, ) -> None: ... @property def name(self) -> str: ... @property def formula(self) -> str: ... class CellError: @property def position(self) -> tuple[int, int]: ... @property def row_offset(self) -> int: ... @property def offset_position(self) -> tuple[int, int]: ... @property def detail(self) -> str: ... def __repr__(self) -> str: ... class CellErrors: @property def errors(self) -> list[CellError]: ... def __repr__(self) -> str: ... class _ExcelSheet: @property def name(self) -> str: """The name of the sheet""" @property def width(self) -> int: """The sheet's width""" @property def height(self) -> int: """The sheet's height""" @property def total_height(self) -> int: """The sheet's total height""" @property def offset(self) -> int: """The sheet's offset before data starts""" @property def selected_columns(self) -> list[ColumnInfo]: """The sheet's selected columns""" def available_columns(self) -> list[ColumnInfo]: """The columns available for the given sheet""" @property def specified_dtypes(self) -> DTypeMap | None: """The dtypes specified for the sheet""" @property def visible(self) -> SheetVisible: """The visibility of the sheet""" def to_arrow(self) -> pa.RecordBatch: """Converts the sheet to a pyarrow `RecordBatch` Requires the `pyarrow` extra to be installed. """ def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors]: """Converts the sheet to a pyarrow `RecordBatch` with error information. Stores the positions of any values that cannot be parsed as the specified type and were therefore converted to None. Requires the `pyarrow` extra to be installed. """ def __arrow_c_schema__(self) -> object: """Export the schema as an `ArrowSchema` `PyCapsule`. 
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export The Arrow PyCapsule Interface enables zero-copy data exchange with Arrow-compatible libraries without requiring PyArrow as a dependency. """ def __arrow_c_array__(self, requested_schema: object = None) -> tuple[object, object]: """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. The optional `requested_schema` parameter allows for potential schema conversion. https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export The Arrow PyCapsule Interface enables zero-copy data exchange with Arrow-compatible libraries without requiring PyArrow as a dependency. """ class _ExcelTable: @property def name(self) -> str: """The name of the table""" @property def sheet_name(self) -> str: """The name of the sheet this table belongs to""" @property def width(self) -> int: """The table's width""" @property def height(self) -> int: """The table's height""" @property def total_height(self) -> int: """The table's total height""" @property def offset(self) -> int: """The table's offset before data starts""" @property def selected_columns(self) -> list[ColumnInfo]: """The table's selected columns""" def available_columns(self) -> list[ColumnInfo]: """The columns available for the given table""" @property def specified_dtypes(self) -> DTypeMap | None: """The dtypes specified for the table""" def to_arrow(self) -> pa.RecordBatch: """Converts the table to a pyarrow `RecordBatch` Requires the `pyarrow` extra to be installed. """ def __arrow_c_schema__(self) -> object: """Export the schema as an `ArrowSchema` `PyCapsule`. https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export The Arrow PyCapsule Interface enables zero-copy data exchange with Arrow-compatible libraries without requiring PyArrow as a dependency. 
""" def __arrow_c_array__(self, requested_schema: object = None) -> tuple[object, object]: """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`. The optional `requested_schema` parameter allows for potential schema conversion. https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export The Arrow PyCapsule Interface enables zero-copy data exchange with Arrow-compatible libraries without requiring PyArrow as a dependency. """ class _ExcelReader: """A class representing an open Excel file and allowing to read its sheets""" @typing.overload def load_sheet( self, idx_or_name: str | int, *, header_row: int | None = 0, column_names: list[str] | None = None, skip_rows: int | list[int] | Callable[[int], bool] | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfoNoDtype], bool] | None = None, dtypes: DType | DTypeMap | None = None, eager: Literal[False] = ..., skip_whitespace_tail_rows: bool = False, whitespace_as_null: bool = False, ) -> _ExcelSheet: ... @typing.overload def load_sheet( self, idx_or_name: str | int, *, header_row: int | None = 0, column_names: list[str] | None = None, skip_rows: int | list[int] | Callable[[int], bool] | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfoNoDtype], bool] | None = None, dtypes: DType | DTypeMap | None = None, eager: Literal[True] = ..., skip_whitespace_tail_rows: bool = False, whitespace_as_null: bool = False, ) -> pa.RecordBatch: ... 
@typing.overload def load_sheet( self, idx_or_name: str | int, *, header_row: int | None = 0, column_names: list[str] | None = None, skip_rows: int | list[int] | Callable[[int], bool] | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfoNoDtype], bool] | None = None, dtypes: DType | DTypeMap | None = None, eager: bool = False, skip_whitespace_tail_rows: bool = False, whitespace_as_null: bool = False, ) -> pa.RecordBatch: ... @typing.overload def load_table( self, name: str, *, header_row: int | None = None, column_names: list[str] | None = None, skip_rows: int | list[int] | Callable[[int], bool] | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfoNoDtype], bool] | None = None, dtypes: DType | DTypeMap | None = None, eager: Literal[False] = ..., skip_whitespace_tail_rows: bool = False, whitespace_as_null: bool = False, ) -> _ExcelTable: ... @typing.overload def load_table( self, name: str, *, header_row: int | None = None, column_names: list[str] | None = None, skip_rows: int | list[int] | Callable[[int], bool] | None = None, n_rows: int | None = None, schema_sample_rows: int | None = 1_000, dtype_coercion: Literal["coerce", "strict"] = "coerce", use_columns: list[str] | list[int] | str | Callable[[ColumnInfoNoDtype], bool] | None = None, dtypes: DType | DTypeMap | None = None, eager: Literal[True] = ..., skip_whitespace_tail_rows: bool = False, whitespace_as_null: bool = False, ) -> pa.RecordBatch: ... @property def sheet_names(self) -> list[str]: ... def table_names(self, sheet_name: str | None = None) -> list[str]: ... def defined_names(self) -> list[DefinedName]: ... 
def read_excel(source: str | bytes) -> _ExcelReader: """Reads an excel file and returns an ExcelReader""" __version__: str # Exceptions class FastExcelError(Exception): ... class UnsupportedColumnTypeCombinationError(FastExcelError): ... class CannotRetrieveCellDataError(FastExcelError): ... class CalamineCellError(FastExcelError): ... class CalamineError(FastExcelError): ... class SheetNotFoundError(FastExcelError): ... class ColumnNotFoundError(FastExcelError): ... class ArrowError(FastExcelError): ... class InvalidParametersError(FastExcelError): ... ================================================ FILE: python/fastexcel/py.typed ================================================ ================================================ FILE: python/tests/__init__.py ================================================ ================================================ FILE: python/tests/benchmarks/README.md ================================================ # Benchmarks These benchmarks were generated using `pytest-benchmark`. > **_NOTE:_** formulas.xlsx was found [here](https://foss.heptapod.net/openpyxl/openpyxl/-/issues/494); plain_data.xls and plain_data.xlsx can be found [here](https://public.opendatasoft.com/explore/dataset/covid-19-pandemic-worldwide-data/export/?disjunctive.zone&disjunctive.category). They were produced with the following command: ```bash make benchmarks ``` The results below were measured on my local machine, so they are indicative rather than 100% accurate.
## Speed ### 'xls': 2 tests |Name (time in ms)|Min|Max|Mean|StdDev|Median|IQR|Outliers|OPS|Rounds|Iterations| |-----------------|---|---|----|------|------|---|-------|---|-------|----------| |test_fastexcel_xls|27.0991 (1.0)|33.7495 (1.0)|29.5819 (1.0)|1.6429 (1.0)|29.3559 (1.0)|2.7158 (1.0)|10;0|33.8044 (1.0)|29|1| |test_xlrd|596.5040 (22.01)|628.7964 (18.63)|612.5730 (20.71)|12.9967 (7.91)|615.1620 (20.96)|20.7911 (7.66)|2;0|1.6325 (0.05)|5|1| ### 'xlsx': 4 tests |Name (time in ms)|Min|Max|Mean|StdDev|Median|IQR|Outliers|OPS|Rounds|Iterations| |-----------------|---|---|----|------|------|---|--------|---|------|----------| |test_fastexcel_xlsx|437.5810 (1.0)|470.7615 (1.0)|457.9611 (1.0)|13.7401 (1.0)|457.7006 (1.0)|21.0743 (1.25)|1;0|2.1836 (1.0)|5|1| |test_fastexcel_with_formulas|3,106.7454 (7.10)|3,150.2050 (6.69)|3,122.5234 (6.82)|16.6031 (1.21)|3,120.9000 (6.82)|16.8614 (1.0)|1;0|0.3203 (0.15)|5|1| |test_pyxl|4,780.2341 (10.92)|4,998.7753 (10.62)|4,899.6885 (10.70)|110.4665 (8.04)|4,948.7550 (10.81)|211.6149 (12.55)|2;0|0.2041 (0.09)|5|1| |test_pyxl_with_formulas|25,312.8494 (57.85)|26,621.4687 (56.55)|25,808.5418 (56.36)|545.0540 (39.67)|25,748.0901 (56.26)|852.3171 (50.55)|1;0|0.0387 (0.02)|5|1| ## Memory usage | fastexcel memory usage | other memory usage | |-|-| |![fastexcel xls](memory_profiles/test_xls_fastexcel.png "fastexcel xls") |![xlrd xls](memory_profiles/test_xls_xlrd.png "xlrd xls")| |![fastexcel xlsx](memory_profiles/test_xlsx_fastexcel.png "fastexcel xlsx") |![pyxl xlsx](memory_profiles/test_xlsx_openpyxl.png "pyxl xlsx")| |![fastexcel formulas xlsx](memory_profiles/test_xlsx_formulas_fastexcel.png "fastexcel formulas xlsx") |![pyxl formulas xlsx](memory_profiles/test_xlsx_formulas_openpyxl.png "pyxl formulas xlsx")| ================================================ FILE: python/tests/benchmarks/fixtures/formulas.xlsx ================================================ [File too large to display: 46.5 MB]
================================================ FILE: python/tests/benchmarks/memory.py ================================================ import argparse from enum import Enum from .readers import fastexcel_read, pyxl_read, xlrd_read class Engine(str, Enum): FASTEXCEL = "fastexcel" XLRD = "xlrd" OPENPYXL = "pyxl" def get_args() -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument("-e", "--engine", default=Engine.FASTEXCEL) parser.add_argument("file") return parser.parse_args() def main(): args = get_args() engine = args.engine if engine == Engine.FASTEXCEL: fastexcel_read(args.file) elif engine == Engine.XLRD: xlrd_read(args.file) elif engine == Engine.OPENPYXL: pyxl_read(args.file) if __name__ == "__main__": main() ================================================ FILE: python/tests/benchmarks/readers.py ================================================ from fastexcel import read_excel from openpyxl import load_workbook from xlrd import open_workbook def pyxl_read(test_file_path: str): wb = load_workbook(test_file_path, read_only=True, keep_links=False, data_only=True) for ws in wb: rows = ws.iter_rows() rows = ws.values for row in rows: for _ in row: pass def xlrd_read(test_file_path: str): wb = open_workbook(test_file_path) for ws in wb.sheets(): for idx in range(ws.nrows): for _ in ws.row_values(idx): pass def fastexcel_read(test_file_path: str): reader = read_excel(test_file_path) for sheet_name in reader.sheet_names: sheet = reader.load_sheet_by_name(sheet_name) sheet.to_arrow() ================================================ FILE: python/tests/benchmarks/speed.py ================================================ """ Compare read performance with fastexcel, xlrd and different openpyxl options """ import pytest from .readers import fastexcel_read, pyxl_read, xlrd_read @pytest.fixture def plain_data_xls(): return "./python/tests/benchmarks/fixtures/plain_data.xls" @pytest.fixture def plain_data_xlsx(): return 
"./python/tests/benchmarks/fixtures/plain_data.xlsx" @pytest.fixture def formula_xlsx(): return "./python/tests/benchmarks/fixtures/formulas.xlsx" @pytest.mark.benchmark(group="xlsx") def test_pyxl(benchmark, plain_data_xlsx): benchmark(pyxl_read, plain_data_xlsx) @pytest.mark.benchmark(group="xls") def test_xlrd(benchmark, plain_data_xls): benchmark(xlrd_read, plain_data_xls) @pytest.mark.benchmark(group="xls") def test_fastexcel_xls(benchmark, plain_data_xls): benchmark(fastexcel_read, plain_data_xls) @pytest.mark.benchmark(group="xlsx") def test_fastexcel_xlsx(benchmark, plain_data_xlsx): benchmark(fastexcel_read, plain_data_xlsx) @pytest.mark.benchmark(group="xlsx") def test_pyxl_with_formulas(benchmark, formula_xlsx): benchmark(pyxl_read, formula_xlsx) @pytest.mark.benchmark(group="xlsx") def test_fastexcel_with_formulas(benchmark, formula_xlsx): benchmark(fastexcel_read, formula_xlsx) ================================================ FILE: python/tests/conftest.py ================================================ from __future__ import annotations from datetime import datetime from typing import Any import pytest @pytest.fixture def expected_data_sheet_null_strings() -> dict[str, list[Any]]: return { "FIRST_LABEL": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], "SECOND_LABEL": ["AA", "BB", "CC", "DD", "EE", "FF", "GG", "HH", "II", "JJ"], "DATES_AND_NULLS": [ None, None, None, datetime(2022, 12, 19, 0, 0), datetime(2022, 8, 26, 0, 0), datetime(2023, 5, 6, 0, 0), datetime(2023, 3, 20, 0, 0), datetime(2022, 8, 29, 0, 0), None, None, ], "TIMESTAMPS_AND_NULLS": [ None, None, datetime(2023, 2, 18, 6, 13, 56, 730000), datetime(2022, 9, 20, 20, 0, 7, 50000), datetime(2022, 9, 24, 17, 4, 31, 236000), None, None, None, datetime(2022, 9, 14, 1, 50, 58, 390000), datetime(2022, 10, 21, 17, 20, 12, 223000), ], "INTS_AND_NULLS": [ 2076.0, 2285.0, 39323.0, None, None, None, 11953.0, None, 30192.0, None, ], "FLOATS_AND_NULLS": [ 141.02023312814603, 778.0655928608671, 
None, 497.60307287584106, 627.446112513911, None, None, None, 488.3509486743364, None, ], } ================================================ FILE: python/tests/test_alias_generation.py ================================================ from __future__ import annotations import fastexcel import pandas as pd import polars as pl import pytest from pandas.testing import assert_frame_equal as pd_assert_frame_equal from polars.testing import assert_frame_equal as pl_assert_frame_equal from .utils import path_for_fixture @pytest.mark.parametrize( "use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"], [0, "col_1", 2]] ) def test_alias_generation_with_use_columns(use_columns: list[str] | list[int] | None) -> None: excel_reader = fastexcel.read_excel( path_for_fixture("fixture-single-sheet-duplicated-columns.xlsx") ) sheet = excel_reader.load_sheet(0, use_columns=use_columns) assert [col.name for col in sheet.available_columns()] == ["col", "col_1", "col_2"] pd_assert_frame_equal( sheet.to_pandas(), pd.DataFrame( { "col": [1.0, 2.0], "col_1": [2019.0, 2020.0], "col_2": pd.Series( [pd.Timestamp("2019-02-01 00:01:02"), pd.Timestamp("2014-01-02 06:01:02")] ).astype("datetime64[ms]"), } ), ) pl_assert_frame_equal( sheet.to_polars(), pl.DataFrame( { "col": [1.0, 2.0], "col_1": [2019.0, 2020.0], "col_2": ["2019-02-01 00:01:02", "2014-01-02 06:01:02"], } ).with_columns(pl.col("col_2").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")), ) ================================================ FILE: python/tests/test_column_selection.py ================================================ # ruff: noqa: E501 from __future__ import annotations import re from typing import Any import fastexcel import numpy as np import pandas as pd import polars as pl import pytest from pandas.testing import assert_frame_equal as pd_assert_frame_equal from polars.testing import assert_frame_equal as pl_assert_frame_equal from .utils import path_for_fixture @pytest.fixture def 
excel_reader_single_sheet() -> fastexcel.ExcelReader: return fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")) @pytest.fixture def expected_column_info() -> list[fastexcel.ColumnInfo]: return [ fastexcel.ColumnInfo( name="Month", index=0, absolute_index=0, column_name_from="looked_up", dtype="float", dtype_from="guessed", ), fastexcel.ColumnInfo( name="Year", index=1, absolute_index=1, column_name_from="looked_up", dtype="float", dtype_from="guessed", ), ] def test_single_sheet_all_columns( excel_reader_single_sheet: fastexcel.ExcelReader, expected_column_info: list[fastexcel.ColumnInfo], ) -> None: sheet = excel_reader_single_sheet.load_sheet(0) sheet_explicit_arg = excel_reader_single_sheet.load_sheet(0, use_columns=None) assert sheet.selected_columns == expected_column_info assert sheet.available_columns() == expected_column_info expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} expected_pd_df = pd.DataFrame(expected) expected_pl_df = pl.DataFrame(expected) pd_df = sheet.to_pandas() pd_assert_frame_equal(pd_df, expected_pd_df) pd_df_explicit_arg = sheet_explicit_arg.to_pandas() pd_assert_frame_equal(pd_df_explicit_arg, expected_pd_df) pl_df = sheet.to_polars() pl_assert_frame_equal(pl_df, expected_pl_df) pl_df_explicit_arg = sheet_explicit_arg.to_polars() pl_assert_frame_equal(pl_df_explicit_arg, expected_pl_df) def test_single_sheet_subset_by_str( excel_reader_single_sheet: fastexcel.ExcelReader, expected_column_info: list[fastexcel.ColumnInfo], ) -> None: expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} # looks like mypy 1.8 became more stupid sheets: list[str | int] = [0, "January"] for sheet_name_or_idx in sheets: for idx, col in enumerate(["Month", "Year"]): sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[col]) assert sheet.selected_columns == [expected_column_info[idx]] assert sheet.available_columns() == expected_column_info pd_df = sheet.to_pandas() pd_assert_frame_equal(pd_df, 
pd.DataFrame({col: expected[col]})) pl_df = sheet.to_polars() pl_assert_frame_equal(pl_df, pl.DataFrame({col: expected[col]})) def test_single_sheet_subset_by_index( excel_reader_single_sheet: fastexcel.ExcelReader, expected_column_info: list[fastexcel.ColumnInfo], ) -> None: expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]} sheets: list[str | int] = [0, "January"] for sheet_name_or_idx in sheets: for idx, col_name in enumerate(["Month", "Year"]): sheet = excel_reader_single_sheet.load_sheet(sheet_name_or_idx, use_columns=[idx]) assert sheet.selected_columns == [expected_column_info[idx]] assert sheet.available_columns() == expected_column_info pd_df = sheet.to_pandas() pd_assert_frame_equal(pd_df, pd.DataFrame({col_name: expected[col_name]})) pl_df = sheet.to_polars() pl_assert_frame_equal(pl_df, pl.DataFrame({col_name: expected[col_name]})) @pytest.fixture def excel_reader_single_sheet_with_unnamed_columns() -> fastexcel.ExcelReader: return fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) @pytest.fixture def single_sheet_with_unnamed_columns_expected() -> dict[str, list[Any]]: return { "col1": [2.0, 3.0], "__UNNAMED__1": [1.5, 2.5], "col3": ["hello", "world"], "__UNNAMED__3": [-5.0, -6.0], "col5": ["a", "b"], } @pytest.fixture def sheet_with_unnamed_columns_expected_column_info() -> list[fastexcel.ColumnInfo]: return [ fastexcel.ColumnInfo( name="col1", index=0, absolute_index=0, column_name_from="looked_up", dtype="float", dtype_from="guessed", ), fastexcel.ColumnInfo( name="__UNNAMED__1", index=1, absolute_index=1, column_name_from="generated", dtype="float", dtype_from="guessed", ), fastexcel.ColumnInfo( name="col3", index=2, absolute_index=2, column_name_from="looked_up", dtype="string", dtype_from="guessed", ), fastexcel.ColumnInfo( name="__UNNAMED__3", index=3, absolute_index=3, column_name_from="generated", dtype="float", dtype_from="guessed", ), fastexcel.ColumnInfo( name="col5", index=4, absolute_index=4, 
column_name_from="looked_up", dtype="string", dtype_from="guessed", ), ] def test_single_sheet_with_unnamed_columns( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo], ) -> None: use_columns_str = ["col1", "col3", "__UNNAMED__3"] use_columns_idx = [0, 2, 3] expected = { k: v for k, v in single_sheet_with_unnamed_columns_expected.items() if k in use_columns_str } sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str ) assert sheet.selected_columns == [ sheet_with_unnamed_columns_expected_column_info[0], sheet_with_unnamed_columns_expected_column_info[2], sheet_with_unnamed_columns_expected_column_info[3], ] assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx ) assert sheet.selected_columns == [ sheet_with_unnamed_columns_expected_column_info[0], sheet_with_unnamed_columns_expected_column_info[2], sheet_with_unnamed_columns_expected_column_info[3], ] assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) def test_single_sheet_with_unnamed_columns_and_pagination( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo], ) -> None: use_columns_str = ["col1", "col3", "__UNNAMED__3"] use_columns_idx = [0, 2, 3] # first row only expected = { k: v[:1] for k, v in 
single_sheet_with_unnamed_columns_expected.items() if k in use_columns_str } sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str, n_rows=1 ) assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx, n_rows=1 ) assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) # second row expected = { k: v[1:] for k, v in single_sheet_with_unnamed_columns_expected.items() if k in use_columns_str } sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str, skip_rows=1 ) assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx, skip_rows=1 ) assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, ) -> None: use_columns_str = ["col0", "col2", "col3"] use_columns_idx = [0, 2, 3] expected: dict[str, list[Any]] = { "col0": [2.0, 3.0], "col1": ["hello", "world"], "col2": [-5.0, -6.0], } column_names = [f"col{i}" for i in range(3)] expected_columns_names = 
["col0", "__UNNAMED__1", "col1", "col2", "__UNNAMED__4"] # skipping the header row only with pytest.raises( fastexcel.InvalidParametersError, match='use_columns can only contain integers when used with columns_names, got "col0"', ): excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str, skip_rows=1, column_names=column_names, ) sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx, skip_rows=1, column_names=column_names ) assert [col.name for col in sheet.available_columns()] == expected_columns_names pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) # skipping the header row + first data row expected_first_row_skipped = {k: v[1:] for k, v in expected.items()} sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_idx, skip_rows=2, column_names=column_names ) assert [col.name for col in sheet.available_columns()] == expected_columns_names pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped)) def test_single_sheet_with_unnamed_columns_and_str_range( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo], ) -> None: use_columns_str = "A,C:E" expected = { k: v for k, v in single_sheet_with_unnamed_columns_expected.items() if k in ["col1", "col3", "__UNNAMED__3", "col5"] } sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str ) assert sheet.selected_columns == ( sheet_with_unnamed_columns_expected_column_info[:1] + sheet_with_unnamed_columns_expected_column_info[2:] ) assert 
sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) def test_single_sheet_with_unnamed_columns_and_open_ended_range( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo], ) -> None: # Test B: (should get columns B, C, D, E - indices 1, 2, 3, 4) use_columns_str = "B:" expected = { k: v for k, v in single_sheet_with_unnamed_columns_expected.items() if k in ["__UNNAMED__1", "col3", "__UNNAMED__3", "col5"] } sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str ) assert sheet.selected_columns == sheet_with_unnamed_columns_expected_column_info[1:] assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) def test_single_sheet_with_unnamed_columns_and_open_ended_range_from_start( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo], ) -> None: # Test A: (should get all columns) use_columns_str = "A:" expected = single_sheet_with_unnamed_columns_expected sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str ) assert sheet.selected_columns == sheet_with_unnamed_columns_expected_column_info assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) def 
test_single_sheet_with_unnamed_columns_and_mixed_open_ended_range( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo], ) -> None: # Test A,C: (should get column A and columns from C onwards - indices 0, 2, 3, 4) use_columns_str = "A,C:" expected = { k: v for k, v in single_sheet_with_unnamed_columns_expected.items() if k in ["col1", "col3", "__UNNAMED__3", "col5"] } sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str ) expected_selected_cols = [ sheet_with_unnamed_columns_expected_column_info[0] ] + sheet_with_unnamed_columns_expected_column_info[2:] assert sheet.selected_columns == expected_selected_cols assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) def test_single_sheet_with_unnamed_columns_and_from_beginning_range( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo], ) -> None: # Test :C (should get columns A, B, C - indices 0, 1, 2) use_columns_str = ":C" expected = { k: v for k, v in single_sheet_with_unnamed_columns_expected.items() if k in ["col1", "__UNNAMED__1", "col3"] } sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str ) assert sheet.selected_columns == sheet_with_unnamed_columns_expected_column_info[:3] assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) def 
test_single_sheet_with_unnamed_columns_and_from_beginning_range_single_column( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo], ) -> None: # Test :A (should get only column A - index 0) use_columns_str = ":A" expected = { k: v for k, v in single_sheet_with_unnamed_columns_expected.items() if k in ["col1"] } sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str ) assert sheet.selected_columns == [sheet_with_unnamed_columns_expected_column_info[0]] assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) def test_single_sheet_with_unnamed_columns_and_complex_mixed_pattern( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, single_sheet_with_unnamed_columns_expected: dict[str, list[Any]], sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo], ) -> None: # Test A,:B,D,E: (should get A, A,B again (deduplicated), D, and E) # This effectively becomes A,B,D,E (columns 0,1,3,4) use_columns_str = "A,:B,D,E:" expected = { k: v for k, v in single_sheet_with_unnamed_columns_expected.items() if k in ["col1", "__UNNAMED__1", "__UNNAMED__3", "col5"] } sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet( "With unnamed columns", use_columns=use_columns_str ) # Expected: columns A, A,B (from :B), D, E (from E:) # After deduplication: 0,1,3,4 expected_selected_cols = [ sheet_with_unnamed_columns_expected_column_info[0], # A sheet_with_unnamed_columns_expected_column_info[1], # B sheet_with_unnamed_columns_expected_column_info[3], # D sheet_with_unnamed_columns_expected_column_info[4], # E ] assert sheet.selected_columns == expected_selected_cols 
assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected)) pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected)) def test_single_sheet_invalid_column_indices_negative_integer( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, ) -> None: expected_message = """invalid parameters: expected list[int] | list[str], got [-2] Context: 0: could not determine selected columns from provided object: [-2] 1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([-2]) """ with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[-2]) def test_single_sheet_invalid_column_indices_empty_list( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, ) -> None: expected_message = """invalid parameters: list of selected columns is empty Context: 0: could not determine selected columns from provided object: [] 1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([]) """ with pytest.raises(fastexcel.InvalidParametersError, match=re.escape(expected_message)): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[]) def test_single_sheet_invalid_column_indices_column_does_not_exist_str( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, ) -> None: expected_message = """column with name \"nope\" not found Context: 0: available columns are: .* """ with pytest.raises(fastexcel.ColumnNotFoundError, match=expected_message): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=["nope"]) def test_single_sheet_invalid_column_indices_column_does_not_exist_int( excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader, ) -> None: expected_message = """column at index 42 not found 
Context: 0: available columns are: .* """ with pytest.raises(fastexcel.ColumnNotFoundError, match=expected_message): excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[42]) def test_use_columns_with_column_names() -> None: excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx")) sheet = excel_reader.load_sheet( 0, use_columns=[1, 2], header_row=None, skip_rows=1, column_names=["bools_renamed", "dates_renamed"], ) assert sheet.available_columns() == [ fastexcel.ColumnInfo( name="__UNNAMED__0", column_name_from="generated", index=0, absolute_index=0, dtype="float", dtype_from="guessed", ), fastexcel.ColumnInfo( name="bools_renamed", index=1, absolute_index=1, dtype="boolean", dtype_from="guessed", column_name_from="provided", ), fastexcel.ColumnInfo( name="dates_renamed", index=2, absolute_index=2, dtype="datetime", dtype_from="guessed", column_name_from="provided", ), fastexcel.ColumnInfo( name="__UNNAMED__3", index=3, absolute_index=3, dtype="float", dtype_from="guessed", column_name_from="generated", ), ] pd_assert_frame_equal( sheet.to_pandas(), pd.DataFrame( { "bools_renamed": [True, False, True], "dates_renamed": pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype( "datetime64[ms]" ), } ), ) pl_assert_frame_equal( sheet.to_polars(), pl.DataFrame( { "bools_renamed": [True, False, True], "dates_renamed": ["2022-03-02 05:43:04"] * 3, } ).with_columns( pl.col("dates_renamed").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms") ), ) def test_use_columns_with_callable() -> None: excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) sheet = excel_reader.load_sheet(2) assert ( [(c.name, c.dtype) for c in sheet.available_columns()] == [(c.name, c.dtype) for c in sheet.selected_columns] == [ ("col1", "float"), ("__UNNAMED__1", "float"), ("col3", "string"), ("__UNNAMED__3", "float"), ("col5", "string"), ] ) sheet = excel_reader.load_sheet( 2, use_columns=lambda col: 
col.name.startswith("col"), ) assert [(c.name, c.dtype) for c in sheet.selected_columns] == [ ("col1", "float"), ("col3", "string"), ("col5", "string"), ] sheet = excel_reader.load_sheet( 2, use_columns=lambda col: col.index % 2 == 1, ) assert [(c.name, c.dtype) for c in sheet.selected_columns] == [ ("__UNNAMED__1", "float"), ("__UNNAMED__3", "float"), ] def test_use_columns_with_bad_callable() -> None: excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) with pytest.raises( fastexcel.InvalidParametersError, match=re.escape("`use_columns` callable could not be called (TypeError: "), ): excel_reader.load_sheet( 2, use_columns=lambda: True, # type: ignore ) with pytest.raises( fastexcel.InvalidParametersError, match="`use_columns` callable should return a boolean" ): excel_reader.load_sheet( 2, use_columns=lambda _: 42, # type: ignore ) def test_use_columns_with_eager_loading() -> None: excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")) expected_months = [1.0, 2.0] expected_years = [2019.0, 2020.0] # default rb = excel_reader.load_sheet_eager(0) assert rb.schema.names == ["Month", "Year"] assert rb["Year"].tolist() == expected_years assert rb["Month"].tolist() == expected_months # changing order rb = excel_reader.load_sheet_eager(0, use_columns=["Year", "Month"]) assert rb.schema.names == ["Year", "Month"] assert rb["Year"].tolist() == expected_years assert rb["Month"].tolist() == expected_months # subset rb = excel_reader.load_sheet_eager(0, use_columns=["Year"]) assert rb.schema.names == ["Year"] assert rb["Year"].tolist() == expected_years assert "Month" not in (field.name for field in rb.schema) @pytest.mark.parametrize("excel_file", ["sheet-null-strings.xlsx", "sheet-null-strings-empty.xlsx"]) def test_use_columns_dtypes_eager_loading( excel_file: str, expected_data_sheet_null_strings: dict[str, list[Any]] ) -> None: expected_pl_df = pl.DataFrame(expected_data_sheet_null_strings).with_columns( 
def test_use_columns_with_table() -> None:
    """Name-based ``use_columns`` on a table selects only the requested columns.

    All four columns of the ``users`` table stay listed as available, while only
    the two requested ones are selected and materialised in the dataframes.
    """
    reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
    users = reader.load_table("users", use_columns=["User Id", "FirstName"])

    def column(name, position, dtype, name_origin):
        # Every expected column of this table has identical relative and absolute indices
        return fastexcel.ColumnInfo(
            name=name,
            index=position,
            absolute_index=position,
            dtype=dtype,
            column_name_from=name_origin,
            dtype_from="guessed",
        )

    selected = [
        column("User Id", 0, "float", "provided"),
        column("FirstName", 1, "string", "provided"),
    ]
    available = [
        *selected,
        column("__UNNAMED__2", 2, "string", "generated"),
        column("__UNNAMED__3", 3, "datetime", "generated"),
    ]

    assert users.available_columns() == available
    assert users.selected_columns == selected

    data = {"User Id": [1.0, 2.0, 5.0], "FirstName": ["Peter", "John", "Hans"]}
    pl_assert_frame_equal(users.to_polars(), pl.DataFrame(data))
    pd_assert_frame_equal(users.to_pandas(), pd.DataFrame(data))
def test_use_column_range_with_offset_without_table() -> None:
    """A closed column range ("H:I") is honoured on a sheet whose header is on row index 9."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
    sheet = reader.load_sheet("without-table", use_columns="H:I", header_row=9)

    # Both dataframe flavours are expected to hold the very same data
    data = {
        "Column at H10": [1.0, 2.0, 3.0],
        "Column at I10": [4.0, 5.0, 6.0],
    }

    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(data))
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(data))
def test_use_column_range_with_offset_with_table_and_specified_dtypes() -> None:
    """Dtypes given by index or by name apply to closed and open-ended column ranges alike."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))

    # "D:E" is the closed range, "D:" the open-ended one; both must produce the same result
    tables = [
        reader.load_table(
            "TableAtD5", use_columns=column_range, dtypes={3: "int", "Column at E5": "string"}
        )
        for column_range in ("D:E", "D:")
    ]

    data = {
        # Dtype should be int, looked up by index
        "Column at D5": [1, 2, 3, 4],
        # Dtype should be string, looked up by name
        "Column at E5": ["4", "5", "6", "8"],
    }
    column_info = [
        fastexcel.ColumnInfo(
            name="Column at D5",
            index=0,
            absolute_index=3,
            dtype="int",
            dtype_from="provided_by_index",
            column_name_from="provided",
        ),
        fastexcel.ColumnInfo(
            name="Column at E5",
            index=1,
            absolute_index=4,
            dtype="string",
            dtype_from="provided_by_name",
            column_name_from="provided",
        ),
    ]

    for table in tables:
        assert table.selected_columns == column_info

    polars_expected = pl.DataFrame(data)
    pandas_expected = pd.DataFrame(data)

    for table in tables:
        pl_assert_frame_equal(table.to_polars(), polars_expected)

    for table in tables:
        pd_assert_frame_equal(table.to_pandas(), pandas_expected)
@pytest.mark.parametrize("path", ("sheet-with-defined-names.xlsx",))
def test_defined_names(path: str) -> None:
    """The reader exposes the workbook's defined names together with their formulas."""
    reader = fastexcel.read_excel(path_for_fixture(path))

    # (name, formula) pairs, in the order the reader is expected to return them
    names_and_formulas = [
        ("AddingValues", "SUM(sheet1!$K$5:$K$6)"),
        ("DefinedRange", "sheet1!$A$5:$D$7"),
        ("NamedConstant", "3.4"),
    ]
    expected = [
        fastexcel.DefinedName(name=name, formula=formula) for name, formula in names_and_formulas
    ]

    assert reader.defined_names() == expected
def test_sheet_with_mixed_dtypes_and_sample_rows(expected_data: dict[str, list[Any]]) -> None:
    """``schema_sample_rows`` restricts dtype guessing to the first rows that are actually read."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))

    # Since we skip rows here, the dtypes should be correctly guessed, even if we only check 5 rows
    skipped_sheet = reader.load_sheet(0, schema_sample_rows=5, skip_rows=5)
    remaining_rows = {name: column[5:] for name, column in expected_data.items()}

    pandas_expected = pd.DataFrame(remaining_rows).astype({"Date": "datetime64[ms]"})
    pd_assert_frame_equal(skipped_sheet.to_pandas(), pandas_expected)

    polars_expected = pl.DataFrame(
        remaining_rows, schema_overrides={"Date": pl.Datetime(time_unit="ms")}
    )
    pl_assert_frame_equal(skipped_sheet.to_polars(), polars_expected)

    # Guess the sheet's dtypes on 5 rows only
    sampled_sheet = reader.load_sheet(0, schema_sample_rows=5)

    # String fields should not have been loaded
    expected_data.update(
        {
            "Employee ID": [
                123456.0,
                44333.0,
                44333.0,
                87878.0,
                87878.0,
                None,
                135967.0,
                None,
                None,
            ],
            "Asset ID": [84444.0] * 7 + [None] * 2,
            "Mixed dates": [datetime(2023, 7, 21)] * 6 + [None] * 3,
            "Mixed bools": [True] * 5 + [False] * 3 + [None],
        }
    )

    pandas_expected = pd.DataFrame(expected_data).astype(
        {
            "Date": "datetime64[ms]",
            "Mixed dates": "datetime64[ms]",
        }
    )
    pd_assert_frame_equal(sampled_sheet.to_pandas(), pandas_expected)

    polars_expected = pl.DataFrame(
        expected_data,
        schema_overrides={
            "Date": pl.Datetime(time_unit="ms"),
            "Mixed dates": pl.Datetime(time_unit="ms"),
        },
    )
    pl_assert_frame_equal(sampled_sheet.to_polars(), polars_expected)
@pytest.mark.parametrize(
    ("dtypes", "expected", "fastexcel_dtype", "expected_pl_dtype"),
    [
        (None, datetime(2023, 7, 21), "datetime", pl.Datetime),
        ({"Date": "datetime"}, datetime(2023, 7, 21), "datetime", pl.Datetime),
        ({"Date": "date"}, date(2023, 7, 21), "date", pl.Date),
        ({"Date": "string"}, "2023-07-21 00:00:00", "string", pl.Utf8),
        ({2: "datetime"}, datetime(2023, 7, 21), "datetime", pl.Datetime),
        ({2: "date"}, date(2023, 7, 21), "date", pl.Date),
        ({2: "string"}, "2023-07-21 00:00:00", "string", pl.Utf8),
    ],
)
def test_sheet_datetime_conversion(
    dtypes: fastexcel.DTypeMap | None,
    expected: Any,
    fastexcel_dtype: str,
    expected_pl_dtype: pl.DataType,
) -> None:
    """The "Date" column converts to the dtype requested by name or by index.

    When no dtype is requested (``dtypes=None``) the column is expected to come
    back as a datetime.
    """
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    sheet = reader.load_sheet(0, dtypes=dtypes)
    assert sheet.specified_dtypes == dtypes

    # All nine rows of the column are expected to hold the same value
    expected_values = [expected] * 9

    pandas_dates = sheet.to_pandas()["Date"]
    assert pandas_dates.dtype == get_expected_pandas_dtype(fastexcel_dtype)
    assert pandas_dates.to_list() == expected_values

    polars_dates = sheet.to_polars()["Date"]
    assert polars_dates.dtype == expected_pl_dtype
    assert polars_dates.to_list() == expected_values
@pytest.mark.parametrize("eager", [True, False])
def test_dtype_coercion_behavior__strict_sampling_eveything(eager: bool) -> None:
    """Strict dtype coercion raises on the mixed-dtype fixture when no sampling limit is set."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))

    def load_strictly() -> None:
        # The lazy variant is converted with ``to_arrow()`` inside the expected-failure scope
        if eager:
            reader.load_sheet_eager(0, dtype_coercion="strict")
            return
        reader.load_sheet(0, dtype_coercion="strict").to_arrow()

    with pytest.raises(
        fastexcel.UnsupportedColumnTypeCombinationError, match="type coercion is strict"
    ):
        load_strictly()
def test_fallback_infer_dtypes(caplog: pytest.LogCaptureFixture) -> None:
    """it should fallback to string if it can't infer the dtype"""
    reader = fastexcel.read_excel(path_for_fixture("infer-dtypes-fallback.xlsx"))
    sheet = reader.load_sheet(0)

    # Ensure a warning message was logged to explain the fallback to string
    fallback_warning = (
        "fastexcel.types.dtype",
        logging.WARNING,
        "Could not determine dtype for column 1, falling back to string",
    )
    assert caplog.record_tuples == [fallback_warning]

    # (column name, guessed dtype), in sheet order
    guessed = [("id", "float"), ("label", "string")]
    expected_columns = [
        fastexcel.ColumnInfo(
            name=name,
            index=position,
            absolute_index=position,
            dtype=dtype,
            dtype_from="guessed",
            column_name_from="looked_up",
        )
        for position, (name, dtype) in enumerate(guessed)
    ]
    assert sheet.available_columns() == expected_columns

    assert sheet.to_polars().dtypes == [pl.Float64, pl.String]
def test_guess_dtypes_with_div0_error() -> None:
    """All columns of ``div0.xlsx`` are guessed as float.

    The first "quotient" value, whose divisor is 0, is expected to load as a
    missing value.
    """
    reader = fastexcel.read_excel(path_for_fixture("div0.xlsx"))
    sheet = reader.load_sheet(0)

    expected_columns = [
        fastexcel.ColumnInfo(
            name=name,
            index=position,
            absolute_index=position,
            dtype="float",
            dtype_from="guessed",
            column_name_from="looked_up",
        )
        for position, name in enumerate(("dividend", "divisor", "quotient"))
    ]
    assert sheet.available_columns() == expected_columns

    data = {
        "dividend": [42.0, 43.0, 44.0, 45.0],
        "divisor": [0.0, 1.0, 2.0, 3.0],
        "quotient": [None, 43.0, 22.0, 15.0],
    }

    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(data))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(data))
def test_sheet_with_offset_header_row_and_durations() -> None:
    """Duration columns load as millisecond durations when the header is on row index 10."""
    reader = fastexcel.read_excel(path_for_fixture("single-sheet-skip-rows-durations.xlsx"))
    sheet = reader.load_sheet(0, header_row=10)

    # Convert to pandas first, then polars
    pandas_frame = sheet.to_pandas()
    polars_frame = sheet.to_polars()

    duration_column = "Tot. Time Away From System"

    pandas_durations = pandas_frame[duration_column]
    assert pandas_durations.dtype == np.dtype("timedelta64[ms]")
    assert pandas_durations.tolist() == [
        pd.Timedelta("01:18:43"),
        pd.Timedelta("07:16:51"),
    ]

    polars_durations = polars_frame[duration_column]
    assert polars_durations.dtype == pl.Duration(time_unit="ms")
    assert polars_durations.to_list() == [
        timedelta(hours=1, minutes=18, seconds=43),
        timedelta(hours=7, minutes=16, seconds=51),
    ]
@pytest.mark.parametrize("path", ("empty.ods", "empty.xlsx"))
def test_empty(path: str) -> None:
    """The first sheet of an empty workbook yields empty pandas and polars dataframes."""
    reader = fastexcel.read_excel(path_for_fixture(path))
    first_sheet = reader.load_sheet_by_idx(0)

    pandas_frame = first_sheet.to_pandas()
    assert pandas_frame.empty

    polars_frame = first_sheet.to_polars()
    assert polars_frame.is_empty()
"Generic calamine error" # Should also work with the base error type with pytest.raises(fastexcel.FastExcelError, match=expected_message): fastexcel.read_excel("path_does_not_exist.nope") def test_sheet_idx_not_found_error() -> None: excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")) expected_message = """sheet at index 42 not found Context: 0: Sheet index 42 is out of range. File has 1 sheets.""" with pytest.raises(fastexcel.SheetNotFoundError, match=expected_message) as exc_info: excel_reader.load_sheet(42) assert exc_info.value.__doc__ == "Sheet was not found" # Should also work with the base error type with pytest.raises(fastexcel.FastExcelError, match=expected_message): excel_reader.load_sheet(42) def test_sheet_name_not_found_error() -> None: excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")) expected_message = """sheet with name "idontexist" not found Context: 0: Sheet "idontexist" not found in file. Available sheets: "January".""" with pytest.raises(fastexcel.SheetNotFoundError, match=expected_message) as exc_info: excel_reader.load_sheet("idontexist") assert exc_info.value.__doc__ == "Sheet was not found" @pytest.mark.parametrize( "exc_class, expected_docstring", [ (fastexcel.FastExcelError, "The base class for all fastexcel errors"), ( fastexcel.UnsupportedColumnTypeCombinationError, "Column contains an unsupported type combination", ), (fastexcel.CannotRetrieveCellDataError, "Data for a given cell cannot be retrieved"), ( fastexcel.CalamineCellError, "calamine returned an error regarding the content of the cell", ), (fastexcel.CalamineError, "Generic calamine error"), (fastexcel.ColumnNotFoundError, "Column was not found"), (fastexcel.SheetNotFoundError, "Sheet was not found"), (fastexcel.ArrowError, "Generic arrow error"), (fastexcel.InvalidParametersError, "Provided parameters are invalid"), ], ) def test_docstrings(exc_class: type[Exception], expected_docstring: str) -> None: assert 
def _assert_single_january_sheet(excel_reader: fastexcel.ExcelReader) -> None:
    """Check the contents of ``fixture-single-sheet.xlsx`` as seen through ``excel_reader``.

    Shared by the path-based and the bytes-based tests so that both input kinds
    are held to exactly the same expectations.
    """
    assert excel_reader.sheet_names == ["January"]
    sheet_by_name = excel_reader.load_sheet("January")
    sheet_by_idx = excel_reader.load_sheet(0)

    # Metadata
    assert sheet_by_name.name == sheet_by_idx.name == "January"
    assert sheet_by_name.height == sheet_by_idx.height == 2
    assert sheet_by_name.width == sheet_by_idx.width == 2

    expected = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    pd_expected = pd.DataFrame(expected)
    pd_assert_frame_equal(sheet_by_name.to_pandas(), pd_expected)
    pd_assert_frame_equal(sheet_by_idx.to_pandas(), pd_expected)

    pl_expected = pl.DataFrame(expected)
    pl_assert_frame_equal(sheet_by_name.to_polars(), pl_expected)
    pl_assert_frame_equal(sheet_by_idx.to_polars(), pl_expected)


def test_single_sheet() -> None:
    """Read the single-sheet fixture from a filesystem path."""
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    _assert_single_january_sheet(excel_reader)


def test_single_sheet_bytes() -> None:
    """Read the single-sheet fixture from its raw bytes instead of a path."""
    with open(path_for_fixture("fixture-single-sheet.xlsx"), "rb") as f:
        excel_reader = fastexcel.read_excel(f.read())
    _assert_single_january_sheet(excel_reader)
def test_sheets_with_header_line_diff_from_zero():
    """Load "Sheet1" with ``header_row=1``, by name and by index, and compare both results."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
    assert reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]

    # Same sheet, addressed once by name and once by position
    loaded_sheets = [reader.load_sheet(ref, header_row=1) for ref in ("Sheet1", 0)]

    # Metadata
    for loaded in loaded_sheets:
        assert loaded.name == "Sheet1"
    for loaded in loaded_sheets:
        assert loaded.height == 2
    for loaded in loaded_sheets:
        assert loaded.width == 2

    data = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    pandas_reference = pd.DataFrame(data)
    for loaded in loaded_sheets:
        pd_assert_frame_equal(loaded.to_pandas(), pandas_reference)

    polars_reference = pl.DataFrame(data)
    for loaded in loaded_sheets:
        pl_assert_frame_equal(loaded.to_polars(), polars_reference)
def test_sheets_with_empty_rows_before_header():
    """Load "Sheet3" with default options, by name and by index, and check its data."""
    workbook = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
    assert workbook.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]
    by_name = workbook.load_sheet("Sheet3")
    by_position = workbook.load_sheet(2)

    # Metadata
    assert [by_name.name, by_position.name] == ["Sheet3", "Sheet3"]
    assert [by_name.height, by_position.height] == [2, 2]
    assert [by_name.width, by_position.width] == [2, 2]

    columns = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    pandas_reference = pd.DataFrame(columns)
    pd_assert_frame_equal(by_name.to_pandas(), pandas_reference)
    pd_assert_frame_equal(by_position.to_pandas(), pandas_reference)

    polars_reference = pl.DataFrame(columns)
    pl_assert_frame_equal(by_name.to_polars(), polars_reference)
    pl_assert_frame_equal(by_position.to_polars(), polars_reference)
def test_sheets_with_skipping_headers():
    """Give a single column name for a three-column sheet and check the resulting names."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
    assert reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]

    named = reader.load_sheet("Sheet2", header_row=None, column_names=["Bugs"])
    indexed = reader.load_sheet(1, header_row=None, column_names=["Bugs"])

    # Metadata
    assert (named.name, indexed.name) == ("Sheet2", "Sheet2")
    assert (named.height, indexed.height) == (2, 2)
    assert (named.width, indexed.width) == (3, 3)

    # Only the first column has a provided name; the other two carry __UNNAMED__ names
    data = {
        "Bugs": [1.0, 2.0],
        "__UNNAMED__1": [3.0, 4.0],
        "__UNNAMED__2": [5.0, 6.0],
    }

    pandas_reference = pd.DataFrame(data)
    pd_assert_frame_equal(named.to_pandas(), pandas_reference)
    pd_assert_frame_equal(indexed.to_pandas(), pandas_reference)

    polars_reference = pl.DataFrame(data)
    pl_assert_frame_equal(named.to_polars(), polars_reference)
    pl_assert_frame_equal(indexed.to_polars(), polars_reference)
"__UNNAMED__0": [1.0], "bools": [False], "dates": ["2022-03-02 05:43:04"], "floats": [42.69], } ).with_columns(pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")), ) def test_sheet_with_skip_rows(): excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx")) assert excel_reader.sheet_names == ["Sheet1"] sheet = excel_reader.load_sheet(0, skip_rows=1) assert sheet.name == "Sheet1" assert sheet.height == 2 assert sheet.width == 4 pd_assert_frame_equal( sheet.to_pandas(), pd.DataFrame( { "__UNNAMED__0": [1.0, 2.0], "bools": [False, True], "dates": pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 2).astype( "datetime64[ms]" ), "floats": [42.69, 1234567], } ), ) pl_assert_frame_equal( sheet.to_polars(), pl.DataFrame( { "__UNNAMED__0": [1.0, 2.0], "bools": [False, True], "dates": ["2022-03-02 05:43:04"] * 2, "floats": [42.69, 1234567], } ).with_columns(pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")), ) def test_sheet_with_n_rows(): excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx")) assert excel_reader.sheet_names == ["Sheet1"] sheet = excel_reader.load_sheet(0, n_rows=1) assert sheet.name == "Sheet1" assert sheet.height == 1 assert sheet.width == 4 pd_assert_frame_equal( sheet.to_pandas(), pd.DataFrame( { "__UNNAMED__0": [0.0], "bools": [True], "dates": pd.Series([pd.Timestamp("2022-03-02 05:43:04")]).astype("datetime64[ms]"), "floats": [12.35], } ), ) pl_assert_frame_equal( sheet.to_polars(), pl.DataFrame( { "__UNNAMED__0": [0.0], "bools": [True], "dates": ["2022-03-02 05:43:04"], "floats": [12.35], } ).with_columns(pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")), ) def test_sheet_with_pagination_and_without_headers(): excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx")) assert excel_reader.sheet_names == ["Sheet1"] sheet = excel_reader.load_sheet( 0, n_rows=1, skip_rows=1, 
header_row=None, column_names=["This", "Is", "Amazing", "Stuff"], ) assert sheet.name == "Sheet1" assert sheet.height == 1 assert sheet.width == 4 pd_assert_frame_equal( sheet.to_pandas(), pd.DataFrame( { "This": [0.0], "Is": [True], "Amazing": pd.Series([pd.Timestamp("2022-03-02 05:43:04")]).astype( "datetime64[ms]" ), "Stuff": [12.35], } ), ) pl_assert_frame_equal( sheet.to_polars(), pl.DataFrame( { "This": [0.0], "Is": [True], "Amazing": ["2022-03-02 05:43:04"], "Stuff": [12.35], } ).with_columns( pl.col("Amazing").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms") ), ) def test_sheet_with_pagination_out_of_bound(): excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx")) assert excel_reader.sheet_names == ["Sheet1"] with pytest.raises( fastexcel.InvalidParametersError, match="Too many rows skipped. Max height is 4" ): excel_reader.load_sheet( 0, skip_rows=1000000, header_row=None, column_names=["This", "Is", "Amazing", "Stuff"], ) sheet = excel_reader.load_sheet( 0, n_rows=1000000, skip_rows=1, header_row=None, column_names=["This", "Is", "Amazing", "Stuff"], ) assert sheet.name == "Sheet1" assert sheet.height == 3 assert sheet.width == 4 pd_assert_frame_equal( sheet.to_pandas(), pd.DataFrame( { "This": [0.0, 1.0, 2.0], "Is": [True, False, True], "Amazing": pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype( "datetime64[ms]" ), "Stuff": [12.35, 42.69, 1234567], } ), ) pl_assert_frame_equal( sheet.to_polars(), pl.DataFrame( { "This": [0.0, 1.0, 2.0], "Is": [True, False, True], "Amazing": ["2022-03-02 05:43:04"] * 3, "Stuff": [12.35, 42.69, 1234567], } ).with_columns( pl.col("Amazing").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms") ), ) def test_sheet_with_na(): """Test reading a sheet with #N/A cells. 
def test_sheet_with_ref():
    """Test reading a sheet with #REF! cells. For now, we consider them as null"""
    broken_refs = fastexcel.read_excel(path_for_fixture("sheet-with-na.xlsx")).load_sheet(
        "Broken refs"
    )

    assert broken_refs.name == "Broken refs"
    assert broken_refs.height == broken_refs.total_height == 2
    assert broken_refs.width == 1

    data = {"numbers": [1.0, None]}
    pd_assert_frame_equal(broken_refs.to_pandas(), pd.DataFrame(data))
    pl_assert_frame_equal(broken_refs.to_polars(), pl.DataFrame(data))


@pytest.mark.parametrize("excel_file", ["sheet-null-strings.xlsx", "sheet-null-strings-empty.xlsx"])
def test_null_strings(excel_file: str, expected_data_sheet_null_strings: dict[str, list[Any]]):
    """Both null-string fixtures must load into the frame described by the shared fixture data."""
    first_sheet = fastexcel.read_excel(path_for_fixture(excel_file)).load_sheet(0)

    assert first_sheet.height == first_sheet.total_height == 10
    assert first_sheet.width == 6

    # The expected temporal columns are cast to millisecond resolution before comparing
    temporal_columns = ("DATES_AND_NULLS", "TIMESTAMPS_AND_NULLS")

    pandas_reference = pd.DataFrame(expected_data_sheet_null_strings)
    for column in temporal_columns:
        pandas_reference[column] = pandas_reference[column].dt.as_unit("ms")
    pd_assert_frame_equal(first_sheet.to_pandas(), pandas_reference)

    polars_reference = pl.DataFrame(expected_data_sheet_null_strings).with_columns(
        *(pl.col(column).dt.cast_time_unit("ms") for column in temporal_columns)
    )
    pl_assert_frame_equal(first_sheet.to_polars(), polars_reference)


def test_null_values_in_cells() -> None:
    """The first two "Date" cells of the invalid-cell-value fixture must come back as nulls."""
    first_sheet = fastexcel.read_excel(
        path_for_fixture("fixture-invalid-cell-value.xlsx")
    ).load_sheet(0)

    data = {
        "Title": ["A", "B", "C", "D"],
        "Date": [None, None, datetime(2021, 1, 1), datetime(2021, 5, 5)],
    }

    polars_reference = pl.DataFrame(data).with_columns(pl.col("Date").dt.cast_time_unit("ms"))
    pl_assert_frame_equal(first_sheet.to_polars(), polars_reference)

    pandas_reference = pd.DataFrame(data)
    pandas_reference["Date"] = pandas_reference["Date"].dt.as_unit("ms")
    pd_assert_frame_equal(first_sheet.to_pandas(), pandas_reference)


def test_invalid_value_num() -> None:
    """The second cell of "Column" in the invalid-num fixture must come back as null."""
    first_sheet = fastexcel.read_excel(
        path_for_fixture("fixture-invalid-cell-value-num.xlsx")
    ).load_sheet(0)

    data = {"Column": [8.0, None]}
    pd_assert_frame_equal(first_sheet.to_pandas(), pd.DataFrame(data))
    pl_assert_frame_equal(first_sheet.to_polars(), pl.DataFrame(data))


def test_null_column_is_nullable() -> None:
    """The arrow field of the "nullonly" column must be flagged as nullable."""
    first_sheet = fastexcel.read_excel(path_for_fixture("null-column.xlsx")).load_sheet(0)
    nullonly_field = first_sheet.to_arrow().schema.field("nullonly")
    assert nullonly_field.nullable is True


def test_sheet_with_decimal_numbers() -> None:
    """Decimals load as floats by default and as their text form when cast to string."""
    fixture_path = path_for_fixture("decimal-numbers.xlsx")

    as_floats = fastexcel.read_excel(fixture_path).load_sheet(0)
    pl_assert_frame_equal(
        as_floats.to_polars(),
        pl.DataFrame({"Decimals": [28.14, 29.02]}),
    )

    # A fresh reader is opened for the second load, with column 0 forced to string
    as_strings = fastexcel.read_excel(fixture_path).load_sheet(0, dtypes={0: "string"})
    pl_assert_frame_equal(
        as_strings.to_polars(),
        pl.DataFrame({"Decimals": ["28.14", "29.02"]}),
    )
"__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0], }, ), ( None, 3, {"__UNNAMED__0": ["b", "c", "d", "e", "f"], "__UNNAMED__1": [1.0, 2.0, 3.0, 4.0, 5.0]}, ), ( 1, 0, { "__UNNAMED__0": ["a", "b", "c", "d", "e", "f"], "__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0], }, ), (2, 0, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}), (2, None, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}), (2, 1, {"a": ["c", "d", "e", "f"], "0": [2.0, 3.0, 4.0, 5.0]}), (2, [1, 3], {"a": ["b", "d", "f"], "0": [1.0, 3.0, 5.0]}), (2, [0], {"a": ["c", "d", "e", "f"], "0": [2.0, 3.0, 4.0, 5.0]}), ( None, [2, 4], { "__UNNAMED__0": [None, None, "b", "d", "e", "f"], "__UNNAMED__1": [None, None, 1.0, 3.0, 4.0, 5.0], }, ), (2, [], {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}), (2, [0, 1, 2, 3], {"a": ["f"], "0": [5.0]}), (2, lambda x: x % 2 == 0, {"a": ["c", "e"], "0": [2.0, 4.0]}), (2, lambda x: x in [0, 4], {"a": ["c", "d", "e"], "0": [2.0, 3.0, 4.0]}), (2, lambda x: False, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}), (2, lambda x: x != 2, {"a": ["d"], "0": [3.0]}), ], ) def test_header_row_and_skip_rows( header_row: int | None, skip_rows: int, expected: dict[str, Any] ) -> None: pl_assert_frame_equal( fastexcel.read_excel(path_for_fixture("no-header.xlsx")) .load_sheet(0, header_row=header_row, skip_rows=skip_rows) .to_polars(), pl.DataFrame(expected), ) def test_null_bytes_in_column_names() -> None: """https://github.com/ToucanToco/fastexcel/issues/343""" reader = fastexcel.read_excel(path_for_fixture("null-bytes-in-columns-names.xls")) df = reader.load_sheet(0).to_polars() assert df.shape == (8_763, 11) ================================================ FILE: python/tests/test_pycapsule.py ================================================ """Tests for the Arrow PyCapsule Interface implementation.""" import fastexcel import pandas as pd import polars as pl from .utils import path_for_fixture def test_sheet_arrow_c_schema(): 
"""Test that __arrow_c_schema__ returns a valid PyCapsule.""" excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")) sheet = excel_reader.load_sheet("January") schema_capsule = sheet.__arrow_c_schema__() # Check it's a PyCapsule with the correct name assert hasattr(schema_capsule, "__class__") assert "PyCapsule" in str(type(schema_capsule)) def test_sheet_arrow_c_array(): """Test that __arrow_c_array__ returns a tuple of PyCapsules.""" excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")) sheet = excel_reader.load_sheet("January") schema_capsule, array_capsule = sheet.__arrow_c_array__() # Check both are PyCapsules assert "PyCapsule" in str(type(schema_capsule)) assert "PyCapsule" in str(type(array_capsule)) def test_table_arrow_c_schema(): """Test that table __arrow_c_schema__ returns a valid PyCapsule.""" excel_reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx")) table_names = excel_reader.table_names() table = excel_reader.load_table(table_names[0]) # Should be 'users' schema_capsule = table.__arrow_c_schema__() # Check it's a PyCapsule assert "PyCapsule" in str(type(schema_capsule)) def test_table_arrow_c_array(): """Test that table __arrow_c_array__ returns a tuple of PyCapsules.""" excel_reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx")) table_names = excel_reader.table_names() table = excel_reader.load_table(table_names[0]) # Should be 'users' schema_capsule, array_capsule = table.__arrow_c_array__() # Check both are PyCapsules assert "PyCapsule" in str(type(schema_capsule)) assert "PyCapsule" in str(type(array_capsule)) def test_pycapsule_interface_with_requested_schema(): """Test PyCapsule interface methods with requested_schema parameter.""" excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")) sheet = excel_reader.load_sheet("January") # Test with None (current implementation ignores this) schema_capsule, array_capsule = 
def test_pycapsule_interface_with_requested_schema():
    """Test PyCapsule interface methods with requested_schema parameter."""
    january = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")).load_sheet(
        "January"
    )

    # Test with None (current implementation ignores this)
    capsules = january.__arrow_c_array__(None)
    schema_capsule, array_capsule = capsules
    assert "PyCapsule" in str(type(schema_capsule))
    assert "PyCapsule" in str(type(array_capsule))


def test_integration_with_polars():
    """Test that polars can consume our PyCapsule interface."""
    january = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")).load_sheet(
        "January"
    )

    # Polars should be able to create a DataFrame from our PyCapsule interface
    # This tests the actual interoperability
    frame = pl.DataFrame(january)
    assert len(frame) == 2
    assert frame.columns == ["Month", "Year"]


def test_to_polars_without_pyarrow():
    """Test that to_polars() works via PyCapsule interface without pyarrow."""
    january = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")).load_sheet(
        "January"
    )

    # This should work via PyCapsule interface, not requiring pyarrow
    sheet_frame = january.to_polars()
    assert isinstance(sheet_frame, pl.DataFrame)
    assert len(sheet_frame) == 2
    assert sheet_frame.columns == ["Month", "Year"]

    # Test with table as well
    tables_reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
    first_table_name = tables_reader.table_names()[0]
    table_frame = tables_reader.load_table(first_table_name).to_polars()
    assert isinstance(table_frame, pl.DataFrame)


def test_to_pandas_still_requires_pyarrow():
    """Test that to_pandas() currently still requires pyarrow.

    Note: pandas PyCapsule interface would require implementing __dataframe__
    or __arrow_c_stream__, which we don't currently do.
    """
    january = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")).load_sheet(
        "January"
    )

    # This still requires pyarrow for now
    sheet_frame = january.to_pandas()
    assert isinstance(sheet_frame, pd.DataFrame)
    assert len(sheet_frame) == 2
    assert list(sheet_frame.columns) == ["Month", "Year"]

    # Test with table as well
    tables_reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
    first_table_name = tables_reader.table_names()[0]
    table_frame = tables_reader.load_table(first_table_name).to_pandas()
    assert isinstance(table_frame, pd.DataFrame)
def test_table_with_offset():
    """Columns of "TableAtD5" keep table-relative indexes and sheet-absolute indexes."""
    workbook = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
    offset_table = workbook.load_table("TableAtD5")

    # (column name, absolute index in the sheet); the relative index is the list position
    name_and_absolute_index = [("Column at D5", 3), ("Column at E5", 4)]
    expected_columns = [
        fastexcel.ColumnInfo(
            name=column_name,
            index=relative_index,
            absolute_index=absolute_index,
            dtype="float",
            dtype_from="guessed",
            column_name_from="provided",
        )
        for relative_index, (column_name, absolute_index) in enumerate(name_and_absolute_index)
    ]

    assert offset_table.available_columns() == expected_columns
fastexcel.ColumnInfo( name="FirstName", index=1, absolute_index=1, dtype="string", dtype_from="guessed", column_name_from="provided", ), fastexcel.ColumnInfo( name="LastName", index=2, absolute_index=2, dtype="string", dtype_from="guessed", column_name_from="provided", ), fastexcel.ColumnInfo( name="Date", index=3, absolute_index=3, dtype="datetime", dtype_from="guessed", column_name_from="provided", ), ] assert users_tbl.total_height == 3 assert users_tbl.offset == 0 assert users_tbl.height == 3 assert users_tbl.width == 4 expected_pl = pl.DataFrame( { "User Id": [1.0, 2.0, 5.0], "FirstName": ["Peter", "John", "Hans"], "LastName": ["Müller", "Meier", "Fricker"], "Date": [datetime(2020, 1, 1), datetime(2024, 5, 4), datetime(2025, 2, 1)], } ).with_columns(pl.col("Date").dt.cast_time_unit("ms")) pl_assert_frame_equal(users_tbl.to_polars(), expected_pl) expected_pd = pd.DataFrame( { "User Id": [1.0, 2.0, 5.0], "FirstName": ["Peter", "John", "Hans"], "LastName": ["Müller", "Meier", "Fricker"], "Date": pd.Series( [datetime(2020, 1, 1), datetime(2024, 5, 4), datetime(2025, 2, 1)] ).astype("datetime64[ms]"), } ) pd_assert_frame_equal(users_tbl.to_pandas(), expected_pd) table_eager = excel_reader.load_table("users", eager=True) pl_df = pl.from_arrow(table_eager) assert isinstance(pl_df, pl.DataFrame) pl_assert_frame_equal(pl_df, expected_pl) pd_assert_frame_equal(table_eager.to_pandas(), expected_pd) ================================================ FILE: python/tests/test_whitespace.py ================================================ import datetime import fastexcel import polars as pl from pandas.testing import assert_frame_equal as pd_assert_frame_equal from polars.testing import assert_frame_equal as pl_assert_frame_equal from .utils import path_for_fixture def test_skip_tail_whitespace_rows() -> None: """Test that skip_whitespace_tail_rows option works correctly.""" excel_reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-whitespace.xlsx")) # Expected 
data when NOT skipping whitespace tail rows expected_with_whitespace = pl.DataFrame( { "Column One": ["1", "2", "3", None, "5", None, None, None, None, " "], "Column Two": ["one", "two", None, "four", "five", None, None, "", None, None], "Column Three": [ datetime.datetime(2025, 11, 19, 14, 34, 2), datetime.datetime(2025, 11, 20, 14, 56, 34), datetime.datetime(2025, 11, 21, 15, 19, 6), None, datetime.datetime(2025, 11, 22, 15, 41, 38), datetime.datetime(2025, 11, 23, 16, 4, 10), None, None, None, None, ], } ).with_columns(pl.col("Column Three").dt.cast_time_unit("ms")) # Expected data when skipping whitespace tail rows expected_without_whitespace = pl.DataFrame( { "Column One": [1.0, 2.0, 3.0, None, 5.0, None], "Column Two": ["one", "two", None, "four", "five", None], "Column Three": [ datetime.datetime(2025, 11, 19, 14, 34, 2), datetime.datetime(2025, 11, 20, 14, 56, 34), datetime.datetime(2025, 11, 21, 15, 19, 6), None, datetime.datetime(2025, 11, 22, 15, 41, 38), datetime.datetime(2025, 11, 23, 16, 4, 10), ], } ).with_columns(pl.col("Column Three").dt.cast_time_unit("ms")) # Test sheet without skipping whitespace tail rows sheet_with_whitespace = excel_reader.load_sheet("Without Table") pl_assert_frame_equal(sheet_with_whitespace.to_polars(), expected_with_whitespace) # Test table without skipping whitespace tail rows table_with_whitespace = excel_reader.load_table("Table_with_whitespace") pl_assert_frame_equal(table_with_whitespace.to_polars(), expected_with_whitespace) # Test sheet with skipping whitespace tail rows sheet_without_whitespace = excel_reader.load_sheet( "Without Table", skip_whitespace_tail_rows=True ) pl_assert_frame_equal(sheet_without_whitespace.to_polars(), expected_without_whitespace) # Test table with skipping whitespace tail rows table_without_whitespace = excel_reader.load_table( "Table_with_whitespace", skip_whitespace_tail_rows=True ) pl_assert_frame_equal(table_without_whitespace.to_polars(), expected_without_whitespace) # Also verify 
pandas compatibility pd_assert_frame_equal( sheet_without_whitespace.to_pandas(), expected_without_whitespace.to_pandas() ) pd_assert_frame_equal( table_without_whitespace.to_pandas(), expected_without_whitespace.to_pandas() ) def test_skip_tail_rows_and_whitespace_as_null_behavior() -> None: excel_reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-whitespace.xlsx")) # Expected data when converting whitespace to null but not skipping tail rows expected_with_whitespace_as_null = pl.DataFrame( { # All rows should be taken into account but the space in the last row should be # considered null "Column One": [1.0, 2.0, 3.0, None, 5.0, None, None, None, None, None], # All rows should be taken into account but the empty string in 8th row should be # considered null "Column Two": ["one", "two", None, "four", "five", None, None, None, None, None], "Column Three": [ datetime.datetime(2025, 11, 19, 14, 34, 2), datetime.datetime(2025, 11, 20, 14, 56, 34), datetime.datetime(2025, 11, 21, 15, 19, 6), None, datetime.datetime(2025, 11, 22, 15, 41, 38), datetime.datetime(2025, 11, 23, 16, 4, 10), None, None, None, None, ], } ).with_columns(pl.col("Column Three").dt.cast_time_unit("ms")) # Expected data when converting whitespace to null and skipping tail rows expected_without_whitespace = pl.DataFrame( { "Column One": [1.0, 2.0, 3.0, None, 5.0, None], "Column Two": ["one", "two", None, "four", "five", None], "Column Three": [ datetime.datetime(2025, 11, 19, 14, 34, 2), datetime.datetime(2025, 11, 20, 14, 56, 34), datetime.datetime(2025, 11, 21, 15, 19, 6), None, datetime.datetime(2025, 11, 22, 15, 41, 38), datetime.datetime(2025, 11, 23, 16, 4, 10), ], } ).with_columns(pl.col("Column Three").dt.cast_time_unit("ms")) # Test sheet with whitespace_as_null but not skipping tail rows sheet_with_whitespace_as_null = excel_reader.load_sheet( "Without Table", whitespace_as_null=True ) pl_assert_frame_equal( sheet_with_whitespace_as_null.to_polars(), 
expected_with_whitespace_as_null ) # Test table with whitespace_as_null but not skipping tail rows table_with_whitespace_as_null = excel_reader.load_table( "Table_with_whitespace", whitespace_as_null=True ) pl_assert_frame_equal( table_with_whitespace_as_null.to_polars(), expected_with_whitespace_as_null ) # Test sheet with both whitespace_as_null and skip_whitespace_tail_rows sheet_without_whitespace = excel_reader.load_sheet( "Without Table", whitespace_as_null=True, skip_whitespace_tail_rows=True ) pl_assert_frame_equal(sheet_without_whitespace.to_polars(), expected_without_whitespace) # Test table with both whitespace_as_null and skip_whitespace_tail_rows table_without_whitespace = excel_reader.load_table( "Table_with_whitespace", whitespace_as_null=True, skip_whitespace_tail_rows=True ) pl_assert_frame_equal(table_without_whitespace.to_polars(), expected_without_whitespace) # Also verify pandas compatibility pd_assert_frame_equal( sheet_without_whitespace.to_pandas(), expected_without_whitespace.to_pandas() ) pd_assert_frame_equal( sheet_with_whitespace_as_null.to_pandas(), expected_with_whitespace_as_null.to_pandas() ) pd_assert_frame_equal( table_without_whitespace.to_pandas(), expected_without_whitespace.to_pandas() ) pd_assert_frame_equal( table_with_whitespace_as_null.to_pandas(), expected_with_whitespace_as_null.to_pandas() ) ================================================ FILE: python/tests/utils.py ================================================ from __future__ import annotations from pathlib import Path from typing import Any import numpy as np import pandas as pd def path_for_fixture(fixture_file: str) -> str: return str(Path(__file__).parent.parent.parent / "tests" / "fixtures" / fixture_file) def get_expected_pandas_dtype(fastexcel_dtype: str) -> Any: """Get the expected pandas dtype for a given fastexcel dtype, accounting for pandas version. In pandas < 3.0, string columns use object dtype. 
In pandas >= 3.0, string columns use StringDtype (with na_value=nan when from Arrow). """ pd_version = tuple(int(x) for x in pd.__version__.split(".")[:2]) dtype_map = { "int": np.dtype("int64"), "float": np.dtype("float64"), "boolean": np.dtype("bool"), "datetime": np.dtype("datetime64[ms]"), "duration": np.dtype("timedelta64[ms]"), } if fastexcel_dtype in dtype_map: return dtype_map[fastexcel_dtype] if fastexcel_dtype == "string": if pd_version >= (3, 0): # When converting from Arrow, pandas uses nan as na_value return pd.StringDtype(na_value=np.nan) else: return np.dtype("object") if fastexcel_dtype == "date": # Date columns are always object dtype return np.dtype("object") raise ValueError(f"Unknown fastexcel dtype: {fastexcel_dtype}") def assert_pandas_dtypes(df: pd.DataFrame, expected_dtypes: dict[str, str]) -> None: """Assert that a pandas DataFrame has the expected dtypes for each column. Args: df: The pandas DataFrame to check expected_dtypes: A dict mapping column names to fastexcel dtype strings """ for col_name, fastexcel_dtype in expected_dtypes.items(): expected_dtype = get_expected_pandas_dtype(fastexcel_dtype) actual_dtype = df[col_name].dtype assert actual_dtype == expected_dtype, ( f"Column '{col_name}': expected dtype {expected_dtype}, got {actual_dtype}" ) ================================================ FILE: scripts/update_versions.py ================================================ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.9" # dependencies = [] # /// """Manage docs/versions.json and generate the root docs/index.html redirect.""" from __future__ import annotations import argparse import json import re from pathlib import Path def parse_semver(version: str) -> tuple[int, ...]: """Extract numeric parts from a version string like 'v0.19.0'.""" return tuple(int(x) for x in re.findall(r"\d+", version)) def sort_versions(versions: list[dict]) -> list[dict]: """Sort: stable first, then tags descending by semver, 
'latest' last.""" def sort_key(v: dict) -> tuple[int, tuple[int, ...], str]: path = v["path"] if v.get("stable"): return (0, (), "") if path == "latest": return (2, (), "") return (1, tuple(-x for x in parse_semver(path)), path) return sorted(versions, key=sort_key) def update_versions(docs_dir: Path, version: str, *, stable: bool) -> None: if not re.fullmatch(r"latest|v\d+(\.\d+)*", version): raise ValueError(f"Invalid version '{version}': must be 'latest' or match 'v' (e.g. v0.19.0)") versions_file = docs_dir / "versions.json" if versions_file.exists(): versions = json.loads(versions_file.read_text()) else: versions = [] # Build label if version == "latest": label = "latest (main)" elif stable: label = f"{version} (stable)" else: label = version # Remove old entry for this version, and clear stable flag from others if # this one is now stable new_versions = [] for v in versions: if v["path"] == version: continue if stable and v.get("stable"): v = {**v, "stable": False, "label": v["path"]} new_versions.append(v) new_versions.append({"label": label, "path": version, "stable": stable}) new_versions = sort_versions(new_versions) versions_file.write_text(json.dumps(new_versions, indent=2) + "\n") # Generate root redirect stable_entry = next((v for v in new_versions if v.get("stable")), None) redirect_path = stable_entry["path"] if stable_entry else version index_html = docs_dir / "index.html" index_html.write_text( f"""\

Redirecting to {redirect_path} documentation...

""" ) def main() -> None: parser = argparse.ArgumentParser(description="Update docs versions.json") parser.add_argument("--version", required=True, help="Version name (e.g. v0.19.0 or latest)") parser.add_argument("--stable", action="store_true", help="Mark this version as the stable default") parser.add_argument("--docs-dir", default="docs", help="Path to the docs directory") args = parser.parse_args() update_versions(Path(args.docs_dir), args.version, stable=args.stable) if __name__ == "__main__": main() ================================================ FILE: src/data/cell_extractors.rs ================================================ use calamine::{CellType, DataType}; use chrono::{NaiveDate, NaiveDateTime, TimeDelta}; use crate::types::dtype::excel_float_to_string; pub(super) fn extract_boolean(cell: &DT) -> Option { if let Some(b) = cell.get_bool() { Some(b) } else if let Some(i) = cell.get_int() { Some(i != 0) } // clippy formats else if let Some(blah) = ... { Some(x) } else { None } to the .map form else { cell.get_float().map(|f| f != 0.0) } } pub(super) fn extract_int(cell: &DT) -> Option { cell.as_i64() } pub(super) fn extract_float(cell: &DT) -> Option { cell.as_f64() } pub(super) fn extract_string(cell: &DT) -> Option { if cell.is_string() { cell.get_string().map(str::to_string) } else if cell.is_datetime() { cell.get_datetime() .and_then(|dt| dt.as_datetime()) .map(|dt| dt.to_string()) } else if cell.is_datetime_iso() { cell.get_datetime_iso().map(str::to_string) } else if cell.is_bool() { cell.get_bool().map(|v| v.to_string()) } else if cell.is_float() { cell.get_float().map(excel_float_to_string) } else { cell.as_string() } } pub(super) fn extract_date(cell: &DT) -> Option { cell.as_date() } #[cfg(feature = "python")] const EPOCH: NaiveDate = NaiveDate::from_ymd_opt(1970, 1, 1).expect("Failed to create EPOCH"); #[cfg(feature = "python")] pub(super) fn extract_date_as_num_days(cell: &DT) -> Option { extract_date(cell) .and_then(|date| 
i32::try_from(date.signed_duration_since(EPOCH).num_days()).ok()) } pub(super) fn extract_datetime(cell: &DT) -> Option { cell.as_datetime() } #[cfg(feature = "python")] pub(super) fn extract_datetime_as_timestamp_ms(cell: &DT) -> Option { extract_datetime(cell).map(|dt| dt.and_utc().timestamp_millis()) } pub(super) fn extract_duration(cell: &DT) -> Option { cell.as_duration() } #[cfg(feature = "python")] pub(super) fn extract_duration_as_ms(cell: &DT) -> Option { extract_duration(cell).map(|d| d.num_milliseconds()) } ================================================ FILE: src/data/mod.rs ================================================ mod cell_extractors; #[cfg(feature = "python")] mod python; mod rust; use chrono::{Duration, NaiveDate, NaiveDateTime}; #[cfg(feature = "python")] pub(crate) use python::*; use calamine::{CellType, Data as CalData, DataRef as CalDataRef, DataType, Range}; use crate::{ data::rust::{ create_boolean_vec, create_date_vec, create_datetime_vec, create_duration_vec, create_float_vec, create_int_vec, create_string_vec, }, error::{FastExcelErrorKind, FastExcelResult}, types::{ dtype::{DType, DTypeCoercion, get_dtype_for_column}, excelsheet::{SkipRows, column_info::ColumnInfo}, }, }; #[derive(Debug)] pub(crate) enum ExcelSheetData<'r> { Owned(Range), Ref(Range>), } impl ExcelSheetData<'_> { pub(crate) fn width(&self) -> usize { match self { ExcelSheetData::Owned(range) => range.width(), ExcelSheetData::Ref(range) => range.width(), } } pub(crate) fn height(&self) -> usize { match self { ExcelSheetData::Owned(range) => range.height(), ExcelSheetData::Ref(range) => range.height(), } } pub(super) fn get_as_string(&self, pos: (usize, usize)) -> Option { match self { ExcelSheetData::Owned(range) => range.get(pos).and_then(|data| data.as_string()), ExcelSheetData::Ref(range) => range.get(pos).and_then(|data| data.as_string()), } } pub(crate) fn dtype_for_column( &self, start_row: usize, end_row: usize, col: usize, dtype_coercion: &DTypeCoercion, 
whitespace_as_null: bool, ) -> FastExcelResult { match self { ExcelSheetData::Owned(data) => get_dtype_for_column( data, start_row, end_row, col, dtype_coercion, whitespace_as_null, ), ExcelSheetData::Ref(data) => get_dtype_for_column( data, start_row, end_row, col, dtype_coercion, whitespace_as_null, ), } } pub(crate) fn height_without_tail_whitespace(&self) -> usize { match self { ExcelSheetData::Owned(data) => { height_without_tail_whitespace(data).unwrap_or_else(|| data.height()) } ExcelSheetData::Ref(data) => { height_without_tail_whitespace(data).unwrap_or_else(|| data.height()) } } } pub(crate) fn start(&self) -> Option<(usize, usize)> { let start = match self { ExcelSheetData::Owned(range) => range.start(), ExcelSheetData::Ref(range) => range.start(), }; start.map(|(r, c)| (r as usize, c as usize)) } } impl From> for ExcelSheetData<'_> { fn from(range: Range) -> Self { Self::Owned(range) } } impl<'a> From>> for ExcelSheetData<'a> { fn from(range: Range>) -> Self { Self::Ref(range) } } trait CellIsWhiteSpace { fn is_whitespace(&self) -> bool; } impl CellIsWhiteSpace for T where T: DataType, { fn is_whitespace(&self) -> bool { if self.is_empty() { true } else if self.is_string() && let Some(s) = self.get_string() { s.trim().is_empty() } else { false } } } pub(crate) fn height_without_tail_whitespace( data: &Range, ) -> Option { let height = data.height(); let width = data.width(); if height < 1 { return Some(0); } if width < 1 { return None; } (0..width) .map(|col_idx| { let mut row_idx = height - 1; // Start at the bottom of the column and work upwards until we find a non-empty cell while row_idx > 0 && data .get((row_idx, col_idx)) .map(CellIsWhiteSpace::is_whitespace) .unwrap_or(true) { row_idx -= 1; } row_idx + 1 }) .max() } /// A container for a typed vector of values. Used to represent a column of data in an Excel sheet. /// These should only be used when you need to work on the raw data. Otherwise, you should use a /// `FastExcelColumn`. 
#[derive(Debug, Clone, PartialEq)] pub enum FastExcelSeries { Null, Bool(Vec>), String(Vec>), Int(Vec>), Float(Vec>), Datetime(Vec>), Date(Vec>), Duration(Vec>), } impl FastExcelSeries { pub fn dtype(&self) -> DType { match self { FastExcelSeries::Null => DType::Null, FastExcelSeries::Bool(_) => DType::Bool, FastExcelSeries::String(_) => DType::String, FastExcelSeries::Int(_) => DType::Int, FastExcelSeries::Float(_) => DType::Float, FastExcelSeries::Datetime(_) => DType::DateTime, FastExcelSeries::Date(_) => DType::Date, FastExcelSeries::Duration(_) => DType::Duration, } } pub fn is_null(&self) -> bool { matches!(self, FastExcelSeries::Null) } } macro_rules! impl_series_variant { ($type:ty, $variant:ident, $into_fn:ident) => { impl From>> for FastExcelSeries { fn from(vec: Vec>) -> Self { Self::$variant(vec) } } impl From<[Option<$type>; N]> for FastExcelSeries { fn from(arr: [Option<$type>; N]) -> Self { Self::$variant(arr.to_vec()) } } impl From<[$type; N]> for FastExcelSeries { fn from(arr: [$type; N]) -> Self { Self::$variant(arr.into_iter().map(Some).collect()) } } impl From<&[$type]> for FastExcelSeries { fn from(arr: &[$type]) -> Self { Self::$variant(arr.into_iter().map(|it| Some(it.to_owned())).collect()) } } impl From<&[Option<$type>]> for FastExcelSeries { fn from(arr: &[Option<$type>]) -> Self { Self::$variant(arr.into_iter().map(ToOwned::to_owned).collect()) } } // Not implementing is_empty here, because we have no len information for null Series impl FastExcelSeries { pub fn $into_fn(self) -> FastExcelResult>> { if let Self::$variant(vec) = self { Ok(vec) } else { Err(FastExcelErrorKind::InvalidParameters(format!( "{self:?} cannot be converted to {type_name}", type_name = std::any::type_name::<$type>() )) .into()) } } } }; } impl_series_variant!(bool, Bool, into_bools); impl_series_variant!(String, String, into_strings); impl_series_variant!(i64, Int, into_ints); impl_series_variant!(f64, Float, into_floats); impl_series_variant!(NaiveDateTime, 
Datetime, into_datetimes); impl_series_variant!(NaiveDate, Date, into_dates); impl_series_variant!(Duration, Duration, into_durations); // Conflicting impls when using `From>` impl From<[Option<&str>; N]> for FastExcelSeries { fn from(arr: [Option<&str>; N]) -> Self { Self::String(arr.into_iter().map(|s| s.map(|s| s.to_string())).collect()) } } impl From<[&str; N]> for FastExcelSeries { fn from(arr: [&str; N]) -> Self { Self::String(arr.into_iter().map(|s| Some(s.to_string())).collect()) } } /// A column in a sheet or table. A wrapper around a `FastExcelSeries` and a name. #[derive(Debug, Clone, PartialEq)] pub struct FastExcelColumn { pub name: String, pub(crate) data: FastExcelSeries, len: usize, } impl FastExcelColumn { pub fn try_new( name: String, data: FastExcelSeries, len: Option, ) -> FastExcelResult { let data_len = match &data { FastExcelSeries::Null => None, FastExcelSeries::Bool(v) => Some(v.len()), FastExcelSeries::String(v) => Some(v.len()), FastExcelSeries::Int(v) => Some(v.len()), FastExcelSeries::Float(v) => Some(v.len()), FastExcelSeries::Datetime(v) => Some(v.len()), FastExcelSeries::Date(v) => Some(v.len()), FastExcelSeries::Duration(v) => Some(v.len()), }; if let Some(len) = len && let Some(data_len) = data_len && data_len != len { return Err(FastExcelErrorKind::InvalidColumn(format!( "Column '{name}' has length {data_len} but expected {len}" )) .into()); } let len = len.or(data_len).ok_or_else(|| { FastExcelErrorKind::InvalidColumn( "`len` is mandatory for `FastExcelSeries::Null`".to_string(), ) })?; Ok(Self { name, data, len }) } /// Create a new null series with the given name and length. 
pub fn new_null>(name: S, len: usize) -> Self { Self { name: name.into(), data: FastExcelSeries::Null, len, } } pub(crate) fn try_from_column_info( column_info: &ColumnInfo, data: &Range, offset: usize, limit: usize, whitespace_as_null: bool, ) -> FastExcelResult { let len = limit.checked_sub(offset).ok_or_else(|| { FastExcelErrorKind::InvalidParameters(format!( "limit is smaller than offset: {limit} is smaller than {offset}" )) })?; let data = match column_info.dtype { DType::Null => FastExcelSeries::Null, DType::Int => { FastExcelSeries::Int(create_int_vec(data, column_info.index, offset, limit)) } DType::Float => { FastExcelSeries::Float(create_float_vec(data, column_info.index, offset, limit)) } DType::String => FastExcelSeries::String(create_string_vec( data, column_info.index, offset, limit, whitespace_as_null, )), DType::Bool => { FastExcelSeries::Bool(create_boolean_vec(data, column_info.index, offset, limit)) } DType::DateTime => FastExcelSeries::Datetime(create_datetime_vec( data, column_info.index, offset, limit, )), DType::Date => { FastExcelSeries::Date(create_date_vec(data, column_info.index, offset, limit)) } DType::Duration => FastExcelSeries::Duration(create_duration_vec( data, column_info.index, offset, limit, )), }; Ok(Self { name: column_info.name.clone(), data, len, }) } pub fn len(&self) -> usize { self.len } pub fn is_empty(&self) -> bool { self.len == 0 } pub fn name(&self) -> &str { &self.name } pub fn data(&self) -> &FastExcelSeries { &self.data } } impl From for FastExcelSeries { fn from(column: FastExcelColumn) -> Self { column.data } } /// Enum for lazy row selection - avoids materializing Vec for simple cases #[derive(Debug)] pub(crate) enum RowSelector { /// Simple range - no Vec allocation needed Range(std::ops::Range), /// Pre-filtered list of specific row indices Filtered(Vec), } impl RowSelector { pub(crate) fn len(&self) -> usize { match self { RowSelector::Range(range) => range.len(), RowSelector::Filtered(vec) => vec.len(), } } 
} /// Generate row selector based on [`SkipRows`] and range limits pub(crate) fn generate_row_selector( skip_rows: &SkipRows, offset: usize, limit: usize, ) -> FastExcelResult { match skip_rows { SkipRows::Simple(_skip_count) => { // For simple case, the offset has already been adjusted by pagination logic // So we just return the normal range - no Vec allocation! Ok(RowSelector::Range(offset..limit)) } SkipRows::SkipEmptyRowsAtBeginning => { // For empty rows at beginning, calamine handles this at the header level // So we just return the normal range - no Vec allocation! Ok(RowSelector::Range(offset..limit)) } SkipRows::List(skip_set) => { // Filter out rows that are in the skip set // `skip_set` contains data-relative indices, but we need to work with absolute indices let filtered: Vec = (offset..limit) .enumerate() .filter_map(|(data_row_idx, absolute_row_idx)| { (!skip_set.contains(&data_row_idx)).then_some(absolute_row_idx) }) .collect(); Ok(RowSelector::Filtered(filtered)) } #[cfg(feature = "python")] SkipRows::Callable(_func) => { // Call the Python function for each row to determine if it should be skipped // The callable should receive data-relative row indices (0, 1, 2, ...) 
pyo3::Python::attach(|py| { Ok(RowSelector::Filtered( (offset..limit) .enumerate() .filter_map(|(data_row_idx, absolute_row_idx)| { (!skip_rows.should_skip_row(data_row_idx, py).unwrap_or(false)) .then_some(absolute_row_idx) }) .collect(), )) }) } } }
================================================
FILE: src/data/python.rs
================================================
use std::sync::Arc;
use std::{fmt::Debug, ops::Not};

use arrow_array::{
    Array, ArrayRef, BooleanArray, Date32Array, DurationMillisecondArray, Float64Array,
    Int64Array, NullArray, RecordBatch, StringArray, TimestampMillisecondArray,
};
use arrow_schema::{Field, Schema};
use calamine::{CellType, DataType, Range};

use super::cell_extractors;
use crate::{
    data::{ExcelSheetData, RowSelector, generate_row_selector},
    error::{ErrorContext, FastExcelErrorKind, FastExcelResult},
    types::{
        dtype::DType,
        excelsheet::{CellError, CellErrors, SkipRows, column_info::ColumnInfo},
    },
};

mod with_error_impls {
    use super::*;

    /// Builds a `BooleanArray` from rows `offset..limit` of column `col`, recording a
    /// `CellError` for every non-empty cell that cannot be read as a boolean.
    ///
    /// A cell that is absent from `data`, empty, or not convertible by
    /// `cell_extractors::extract_boolean` becomes a null entry in the array. Only the
    /// non-empty, non-convertible cells are reported in the returned error list.
    ///
    /// Returns the array together with the errors, in row order.
    pub(crate) fn create_boolean_array_with_errors<DT: CellType + DataType + Debug>(
        data: &Range<DT>,
        col: usize,
        offset: usize,
        limit: usize,
    ) -> (Arc<dyn Array>, Vec<CellError>) {
        let mut cell_errors = vec![];

        let arr = Arc::new(BooleanArray::from_iter((offset..limit).map(|row| {
            data.get((row, col)).and_then(|cell| {
                if cell.is_empty() {
                    None
                } else if let Some(b) = cell_extractors::extract_boolean(cell) {
                    Some(b)
                } else {
                    cell_errors.push(CellError {
                        position: (row, col),
                        row_offset: offset,
                        // The cell's debug representation is wrapped in a matching pair
                        // of single quotes, like the other `*_with_errors` builders.
                        detail: format!("Expected boolean but got '{cell:?}'"),
                    });
                    None
                }
            })
        })));

        (arr, cell_errors)
    }

    pub(crate) fn create_int_array_with_errors( data: &Range, col: usize, offset: usize, limit: usize, ) -> (Arc, Vec) { let mut cell_errors = vec![]; let arr = Arc::new(Int64Array::from_iter((offset..limit).map(|row| { data.get((row, col)).and_then(|cell| { if cell.is_empty() { None } else { match cell_extractors::extract_int(cell) { Some(value) => Some(value), None => { cell_errors.push(CellError { position: (row, col), row_offset: offset, detail: format!("Expected int but 
got '{cell:?}'"), }); None } } } }) }))); (arr, cell_errors) } pub(crate) fn create_float_array_with_errors( data: &Range, col: usize, offset: usize, limit: usize, ) -> (Arc, Vec) { let mut cell_errors = vec![]; let arr = Arc::new(Float64Array::from_iter((offset..limit).map(|row| { data.get((row, col)).and_then(|cell| { if cell.is_empty() { None } else { match cell_extractors::extract_float(cell) { Some(value) => Some(value), None => { cell_errors.push(CellError { position: (row, col), row_offset: offset, detail: format!("Expected float but got '{cell:?}'"), }); None } } } }) }))); (arr, cell_errors) } pub(crate) fn create_string_array_with_errors( data: &Range, col: usize, offset: usize, limit: usize, whitespace_as_null: bool, ) -> (Arc, Vec) { let mut cell_errors = vec![]; let arr = Arc::new(StringArray::from_iter((offset..limit).map(|row| { data.get((row, col)).and_then(|cell| { if cell.is_empty() { None } else { match cell_extractors::extract_string(cell) { Some(value) => { if whitespace_as_null && value.trim().is_empty() { None } else { Some(value) } } None => { cell_errors.push(CellError { position: (row, col), row_offset: offset, detail: format!("Expected string but got '{cell:?}'"), }); None } } } }) }))); (arr, cell_errors) } pub(crate) fn create_date_array_with_errors( data: &Range, col: usize, offset: usize, limit: usize, ) -> (Arc, Vec) { let mut cell_errors = vec![]; let arr = Arc::new(Date32Array::from_iter((offset..limit).map(|row| { data.get((row, col)).and_then(|cell| { if cell.is_empty() { None } else { match cell_extractors::extract_date_as_num_days(cell) { Some(value) => Some(value), None => { cell_errors.push(CellError { position: (row, col), row_offset: offset, detail: format!("Expected date but got '{:?}'", cell), }); None } } } }) }))); (arr, cell_errors) } pub(crate) fn create_datetime_array_with_errors( data: &Range, col: usize, offset: usize, limit: usize, ) -> (Arc, Vec) { let mut cell_errors = vec![]; let arr = 
Arc::new(TimestampMillisecondArray::from_iter((offset..limit).map( |row| { data.get((row, col)).and_then(|cell| { if cell.is_empty() { None } else { match cell_extractors::extract_datetime_as_timestamp_ms(cell) { Some(value) => Some(value), None => { cell_errors.push(CellError { position: (row, col), row_offset: offset, detail: format!("Expected datetime but got '{:?}'", cell), }); None } } } }) }, ))); (arr, cell_errors) } pub(crate) fn create_duration_array_with_errors( data: &Range, col: usize, offset: usize, limit: usize, ) -> (Arc, Vec) { let mut cell_errors = vec![]; let arr = Arc::new(DurationMillisecondArray::from_iter((offset..limit).map( |row| { data.get((row, col)).and_then(|cell| { if cell.is_empty() { None } else { match cell_extractors::extract_duration_as_ms(cell) { Some(value) => Some(value), None => { cell_errors.push(CellError { position: (row, col), row_offset: offset, detail: format!("Expected duration but got '{cell:?}'"), }); None } } } }) }, ))); (arr, cell_errors) } } pub(crate) fn create_boolean_array( data: &Range, col: usize, row_iter: impl Iterator, ) -> Arc { Arc::new(BooleanArray::from_iter(row_iter.map(|row| { data.get((row, col)) .and_then(cell_extractors::extract_boolean) }))) } pub(crate) fn create_int_array( data: &Range, col: usize, row_iter: impl Iterator, ) -> Arc { Arc::new(Int64Array::from_iter(row_iter.map(|row| { data.get((row, col)).and_then(cell_extractors::extract_int) }))) } pub(crate) fn create_float_array( data: &Range, col: usize, row_iter: impl Iterator, ) -> Arc { Arc::new(Float64Array::from_iter(row_iter.map(|row| { data.get((row, col)) .and_then(cell_extractors::extract_float) }))) } pub(crate) fn create_string_array( data: &Range, col: usize, row_iter: impl Iterator, whitespace_as_null: bool, ) -> Arc { Arc::new(if whitespace_as_null { StringArray::from_iter(row_iter.map(|row| { data.get((row, col)) .and_then(cell_extractors::extract_string) // Only return the string if it contains non-whitespace characters 
.filter(|s| s.trim().is_empty().not()) })) } else { StringArray::from_iter(row_iter.map(|row| { data.get((row, col)) .and_then(cell_extractors::extract_string) })) }) } pub(crate) fn create_date_array( data: &Range, col: usize, row_iter: impl Iterator, ) -> Arc { Arc::new(Date32Array::from_iter(row_iter.map(|row| { data.get((row, col)) .and_then(cell_extractors::extract_date_as_num_days) }))) } pub(crate) fn create_datetime_array( data: &Range, col: usize, row_iter: impl Iterator, ) -> Arc { Arc::new(TimestampMillisecondArray::from_iter(row_iter.map(|row| { data.get((row, col)) .and_then(cell_extractors::extract_datetime_as_timestamp_ms) }))) } pub(crate) fn create_duration_array( data: &Range, col: usize, row_iter: impl Iterator, ) -> Arc { Arc::new(DurationMillisecondArray::from_iter(row_iter.map(|row| { data.get((row, col)) .and_then(cell_extractors::extract_duration_as_ms) }))) } macro_rules! create_array_function_with_errors { ($func_name:ident) => { pub(crate) fn $func_name( data: &ExcelSheetData, col: usize, offset: usize, limit: usize, ) -> (Arc, Vec) { match data { ExcelSheetData::Owned(range) => { with_error_impls::$func_name(range, col, offset, limit) } ExcelSheetData::Ref(range) => { with_error_impls::$func_name(range, col, offset, limit) } } } }; } create_array_function_with_errors!(create_boolean_array_with_errors); create_array_function_with_errors!(create_int_array_with_errors); create_array_function_with_errors!(create_float_array_with_errors); create_array_function_with_errors!(create_date_array_with_errors); create_array_function_with_errors!(create_datetime_array_with_errors); create_array_function_with_errors!(create_duration_array_with_errors); pub(crate) fn create_string_array_with_errors( data: &ExcelSheetData, col: usize, offset: usize, limit: usize, whitespace_as_null: bool, ) -> (Arc, Vec) { match data { ExcelSheetData::Owned(range) => with_error_impls::create_string_array_with_errors( range, col, offset, limit, whitespace_as_null, ), 
ExcelSheetData::Ref(range) => with_error_impls::create_string_array_with_errors( range, col, offset, limit, whitespace_as_null, ), } } /// Converts a list of ColumnInfo to an arrow Schema pub(crate) fn selected_columns_to_schema(columns: &[ColumnInfo]) -> Schema { let fields: Vec<_> = columns.iter().map(Into::::into).collect(); Schema::new(fields) } /// Creates an arrow RecordBatch from an Iterator over (column_name, column data tuples) and an arrow schema pub(crate) fn record_batch_from_name_array_iterator< 'a, I: Iterator)>, >( iter: I, schema: Schema, ) -> FastExcelResult { let mut iter = iter.peekable(); // If the iterable is empty, try_from_iter returns an Err if iter.peek().is_none() { Ok(RecordBatch::new_empty(Arc::new(schema))) } else { // We use `try_from_iter_with_nullable` because `try_from_iter` relies on `array.null_count() > 0;` // to determine if the array is nullable. This is not the case for `NullArray` which has no nulls. RecordBatch::try_from_iter_with_nullable(iter.map(|(field_name, array)| { let nullable = array.is_nullable(); (field_name, array, nullable) })) .map_err(|err| FastExcelErrorKind::ArrowError(err.to_string()).into()) .with_context(|| "could not create RecordBatch from iterable") } } /// Creates an arrow `RecordBatch` from `ExcelSheetData`. Expects the following parameters: /// * `columns`: a slice of `ColumnInfo`, representing the columns that should be extracted from the range /// * `data`: the sheets data, as an `ExcelSheetData` /// * `offset`: the row index at which to start /// * `limit`: the row index at which to stop (excluded) pub(crate) fn record_batch_from_data_and_columns( columns: &[ColumnInfo], data: &Range, offset: usize, limit: usize, whitespace_as_null: bool, ) -> FastExcelResult { // Use RowSelector::Range for simple offset..limit case - no Vec allocation! 
let row_selector = RowSelector::Range(offset..limit); record_batch_from_data_and_columns_with_row_selector( columns, data, &row_selector, whitespace_as_null, ) } pub(crate) fn record_batch_from_data_and_columns_with_skip_rows( columns: &[ColumnInfo], data: &Range, skip_rows: &SkipRows, offset: usize, limit: usize, whitespace_as_null: bool, ) -> FastExcelResult { // Generate row selector - ranges for simple cases, filtered Vec only when needed let row_selector = generate_row_selector(skip_rows, offset, limit)?; record_batch_from_data_and_columns_with_row_selector( columns, data, &row_selector, whitespace_as_null, ) } fn record_batch_from_data_and_columns_with_row_selector( columns: &[ColumnInfo], data: &Range, row_selector: &RowSelector, whitespace_as_null: bool, ) -> FastExcelResult { let schema = selected_columns_to_schema(columns); let row_count = row_selector.len(); let iter = columns.iter().map(|column_info| { let col_idx = column_info.index; let dtype = column_info.dtype; ( column_info.name.as_str(), match dtype { DType::Null => Arc::new(NullArray::new(row_count)), DType::Int => create_int_array(data, col_idx, row_selector.iter()), DType::Float => create_float_array(data, col_idx, row_selector.iter()), DType::String => { create_string_array(data, col_idx, row_selector.iter(), whitespace_as_null) } DType::Bool => create_boolean_array(data, col_idx, row_selector.iter()), DType::DateTime => create_datetime_array(data, col_idx, row_selector.iter()), DType::Date => create_date_array(data, col_idx, row_selector.iter()), DType::Duration => create_duration_array(data, col_idx, row_selector.iter()), }, ) }); record_batch_from_name_array_iterator(iter, schema) } pub(crate) fn record_batch_from_data_and_columns_with_errors( columns: &[ColumnInfo], data: &ExcelSheetData, offset: usize, limit: usize, whitespace_as_null: bool, ) -> FastExcelResult<(RecordBatch, CellErrors)> { let schema = selected_columns_to_schema(columns); let mut cell_errors = vec![]; let iter = 
columns.iter().map(|column_info| { let col_idx = column_info.index; let dtype = column_info.dtype; let (array, new_cell_errors) = match dtype { DType::Null => (Arc::new(NullArray::new(limit - offset)) as ArrayRef, vec![]), DType::Int => create_int_array_with_errors(data, col_idx, offset, limit), DType::Float => create_float_array_with_errors(data, col_idx, offset, limit), DType::String => { create_string_array_with_errors(data, col_idx, offset, limit, whitespace_as_null) } DType::Bool => create_boolean_array_with_errors(data, col_idx, offset, limit), DType::DateTime => create_datetime_array_with_errors(data, col_idx, offset, limit), DType::Date => create_date_array_with_errors(data, col_idx, offset, limit), DType::Duration => create_duration_array_with_errors(data, col_idx, offset, limit), }; cell_errors.extend(new_cell_errors); (column_info.name.as_str(), array) }); let record_batch = record_batch_from_name_array_iterator(iter, schema)?; Ok(( record_batch, CellErrors { errors: cell_errors, }, )) } impl RowSelector { pub(crate) fn iter(&self) -> Box + '_> { match self { RowSelector::Range(range) => Box::new(range.clone()), RowSelector::Filtered(vec) => Box::new(vec.iter().copied()), } } } ================================================ FILE: src/data/rust.rs ================================================ use std::ops::Not; use calamine::{CellType, DataType, Range}; use chrono::{NaiveDate, NaiveDateTime, TimeDelta}; use super::cell_extractors; pub(crate) fn create_boolean_vec( data: &Range, col: usize, offset: usize, limit: usize, ) -> Vec> { (offset..limit) .map(|row| { data.get((row, col)) .and_then(cell_extractors::extract_boolean) }) .collect() } pub(crate) fn create_int_vec( data: &Range, col: usize, offset: usize, limit: usize, ) -> Vec> { (offset..limit) .map(|row| data.get((row, col)).and_then(cell_extractors::extract_int)) .collect() } pub(crate) fn create_float_vec( data: &Range, col: usize, offset: usize, limit: usize, ) -> Vec> { (offset..limit) 
.map(|row| { data.get((row, col)) .and_then(cell_extractors::extract_float) }) .collect() } pub(crate) fn create_string_vec( data: &Range, col: usize, offset: usize, limit: usize, whitespace_as_null: bool, ) -> Vec> { if whitespace_as_null { (offset..limit) .map(|row| { data.get((row, col)) .and_then(cell_extractors::extract_string) // Only return the string if it contains non-whitespace characters .filter(|s| s.trim().is_empty().not()) }) .collect() } else { (offset..limit) .map(|row| { data.get((row, col)) .and_then(cell_extractors::extract_string) }) .collect() } } pub(crate) fn create_date_vec( data: &Range, col: usize, offset: usize, limit: usize, ) -> Vec> { (offset..limit) .map(|row| data.get((row, col)).and_then(cell_extractors::extract_date)) .collect() } pub(crate) fn create_datetime_vec( data: &Range, col: usize, offset: usize, limit: usize, ) -> Vec> { (offset..limit) .map(|row| { data.get((row, col)) .and_then(cell_extractors::extract_datetime) }) .collect() } pub(crate) fn create_duration_vec( data: &Range, col: usize, offset: usize, limit: usize, ) -> Vec> { (offset..limit) .map(|row| { data.get((row, col)) .and_then(cell_extractors::extract_duration) }) .collect() } ================================================ FILE: src/error.rs ================================================ use crate::types::idx_or_name::IdxOrName; use calamine::XlsxError; use std::{error::Error, fmt::Display}; /// The kind of a fastexcel error. 
#[derive(Debug)] pub enum FastExcelErrorKind { UnsupportedColumnTypeCombination(String), CannotRetrieveCellData(usize, usize), CalamineCellError(calamine::CellErrorType), CalamineError(calamine::Error), SheetNotFound(IdxOrName), ColumnNotFound(IdxOrName), // Arrow errors can be of several different types (arrow::error::Error, PyError), and having // the actual type has not much value for us, so we just store a string context ArrowError(String), InvalidParameters(String), InvalidColumn(String), Internal(String), } impl Display for FastExcelErrorKind { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { FastExcelErrorKind::UnsupportedColumnTypeCombination(detail) => { write!(f, "unsupported column type combination: {detail}") } FastExcelErrorKind::CannotRetrieveCellData(row, col) => { write!(f, "cannot retrieve cell data at ({row}, {col})") } FastExcelErrorKind::CalamineCellError(calamine_error) => { write!(f, "calamine cell error: {calamine_error}") } FastExcelErrorKind::CalamineError(calamine_error) => { write!(f, "calamine error: {calamine_error}") } FastExcelErrorKind::SheetNotFound(idx_or_name) => { let message = idx_or_name.format_message(); write!(f, "sheet {message} not found") } FastExcelErrorKind::ColumnNotFound(idx_or_name) => { let message = idx_or_name.format_message(); write!(f, "column {message} not found") } FastExcelErrorKind::ArrowError(err) => write!(f, "arrow error: {err}"), FastExcelErrorKind::InvalidParameters(err) => write!(f, "invalid parameters: {err}"), FastExcelErrorKind::InvalidColumn(err) => write!(f, "invalid column: {err}"), FastExcelErrorKind::Internal(err) => write!(f, "fastexcel error: {err}"), } } } /// A `fastexcel` error. /// /// Contains a kind and a context. Use the `Display` trait to format the /// error message with its context. 
#[derive(Debug)] pub struct FastExcelError { pub kind: FastExcelErrorKind, pub context: Vec, } pub(crate) trait ErrorContext { fn with_context(self, ctx_fn: F) -> Self where F: FnOnce() -> S; } impl FastExcelError { pub(crate) fn new(kind: FastExcelErrorKind) -> Self { Self { kind, context: vec![], } } } impl Display for FastExcelError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{kind}", kind = self.kind)?; if !self.context.is_empty() { writeln!(f, "\nContext:")?; self.context .iter() .enumerate() .try_for_each(|(idx, ctx_value)| writeln!(f, " {idx}: {ctx_value}"))?; } Ok(()) } } impl Error for FastExcelError {} impl ErrorContext for FastExcelError { fn with_context(mut self, ctx_fn: F) -> Self where F: FnOnce() -> S, { self.context.push(ctx_fn().to_string()); self } } impl From for FastExcelError { fn from(kind: FastExcelErrorKind) -> Self { FastExcelError::new(kind) } } impl From for FastExcelError { fn from(err: XlsxError) -> Self { FastExcelErrorKind::CalamineError(calamine::Error::Xlsx(err)).into() } } pub type FastExcelResult = Result; impl ErrorContext for FastExcelResult { fn with_context(self, ctx_fn: F) -> Self where F: FnOnce() -> S, { match self { Ok(_) => self, Err(e) => Err(e.with_context(ctx_fn)), } } } /// Contains Python versions of our custom errors #[cfg(feature = "python")] pub(crate) mod py_errors { use super::FastExcelErrorKind; use crate::error; use pyo3::{PyErr, PyResult, create_exception, exceptions::PyException}; // Base fastexcel error create_exception!( _fastexcel, FastExcelError, PyException, "The base class for all fastexcel errors" ); // Unsupported column type create_exception!( _fastexcel, UnsupportedColumnTypeCombinationError, FastExcelError, "Column contains an unsupported type combination" ); // Cannot retrieve cell data create_exception!( _fastexcel, CannotRetrieveCellDataError, FastExcelError, "Data for a given cell cannot be retrieved" ); // Calamine cell error create_exception!( 
_fastexcel,
        CalamineCellError,
        FastExcelError,
        "calamine returned an error regarding the content of the cell"
    );
    // Calamine error
    create_exception!(
        _fastexcel,
        CalamineError,
        FastExcelError,
        "Generic calamine error"
    );
    // Sheet not found
    create_exception!(
        _fastexcel,
        SheetNotFoundError,
        FastExcelError,
        "Sheet was not found"
    );
    // Column not found
    create_exception!(
        _fastexcel,
        ColumnNotFoundError,
        FastExcelError,
        "Column was not found"
    );
    // Arrow error
    create_exception!(
        _fastexcel,
        ArrowError,
        FastExcelError,
        "Generic arrow error"
    );
    // Invalid parameters
    create_exception!(
        _fastexcel,
        InvalidParametersError,
        FastExcelError,
        "Provided parameters are invalid"
    );
    // Invalid column
    create_exception!(
        _fastexcel,
        InvalidColumnError,
        FastExcelError,
        "Column is invalid"
    );
    // Internal error
    create_exception!(
        _fastexcel,
        InternalError,
        FastExcelError,
        "Internal fastexcel error"
    );

    /// Converts a Rust-side `FastExcelError` into the matching Python exception.
    ///
    /// The exception message is the full `Display` rendering of the error (kind plus context),
    /// and the exception class is selected from the error kind.
    impl From for PyErr {
        fn from(err: error::FastExcelError) -> Self {
            // Render the message before matching on `err.kind`, so that the context is included.
            let message = err.to_string();
            match err.kind {
                FastExcelErrorKind::UnsupportedColumnTypeCombination(_) => {
                    UnsupportedColumnTypeCombinationError::new_err(message)
                }
                FastExcelErrorKind::CannotRetrieveCellData(_, _) => {
                    CannotRetrieveCellDataError::new_err(message)
                }
                FastExcelErrorKind::CalamineCellError(_) => CalamineCellError::new_err(message),
                FastExcelErrorKind::CalamineError(_) => CalamineError::new_err(message),
                FastExcelErrorKind::SheetNotFound(_) => SheetNotFoundError::new_err(message),
                FastExcelErrorKind::ColumnNotFound(_) => ColumnNotFoundError::new_err(message),
                FastExcelErrorKind::ArrowError(_) => ArrowError::new_err(message),
                FastExcelErrorKind::InvalidParameters(_) => {
                    InvalidParametersError::new_err(message)
                }
                FastExcelErrorKind::InvalidColumn(_) => InvalidColumnError::new_err(message),
                // Internal errors get their own exception class rather than being reported as
                // arrow errors. `InternalError` still derives from the Python `FastExcelError`
                // base class, so handlers catching the base class are unaffected.
                FastExcelErrorKind::Internal(_) => InternalError::new_err(message),
            }
        }
    }

    /// Conversion of a fastexcel result into a `PyResult`, mapping the error through
    /// `Into::into`.
    pub(crate) trait IntoPyResult {
        type Inner;

        fn into_pyresult(self) -> PyResult;
    }

    impl IntoPyResult for super::FastExcelResult {
        type Inner = T;

        fn
into_pyresult(self) -> PyResult { self.map_err(Into::into) } } } ================================================ FILE: src/lib.rs ================================================ mod data; mod error; mod types; mod utils; use std::fmt::Display; #[cfg(feature = "python")] use error::py_errors; #[cfg(feature = "python")] use pyo3::prelude::*; #[cfg(feature = "python")] use types::excelsheet::{CellError, CellErrors}; pub use data::{FastExcelColumn, FastExcelSeries}; use error::ErrorContext; pub use error::{FastExcelError, FastExcelErrorKind, FastExcelResult}; pub use types::{ ColumnInfo, ColumnNameFrom, DType, DTypeCoercion, DTypeFrom, DTypes, DefinedName, ExcelReader, ExcelSheet, ExcelTable, IdxOrName, LoadSheetOrTableOptions, SelectedColumns, SheetVisible, SkipRows, }; /// Reads an excel file and returns an object allowing to access its sheets, tables, and a bit of metadata. /// This is a wrapper around `ExcelReader::try_from_path`. pub fn read_excel + Display>(path: S) -> FastExcelResult { ExcelReader::try_from_path(path.as_ref()) .with_context(|| format!("could not load excel file at {path}")) } #[cfg(feature = "python")] /// Reads an excel file and returns an object allowing to access its sheets, tables, and a bit of metadata #[pyfunction(name = "read_excel")] fn py_read_excel<'py>(source: &Bound<'_, PyAny>, py: Python<'py>) -> PyResult { use py_errors::IntoPyResult; if let Ok(path) = source.extract::() { py.detach(|| ExcelReader::try_from_path(&path)) .with_context(|| format!("could not load excel file at {path}")) .into_pyresult() } else if let Ok(bytes) = source.extract::<&[u8]>() { py.detach(|| ExcelReader::try_from(bytes)) .with_context(|| "could not load excel file for those bytes") .into_pyresult() } else { Err(py_errors::InvalidParametersError::new_err( "source must be a string or bytes", )) } } // Taken from pydantic-core: // https://github.com/pydantic/pydantic-core/blob/main/src/lib.rs#L24 #[cfg(feature = "python")] fn get_python_version() -> String { 
let version = env!("CARGO_PKG_VERSION").to_string(); // cargo uses "1.0-alpha1" etc. while python uses "1.0.0a1", this is not full compatibility, // but it's good enough for now // see https://docs.rs/semver/1.0.9/semver/struct.Version.html#method.parse for rust spec // see https://peps.python.org/pep-0440/ for python spec // it seems the dot after "alpha/beta" e.g. "-alpha.1" is not necessary, hence why this works version.replace("-alpha", "a").replace("-beta", "b") } #[cfg(feature = "python")] #[pymodule(gil_used = false)] fn _fastexcel(m: &Bound<'_, PyModule>) -> PyResult<()> { use crate::types::excelsheet::column_info::{ColumnInfo, ColumnInfoNoDtype}; pyo3_log::init(); let py = m.py(); m.add_function(wrap_pyfunction!(py_read_excel, m)?)?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add("__version__", get_python_version())?; // errors [ ("FastExcelError", py.get_type::()), ( "UnsupportedColumnTypeCombinationError", py.get_type::(), ), ( "CannotRetrieveCellDataError", py.get_type::(), ), ( "CalamineCellError", py.get_type::(), ), ("CalamineError", py.get_type::()), ( "SheetNotFoundError", py.get_type::(), ), ( "ColumnNotFoundError", py.get_type::(), ), ("ArrowError", py.get_type::()), ( "InvalidParametersError", py.get_type::(), ), ] .into_iter() .try_for_each(|(exc_name, exc_type)| m.add(exc_name, exc_type)) } ================================================ FILE: src/types/dtype/mod.rs ================================================ #[cfg(feature = "python")] mod python; use std::{ collections::{HashMap, HashSet}, fmt::{Debug, Display}, str::FromStr, sync::OnceLock, }; use calamine::{CellErrorType, CellType, DataType, Range}; use log::warn; #[cfg(feature = "python")] use pyo3::{IntoPyObject, IntoPyObjectRef}; use crate::error::{FastExcelError, FastExcelErrorKind, FastExcelResult}; use super::idx_or_name::IdxOrName; /// A column or a cell's data type. 
#[derive(Debug, Clone, PartialEq, Eq, Hash, Copy)] pub enum DType { Null, Int, Float, String, Bool, DateTime, Date, Duration, } impl FromStr for DType { type Err = FastExcelError; fn from_str(raw_dtype: &str) -> FastExcelResult { match raw_dtype { "null" => Ok(Self::Null), "int" => Ok(Self::Int), "float" => Ok(Self::Float), "string" => Ok(Self::String), "boolean" => Ok(Self::Bool), "datetime" => Ok(Self::DateTime), "date" => Ok(Self::Date), "duration" => Ok(Self::Duration), _ => Err(FastExcelErrorKind::InvalidParameters(format!( "unsupported dtype: \"{raw_dtype}\"" )) .into()), } } } impl Display for DType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_str(match self { DType::Null => "null", DType::Int => "int", DType::Float => "float", DType::String => "string", DType::Bool => "boolean", DType::DateTime => "datetime", DType::Date => "date", DType::Duration => "duration", }) } } pub type DTypeMap = HashMap; /// Provided data types. #[derive(Debug, Clone)] #[cfg_attr(feature = "python", derive(IntoPyObject, IntoPyObjectRef))] pub enum DTypes { /// Coerce all data types to the given type. All(DType), /// Coerce data types based on the provided map. Map(DTypeMap), } impl FromStr for DTypes { type Err = FastExcelError; fn from_str(dtypes: &str) -> FastExcelResult { Ok(DTypes::All(DType::from_str(dtypes)?)) } } /// Whether data types should be coerced or not. #[derive(Debug, Clone, PartialEq, Eq, Hash, Copy, Default)] pub enum DTypeCoercion { /// Coerce data types (default). #[default] Coerce, /// Strictly enforce data types. 
Strict,
}

impl FromStr for DTypeCoercion {
    type Err = FastExcelError;

    /// Parses `"coerce"` or `"strict"` (exact, case-sensitive match).
    ///
    /// Any other value yields an `InvalidParameters` error quoting the rejected input.
    fn from_str(raw_dtype_coercion: &str) -> FastExcelResult {
        match raw_dtype_coercion {
            "coerce" => Ok(Self::Coerce),
            "strict" => Ok(Self::Strict),
            _ => Err(FastExcelErrorKind::InvalidParameters(format!(
                "unsupported dtype_coercion: \"{raw_dtype_coercion}\""
            ))
            .into()),
        }
    }
}

/// All the possible string values that should be considered as NULL
///
/// The entries are kept in byte-wise sorted order and must all be distinct. The list appears to
/// mirror the default NA markers of pandas' readers, which is why `<NA>` is part of it.
const NULL_STRING_VALUES: [&str; 19] = [
    "",
    "#N/A",
    "#N/A N/A",
    "#NA",
    "-1.#IND",
    "-1.#QNAN",
    "-NaN",
    "-nan",
    "1.#IND",
    "1.#QNAN",
    "<NA>",
    "N/A",
    "NA",
    "NULL",
    "NaN",
    "None",
    "n/a",
    "nan",
    "null",
];

fn get_cell_dtype(
    data: &Range
, row: usize, col: usize, whitespace_as_null: bool, ) -> FastExcelResult { let cell = data .get((row, col)) .ok_or(FastExcelErrorKind::CannotRetrieveCellData(row, col))?; if cell.is_int() { Ok(DType::Int) } else if cell.is_float() { Ok(DType::Float) } else if cell.is_string() { if NULL_STRING_VALUES.contains(&cell.get_string().unwrap()) // If we want to consider whitespace as null and either the cell is empty or contains only // whitespace, we return null || (whitespace_as_null && cell .get_string() .is_none_or(|s| s.trim().is_empty())) { Ok(DType::Null) } else { Ok(DType::String) } } else if cell.is_bool() { Ok(DType::Bool) } else if cell.is_datetime() { // Since calamine 0.24.0, a new ExcelDateTime exists for the Datetime type. It can either be // a duration or a datatime let excel_datetime = cell .get_datetime() .expect("calamine indicated that cell is a datetime but get_datetime returned None"); Ok(if excel_datetime.is_datetime() { DType::DateTime } else { DType::Duration }) } // These types contain an ISO8601 representation of a date/datetime or a durat else if cell.is_datetime_iso() { match cell.as_datetime() { // If we cannot convert the cell to a datetime, we're working on a date Some(_) => Ok(DType::DateTime), // NOTE: not using the Date64 type on purpose, as pyarrow converts it to a datetime // rather than a date None => Ok(DType::Date), } } // Simple durations else if cell.is_duration_iso() { Ok(DType::Duration) } // Empty cell else if cell.is_empty() { Ok(DType::Null) } else if cell.is_error() { match cell.get_error() { // considering cells with #N/A! or #REF! 
as null Some( CellErrorType::NA | CellErrorType::Value | CellErrorType::Null | CellErrorType::Ref | CellErrorType::Num | CellErrorType::Div0, ) => Ok(DType::Null), Some(err) => Err(FastExcelErrorKind::CalamineCellError(err.to_owned()).into()), None => Err(FastExcelErrorKind::Internal(format!( "cell is an error but get_error returned None: {cell:?}" )) .into()), } } else { Err(FastExcelErrorKind::Internal(format!("unsupported cell type: {cell:?}")).into()) } } static FLOAT_TYPES_CELL: OnceLock> = OnceLock::new(); static INT_TYPES_CELL: OnceLock> = OnceLock::new(); static STRING_TYPES_CELL: OnceLock> = OnceLock::new(); fn float_types() -> &'static HashSet { FLOAT_TYPES_CELL.get_or_init(|| HashSet::from([DType::Int, DType::Float, DType::Bool])) } fn int_types() -> &'static HashSet { INT_TYPES_CELL.get_or_init(|| HashSet::from([DType::Int, DType::Bool])) } fn string_types() -> &'static HashSet { STRING_TYPES_CELL.get_or_init(|| { HashSet::from([ DType::Bool, DType::Int, DType::Float, DType::String, DType::DateTime, DType::Date, ]) }) } pub(crate) fn get_dtype_for_column( data: &Range
, start_row: usize, end_row: usize, col: usize, dtype_coercion: &DTypeCoercion, whitespace_as_null: bool, ) -> FastExcelResult { let mut column_types = (start_row..end_row) .map(|row| get_cell_dtype(data, row, col, whitespace_as_null)) .collect::>>()?; // All columns are nullable anyway so we're not taking Null into account here column_types.remove(&DType::Null); if column_types.is_empty() { // If no type apart from NULL was found, fallback to string except if the column is empty if start_row == end_row { Ok(DType::Null) } else { warn!("Could not determine dtype for column {col}, falling back to string"); Ok(DType::String) } } else if matches!(dtype_coercion, &DTypeCoercion::Strict) && column_types.len() != 1 { // If dtype coercion is strict and we do not have a single dtype, it's an error Err( FastExcelErrorKind::UnsupportedColumnTypeCombination(format!( "type coercion is strict and column contains {column_types:?}" )) .into(), ) } else if column_types.len() == 1 { // If a single non-null type was found, return it Ok(column_types.into_iter().next().unwrap()) } else if column_types.is_subset(int_types()) { // If every cell in the column can be converted to an int, return int64 Ok(DType::Int) } else if column_types.is_subset(float_types()) { // If every cell in the column can be converted to a float, return Float64 Ok(DType::Float) } else if column_types.is_subset(string_types()) { // If every cell in the column can be converted to a string, return Utf8 Ok(DType::String) } else { // NOTE: Not being too smart about multi-types columns for now Err( FastExcelErrorKind::UnsupportedColumnTypeCombination(format!("{column_types:?}")) .into(), ) } } /// Convert a float to a nice string to mimic Excel behaviour. /// /// Excel can store a float like 29.02 set by the user as "29.020000000000003" in the XML. /// But in fact, the user will see "29.02" in the cell. /// Excel indeed displays decimal numbers with 8 digits in a standard cell width /// and 10 digits in a wide cell. 
Like this: /// /// Format = 0.000000000 | Unformatted, wide cell | Unformatted, standard width /// ---------------------|--------------------------|---------------------------- /// 1.123456789 | 1.123456789 | 1.123457 /// 12.123456789 | 12.12345679 | 12.12346 /// ... | ... | ... /// 123456.123456789 | 123456.1235 | 123456.1 /// /// Excel also trims trailing zeros and the decimal point if there is no fractional part. /// /// We do not distinguish between wide cells and standard cells here, so we retain at most /// nine digits after the decimal point and trim any trailing zeros. pub(crate) fn excel_float_to_string(x: f64) -> String { format!("{x:.9}") .trim_end_matches('0') .trim_end_matches('.') .to_string() } #[cfg(feature = "__pyo3-tests")] #[cfg(test)] mod tests { use calamine::{Cell, Data as CalData}; use pretty_assertions::assert_eq; use rstest::{fixture, rstest}; use super::*; #[fixture] fn range() -> Range { Range::from_sparse(vec![ // First column Cell::new((0, 0), CalData::Bool(true)), Cell::new((1, 0), CalData::Bool(false)), Cell::new((2, 0), CalData::String("NULL".to_string())), Cell::new((3, 0), CalData::Int(42)), Cell::new((4, 0), CalData::Float(13.37)), Cell::new((5, 0), CalData::String("hello".to_string())), Cell::new((6, 0), CalData::Empty), Cell::new((7, 0), CalData::String("#N/A".to_string())), Cell::new((8, 0), CalData::Int(12)), Cell::new((9, 0), CalData::Float(12.21)), Cell::new((10, 0), CalData::Bool(true)), Cell::new((11, 0), CalData::Int(1337)), ]) } #[rstest] // pure bool #[case(0, 2, DType::Bool)] // pure int #[case(3, 4, DType::Int)] // pure float #[case(4, 5, DType::Float)] // pure string #[case(5, 6, DType::String)] // pure int + float #[case(3, 5, DType::Float)] // null + int + float #[case(2, 5, DType::Float)] // float + string #[case(4, 6, DType::String)] // int + float + string #[case(3, 6, DType::String)] // null + int + float + string + empty + null #[case(2, 8, DType::String)] // empty + null + int #[case(6, 9, DType::Int)] // int 
+ float + null #[case(7, 10, DType::Float)] // int + float + bool + null #[case(7, 11, DType::Float)] // int + bool #[case(10, 12, DType::Int)] fn get_arrow_column_type_multi_dtype_ok_coerce( range: Range, #[case] start_row: usize, #[case] end_row: usize, #[case] expected: DType, ) { assert_eq!( get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Coerce, false) .unwrap(), expected ); } #[rstest] // pure bool #[case(0, 2, DType::Bool)] // pure int #[case(3, 4, DType::Int)] // pure float #[case(4, 5, DType::Float)] // pure string #[case(5, 6, DType::String)] // empty + null + int #[case(6, 9, DType::Int)] fn get_arrow_column_type_multi_dtype_ok_strict( range: Range, #[case] start_row: usize, #[case] end_row: usize, #[case] expected: DType, ) { assert_eq!( get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Strict, false) .unwrap(), expected ); } #[rstest] // pure int + float #[case(3, 5)] // float + string #[case(4, 6)] // int + float + string #[case(3, 6)] // null + int + float + string + empty + null #[case(2, 8)] // int + float + null #[case(7, 10)] // int + float + bool + null #[case(7, 11)] // int + bool #[case(10, 12)] fn get_arrow_column_type_multi_dtype_ko_strict( range: Range, #[case] start_row: usize, #[case] end_row: usize, ) { let result = get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Strict, false); assert!(matches!( result.unwrap_err().kind, FastExcelErrorKind::UnsupportedColumnTypeCombination(_) )); } #[rstest] #[case(29.020000000000003, "29.02")] #[case(10000_f64, "10000")] #[case(23.0, "23")] fn test_excel_float_to_string(#[case] x: f64, #[case] expected: &str) { assert_eq!(excel_float_to_string(x), expected.to_string()); } } ================================================ FILE: src/types/dtype/python.rs ================================================ use arrow_schema::{DataType as ArrowDataType, TimeUnit}; use pyo3::{Borrowed, Bound, FromPyObject, IntoPyObject, PyAny, PyErr, Python, 
types::PyString}; use crate::{ error::{FastExcelErrorKind, py_errors::IntoPyResult}, types::dtype::{DType, DTypeCoercion, DTypeMap, DTypes}, }; impl<'py> IntoPyObject<'py> for DType { type Target = PyString; type Output = Bound<'py, Self::Target>; type Error = std::convert::Infallible; fn into_pyobject(self, py: Python<'py>) -> Result { self.to_string().into_pyobject(py) } } impl<'py> IntoPyObject<'py> for &DType { type Target = PyString; type Output = Bound<'py, Self::Target>; type Error = std::convert::Infallible; fn into_pyobject(self, py: Python<'py>) -> Result { self.to_string().into_pyobject(py) } } impl<'a, 'py> FromPyObject<'a, 'py> for DType { type Error = PyErr; fn extract(py_dtype: Borrowed<'a, 'py, PyAny>) -> Result { if let Ok(dtype_pystr) = py_dtype.extract::() { dtype_pystr.parse() } else { Err(FastExcelErrorKind::InvalidParameters(format!( "{py_dtype:?} cannot be converted to str" )) .into()) } .into_pyresult() } } impl<'a, 'py> FromPyObject<'a, 'py> for DTypes { type Error = PyErr; fn extract(py_dtypes: Borrowed<'a, 'py, PyAny>) -> Result { if let Ok(py_dtypes_str) = py_dtypes.extract::() { py_dtypes_str.parse() } else { Ok(DTypes::Map(py_dtypes.extract::()?)) } .into_pyresult() } } impl From<&DType> for ArrowDataType { fn from(dtype: &DType) -> Self { match dtype { DType::Null => ArrowDataType::Null, DType::Int => ArrowDataType::Int64, DType::Float => ArrowDataType::Float64, DType::String => ArrowDataType::Utf8, DType::Bool => ArrowDataType::Boolean, DType::DateTime => ArrowDataType::Timestamp(TimeUnit::Millisecond, None), DType::Date => ArrowDataType::Date32, DType::Duration => ArrowDataType::Duration(TimeUnit::Millisecond), } } } impl<'a, 'py> FromPyObject<'a, 'py> for DTypeCoercion { type Error = PyErr; fn extract(py_dtype_coercion: Borrowed<'a, 'py, PyAny>) -> Result { if let Ok(dtype_coercion_pystr) = py_dtype_coercion.extract::() { dtype_coercion_pystr.parse() } else { Err(FastExcelErrorKind::InvalidParameters(format!( "{py_dtype_coercion:?} 
cannot be converted to str" )) .into()) } .into_pyresult() } } ================================================ FILE: src/types/excelreader/mod.rs ================================================ #[cfg(feature = "python")] mod python; use std::{ fs::File, io::{BufReader, Cursor}, }; use calamine::{ Data, HeaderRow, Range, Reader, Sheet as CalamineSheet, Sheets, Table, open_workbook_auto, open_workbook_auto_from_rs, }; #[cfg(feature = "python")] use calamine::{DataRef, ReaderRef}; #[cfg(feature = "python")] use pyo3::pyclass; use crate::{ ExcelSheet, ExcelTable, error::{ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult}, types::{ dtype::{DTypeCoercion, DTypes}, excelsheet::{SelectedColumns, SkipRows}, idx_or_name::IdxOrName, }, }; use super::excelsheet::table::{extract_table_names, extract_table_range}; enum ExcelSheets { File(Sheets>), Bytes(Sheets>>), } impl ExcelSheets { fn worksheet_range(&mut self, name: &str) -> FastExcelResult> { match self { Self::File(sheets) => sheets.worksheet_range(name), Self::Bytes(sheets) => sheets.worksheet_range(name), } .map_err(|err| FastExcelErrorKind::CalamineError(err).into()) .with_context(|| format!("Error while loading sheet {name}")) } #[allow(dead_code)] fn sheet_metadata(&self) -> &[CalamineSheet] { match self { ExcelSheets::File(sheets) => sheets.sheets_metadata(), ExcelSheets::Bytes(sheets) => sheets.sheets_metadata(), } } fn table_names(&mut self, sheet_name: Option<&str>) -> FastExcelResult> { let names = match self { Self::File(sheets) => extract_table_names(sheets, sheet_name), Self::Bytes(sheets) => extract_table_names(sheets, sheet_name), }?; Ok(names.into_iter().map(String::as_str).collect()) } fn defined_names(&mut self) -> FastExcelResult> { let defined_names = match self { Self::File(sheets) => sheets.defined_names(), Self::Bytes(sheets) => sheets.defined_names(), } .to_vec() .into_iter() .map(|(name, formula)| DefinedName { name, formula }) .collect(); Ok(defined_names) } #[cfg(feature = 
"python")] fn supports_by_ref(&self) -> bool { matches!( self, Self::File(Sheets::Xlsx(_)) | Self::Bytes(Sheets::Xlsx(_)) ) } fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self { match self { Self::File(sheets) => { sheets.with_header_row(header_row); self } Self::Bytes(sheets) => { sheets.with_header_row(header_row); self } } } #[cfg(feature = "python")] fn worksheet_range_ref(&mut self, name: &str) -> FastExcelResult>> { match self { ExcelSheets::File(Sheets::Xlsx(sheets)) => Ok(sheets.worksheet_range_ref(name)?), ExcelSheets::Bytes(Sheets::Xlsx(sheets)) => Ok(sheets.worksheet_range_ref(name)?), _ => Err(FastExcelErrorKind::Internal( "sheets do not support worksheet_range_ref".to_string(), ) .into()), } .with_context(|| format!("Error while loading sheet {name}")) } fn get_table(&mut self, name: &str) -> FastExcelResult> { match self { Self::File(sheets) => extract_table_range(name, sheets), Self::Bytes(sheets) => extract_table_range(name, sheets), } } } #[derive(Debug, Clone, PartialEq, Eq)] #[cfg_attr(feature = "python", pyclass(name = "DefinedName", skip_from_py_object))] pub struct DefinedName { pub name: String, pub formula: String, } /// Options for loading a sheet or table. #[non_exhaustive] #[derive(Debug)] pub struct LoadSheetOrTableOptions { /// The index of the row containing the column labels. If `None`, the provided headers are used. /// Any row before the header row is skipped. pub header_row: Option, /// The column names to use. If `None`, the column names are inferred from the header row. pub column_names: Option>, /// How rows should be skipped. pub skip_rows: SkipRows, /// The number of rows to read. If `None`, all rows are read. pub n_rows: Option, /// The number of rows to sample for schema inference. If `None`, all rows are sampled. pub schema_sample_rows: Option, /// How data types should be coerced. pub dtype_coercion: DTypeCoercion, /// The columns to select. 
pub selected_columns: SelectedColumns, /// Override the inferred data types. pub dtypes: Option, /// Skip rows at the end of the sheet/table containing only whitespace and null values. pub skip_whitespace_tail_rows: bool, /// Consider cells containing only whitespace as null values. pub whitespace_as_null: bool, } impl LoadSheetOrTableOptions { /// Returns a `calamine::HeaderRow`, indicating the first row of the range to be read. For us, /// `header_row` can be `None` (meaning there is no header and we should start reading the data /// at the beginning of the sheet) fn calamine_header_row(&self) -> HeaderRow { match (self.header_row, &self.skip_rows) { (None | Some(0), SkipRows::SkipEmptyRowsAtBeginning) => HeaderRow::FirstNonEmptyRow, (None, _) => HeaderRow::Row(0), (Some(row), _) => HeaderRow::Row(row as u32), } } /// Returns the position of the header row within the loaded data: `Some(0)` when `header_row` /// is set, `None` otherwise pub(crate) fn data_header_row(&self) -> Option { self.header_row.and(Some(0)) } /// Returns a new `LoadSheetOrTableOptions` instance for loading a sheet. `header_row` is set to /// `Some(0)` pub fn new_for_sheet() -> Self { Self { header_row: Some(0), column_names: Default::default(), skip_rows: Default::default(), n_rows: Default::default(), schema_sample_rows: Default::default(), dtype_coercion: Default::default(), selected_columns: Default::default(), dtypes: Default::default(), skip_whitespace_tail_rows: Default::default(), whitespace_as_null: Default::default(), } } /// Returns a new `LoadSheetOrTableOptions` instance for loading a table.
`header_row` is set to /// `None` pub fn new_for_table() -> Self { Self { header_row: None, column_names: Default::default(), skip_rows: Default::default(), n_rows: Default::default(), schema_sample_rows: Default::default(), dtype_coercion: Default::default(), selected_columns: Default::default(), dtypes: Default::default(), skip_whitespace_tail_rows: Default::default(), whitespace_as_null: Default::default(), } } pub fn header_row(mut self, header_row: usize) -> Self { self.header_row = Some(header_row); self } pub fn no_header_row(mut self) -> Self { self.header_row = None; self } pub fn column_names>>( mut self, column_names: I, ) -> Self { self.column_names = Some(column_names.into_iter().map(Into::into).collect()); self } pub fn skip_rows(mut self, skip_rows: SkipRows) -> Self { self.skip_rows = skip_rows; self } pub fn n_rows(mut self, n_rows: usize) -> Self { self.n_rows = Some(n_rows); self } pub fn schema_sample_rows(mut self, schema_sample_rows: usize) -> Self { self.schema_sample_rows = Some(schema_sample_rows); self } pub fn dtype_coercion(mut self, dtype_coercion: DTypeCoercion) -> Self { self.dtype_coercion = dtype_coercion; self } pub fn selected_columns(mut self, selected_columns: SelectedColumns) -> Self { self.selected_columns = selected_columns; self } pub fn with_dtypes(mut self, dtypes: DTypes) -> Self { self.dtypes = Some(dtypes); self } pub fn skip_whitespace_tail_rows(mut self, skip_whitespace_tail_rows: bool) -> Self { self.skip_whitespace_tail_rows = skip_whitespace_tail_rows; self } pub fn whitespace_as_null(mut self, whitespace_as_null: bool) -> Self { self.whitespace_as_null = whitespace_as_null; self } } /// Represents an open Excel file and allows to access its sheets and tables. 
#[cfg_attr(feature = "python", pyclass(name = "_ExcelReader"))] pub struct ExcelReader { sheets: ExcelSheets, sheet_metadata: Vec, #[cfg(feature = "python")] source: String, } impl ExcelReader { // NOTE: Not implementing TryFrom here, because we aren't building the file from the passed // string, but rather from the file pointed to by it. Semantically, try_from_path is clearer pub(crate) fn try_from_path(path: &str) -> FastExcelResult { let sheets = open_workbook_auto(path) .map_err(|err| FastExcelErrorKind::CalamineError(err).into()) .with_context(|| format!("Could not open workbook at {path}"))?; let sheet_metadata = sheets.sheets_metadata().to_owned(); Ok(Self { sheets: ExcelSheets::File(sheets), sheet_metadata, #[cfg(feature = "python")] source: path.to_owned(), }) } fn find_sheet_meta(&self, idx_or_name: IdxOrName) -> FastExcelResult<&CalamineSheet> { match idx_or_name { IdxOrName::Name(name) => { if let Some(sheet) = self.sheet_metadata.iter().find(|s| s.name == name) { Ok(sheet) } else { Err(FastExcelErrorKind::SheetNotFound(IdxOrName::Name(name.clone())).into()).with_context(|| { let available_sheets = self.sheet_metadata.iter().map(|s| format!("\"{}\"", s.name)).collect::>().join(", "); format!( "Sheet \"{name}\" not found in file. Available sheets: {available_sheets}." ) }) } } IdxOrName::Idx(idx) => self .sheet_metadata .get(idx) .ok_or_else(|| FastExcelErrorKind::SheetNotFound(IdxOrName::Idx(idx)).into()) .with_context(|| { format!( "Sheet index {idx} is out of range. File has {} sheets.", self.sheet_metadata.len() ) }), } } /// Load a sheet from the Excel file.
pub fn load_sheet( &mut self, idx_or_name: IdxOrName, opts: LoadSheetOrTableOptions, ) -> FastExcelResult { let calamine_header_row = opts.calamine_header_row(); let sheet_meta = self.find_sheet_meta(idx_or_name)?.to_owned(); let range = self .sheets .with_header_row(calamine_header_row) .worksheet_range(&sheet_meta.name)?; ExcelSheet::try_new(sheet_meta, range.into(), opts) } /// Load a table from the Excel file. pub fn load_table( &mut self, name: &str, opts: LoadSheetOrTableOptions, ) -> FastExcelResult { let table = self.sheets.get_table(name)?; ExcelTable::try_new(table, opts) } pub fn sheet_names(&self) -> Vec<&str> { self.sheet_metadata .iter() .map(|s| s.name.as_str()) .collect() } pub fn table_names(&mut self, sheet_name: Option<&str>) -> FastExcelResult> { self.sheets.table_names(sheet_name) } pub fn defined_names(&mut self) -> FastExcelResult> { self.sheets.defined_names() } } impl TryFrom<&[u8]> for ExcelReader { type Error = FastExcelError; fn try_from(bytes: &[u8]) -> Result { let cursor = Cursor::new(bytes.to_vec()); let sheets = open_workbook_auto_from_rs(cursor) .map_err(|err| FastExcelErrorKind::CalamineError(err).into()) .with_context(|| "Could not open workbook from bytes")?; let sheet_metadata = sheets.sheets_metadata().to_owned(); Ok(Self { sheets: ExcelSheets::Bytes(sheets), sheet_metadata, #[cfg(feature = "python")] source: "bytes".to_owned(), }) } } ================================================ FILE: src/types/excelreader/python.rs ================================================ use arrow_array::RecordBatch; use pyo3::{Bound, IntoPyObjectExt, PyAny, PyResult, Python, pymethods, types::PyString}; use super::{DefinedName, ExcelReader}; use crate::{ ExcelSheet, data::{ExcelSheetData, record_batch_from_data_and_columns}, error::{ErrorContext, FastExcelErrorKind, FastExcelResult, py_errors::IntoPyResult}, types::{ dtype::{DTypeCoercion, DTypes}, excelreader::LoadSheetOrTableOptions, excelsheet::{ Header, Pagination, SelectedColumns, 
SkipRows, column_info::{build_available_columns_info, finalize_column_info}, }, idx_or_name::IdxOrName, }, utils::schema::get_schema_sample_rows, }; impl ExcelReader { fn build_selected_columns( use_columns: Option<&Bound<'_, PyAny>>, ) -> FastExcelResult { use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got {use_columns:?}")) } fn load_sheet_eager( data: &ExcelSheetData, opts: LoadSheetOrTableOptions, ) -> FastExcelResult { let data_header_row = opts.data_header_row(); let pagination = match &data { ExcelSheetData::Owned(range) => { Pagination::try_new(opts.skip_rows, opts.n_rows, range)? } ExcelSheetData::Ref(range) => Pagination::try_new(opts.skip_rows, opts.n_rows, range)?, }; let header = Header::new(data_header_row, opts.column_names); let offset = header.offset() + pagination.offset(); let limit = { let upper_bound = data.height(); if let Some(n_rows) = pagination.n_rows() { // minimum value between (offset+n_rows) and the data's height std::cmp::min(offset + n_rows, upper_bound) } else { upper_bound } }; let sample_rows_limit = get_schema_sample_rows(opts.schema_sample_rows, offset, limit); let available_columns_info = build_available_columns_info(data, &opts.selected_columns, &header)?; let final_columns_info = opts .selected_columns .select_columns(available_columns_info)?; let available_columns = finalize_column_info( final_columns_info, data, offset, sample_rows_limit, opts.dtypes.as_ref(), &opts.dtype_coercion, opts.whitespace_as_null, )?; match data { ExcelSheetData::Owned(data) => record_batch_from_data_and_columns( &available_columns, data, offset, limit, opts.whitespace_as_null, ), ExcelSheetData::Ref(data) => record_batch_from_data_and_columns( &available_columns, data, offset, limit, opts.whitespace_as_null, ), } } fn build_sheet<'py>( &mut self, idx_or_name: IdxOrName, opts: LoadSheetOrTableOptions, eager: bool, py: Python<'py>, ) -> PyResult> { 
let calamine_header_row = opts.calamine_header_row(); let sheet_meta = self .find_sheet_meta(idx_or_name) .into_pyresult()? .to_owned(); if eager && self.sheets.supports_by_ref() { let range = py .detach(|| { self.sheets .with_header_row(calamine_header_row) .worksheet_range_ref(&sheet_meta.name) }) .into_pyresult()?; let rb = py .detach(|| Self::load_sheet_eager(&range.into(), opts)) .into_pyresult()?; #[cfg(feature = "pyarrow")] { use arrow_pyarrow::ToPyArrow; rb.to_pyarrow(py) } #[cfg(not(feature = "pyarrow"))] { Err(pyo3::exceptions::PyRuntimeError::new_err( "Eager loading requires pyarrow feature. Use eager=False for PyCapsule interface.", )) } } else { let range = py .detach(|| { self.sheets .with_header_row(calamine_header_row) .worksheet_range(&sheet_meta.name) }) .into_pyresult()?; let sheet = ExcelSheet::try_new(sheet_meta, range.into(), opts).into_pyresult()?; if eager { #[cfg(feature = "pyarrow")] { sheet.to_arrow(py) } #[cfg(not(feature = "pyarrow"))] { Err(pyo3::exceptions::PyRuntimeError::new_err( "Eager loading requires pyarrow feature. Use eager=False for PyCapsule interface.", )) } } else { sheet.into_bound_py_any(py) } } } #[allow(clippy::too_many_arguments)] fn build_table<'py>( &mut self, name: &str, opts: LoadSheetOrTableOptions, eager: bool, py: Python<'py>, ) -> PyResult> { let excel_table = py.detach(|| self.load_table(name, opts)).into_pyresult()?; if eager { #[cfg(feature = "pyarrow")] { Ok(excel_table.to_arrow(py)?) } #[cfg(not(feature = "pyarrow"))] { Err(pyo3::exceptions::PyRuntimeError::new_err( "Eager loading requires pyarrow feature. 
Use eager=False for PyCapsule interface.", )) } } else { excel_table.into_bound_py_any(py) } } } #[pymethods] impl ExcelReader { pub fn __repr__(&self) -> String { format!("ExcelReader<{}>", &self.source) } #[pyo3(name = "table_names", signature = (sheet_name = None))] pub(crate) fn py_table_names(&mut self, sheet_name: Option<&str>) -> PyResult> { self.sheets.table_names(sheet_name).into_pyresult() } #[pyo3(name = "defined_names")] pub(crate) fn py_defined_names(&mut self) -> PyResult> { self.defined_names().into_pyresult() } #[pyo3(name = "load_sheet", signature = ( idx_or_name, *, header_row = 0, column_names = None, skip_rows = SkipRows::SkipEmptyRowsAtBeginning, n_rows = None, schema_sample_rows = 1_000, dtype_coercion = DTypeCoercion::Coerce, use_columns = None, dtypes = None, eager = false, skip_whitespace_tail_rows = false, whitespace_as_null = false, ))] #[allow(clippy::too_many_arguments)] pub(crate) fn py_load_sheet<'py>( &mut self, idx_or_name: &Bound<'py, PyAny>, header_row: Option, column_names: Option>, skip_rows: SkipRows, n_rows: Option, schema_sample_rows: Option, dtype_coercion: DTypeCoercion, use_columns: Option<&Bound<'py, PyAny>>, dtypes: Option, eager: bool, skip_whitespace_tail_rows: bool, whitespace_as_null: bool, py: Python<'py>, ) -> PyResult> { // Cannot use NonZeroUsize in the parameters, as it is not supported by pyo3 if let Some(0) = schema_sample_rows { return Err(FastExcelErrorKind::InvalidParameters( "schema_sample_rows cannot be 0, as it would prevent dtype inferring".to_string(), ) .into()) .into_pyresult(); } let idx_or_name = idx_or_name.try_into().into_pyresult()?; let selected_columns = Self::build_selected_columns(use_columns).into_pyresult()?; let opts = LoadSheetOrTableOptions { header_row, column_names, skip_rows, n_rows, schema_sample_rows, dtype_coercion, selected_columns, dtypes, skip_whitespace_tail_rows, whitespace_as_null, }; self.build_sheet(idx_or_name, opts, eager, py) } #[pyo3(name = "load_table", signature = ( 
name, *, header_row = 0, column_names = None, skip_rows = SkipRows::SkipEmptyRowsAtBeginning, n_rows = None, schema_sample_rows = 1_000, dtype_coercion = DTypeCoercion::Coerce, use_columns = None, dtypes = None, eager = false, skip_whitespace_tail_rows = false, whitespace_as_null = false, ))] #[allow(clippy::too_many_arguments)] pub(crate) fn py_load_table<'py>( &mut self, name: &Bound<'py, PyString>, header_row: Option, column_names: Option>, skip_rows: SkipRows, n_rows: Option, schema_sample_rows: Option, dtype_coercion: DTypeCoercion, use_columns: Option<&Bound<'py, PyAny>>, dtypes: Option, eager: bool, skip_whitespace_tail_rows: bool, whitespace_as_null: bool, py: Python<'py>, ) -> PyResult> { // Cannot use NonZeroUsize in the parameters, as it is not supported by pyo3 if let Some(0) = schema_sample_rows { return Err(FastExcelErrorKind::InvalidParameters( "schema_sample_rows cannot be 0, as it would prevent dtype inferring".to_string(), ) .into()) .into_pyresult(); } let selected_columns = Self::build_selected_columns(use_columns).into_pyresult()?; let opts = LoadSheetOrTableOptions { header_row, column_names, skip_rows, n_rows, schema_sample_rows, dtype_coercion, selected_columns, dtypes, skip_whitespace_tail_rows, whitespace_as_null, }; self.build_table(&name.to_string(), opts, eager, py) } #[getter("sheet_names")] pub(crate) fn py_sheet_names(&self) -> Vec<&str> { self.sheet_names() } } #[pymethods] impl DefinedName { /// Creates a new `DefinedName` object. 
#[new] pub fn py_new(name: String, formula: String) -> Self { DefinedName { name, formula } } #[getter("name")] pub fn py_name(&self) -> &str { &self.name } #[getter("formula")] pub fn py_formula(&self) -> &str { &self.formula } pub fn __repr__(&self) -> String { format!( "DefinedName<{name} ({formula})>", name = &self.name, formula = self .formula .get(..10) .map(|s| format!("{}...", s)) .as_deref() .unwrap_or(self.formula.as_str()) ) } pub fn __eq__(&self, other: &Self) -> bool { self == other } } ================================================ FILE: src/types/excelsheet/column_info/mod.rs ================================================ #[cfg(feature = "python")] mod python; use std::{fmt::Display, str::FromStr}; use calamine::DataType; #[cfg(feature = "python")] use pyo3::pyclass; use crate::{ data::ExcelSheetData, error::{ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult}, types::{ dtype::{DType, DTypeCoercion, DTypes, get_dtype_for_column}, idx_or_name::IdxOrName, }, }; use super::{Header, SelectedColumns}; /// How the column name was determined #[derive(Debug, Clone, PartialEq)] pub enum ColumnNameFrom { /// The column name was provided by the user. Provided, /// The column name was looked up in the sheet or table. LookedUp, /// The column name was generated based on the column index. Generated, } impl FromStr for ColumnNameFrom { type Err = FastExcelError; fn from_str(s: &str) -> FastExcelResult { match s { "provided" => Ok(Self::Provided), "looked_up" => Ok(Self::LookedUp), "generated" => Ok(Self::Generated), _ => Err( FastExcelErrorKind::InvalidParameters(format!("invalid ColumnNameFrom: {s}")) .into(), ), } } } impl Display for ColumnNameFrom { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_str(match self { ColumnNameFrom::Provided => "provided", ColumnNameFrom::LookedUp => "looked_up", ColumnNameFrom::Generated => "generated", }) } } /// How the data type was determined. 
#[derive(Debug, Clone, PartialEq)] pub enum DTypeFrom { /// The data type was provided for all columns. ProvidedForAll, /// The data type was provided via the column's index. ProvidedByIndex, /// The data type was provided via the column's name. ProvidedByName, /// The data type was guessed based on the column's data. Guessed, } impl Display for DTypeFrom { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_str(match self { DTypeFrom::ProvidedForAll => "provided_for_all", DTypeFrom::ProvidedByIndex => "provided_by_index", DTypeFrom::ProvidedByName => "provided_by_name", DTypeFrom::Guessed => "guessed", }) } } impl FromStr for DTypeFrom { type Err = FastExcelError; fn from_str(s: &str) -> FastExcelResult { match s { "provided_for_all" => Ok(Self::ProvidedForAll), "provided_by_index" => Ok(Self::ProvidedByIndex), "provided_by_name" => Ok(Self::ProvidedByName), "guessed" => Ok(Self::Guessed), _ => Err( FastExcelErrorKind::InvalidParameters(format!("invalid DTypesFrom: {s}")).into(), ), } } } // NOTE: The types for properties unfortunately do not appear in the docs for this class, so we had // to specify them via docstrings /// Metadata about a single column in a sheet. 
#[derive(Debug, Clone, PartialEq)] #[cfg_attr(feature = "python", pyclass(name = "ColumnInfo", skip_from_py_object))] pub struct ColumnInfo { /// The column's name pub name: String, /// The column's index pub index: usize, /// The column's absolute index pub absolute_index: usize, /// The column's data type pub dtype: DType, /// How the column name was determined pub column_name_from: ColumnNameFrom, /// How the column data type was determined pub dtype_from: DTypeFrom, } impl ColumnInfo { pub(crate) fn new( name: String, index: usize, absolute_index: usize, column_name_from: ColumnNameFrom, dtype: DType, dtype_from: DTypeFrom, ) -> Self { Self { name, index, absolute_index, dtype, column_name_from, dtype_from, } } } /// This class provides information about a single column in a sheet, without associated type /// information #[derive(Debug, Clone, PartialEq)] #[cfg_attr( feature = "python", pyclass(name = "ColumnInfoNoDtype", skip_from_py_object) )] pub(crate) struct ColumnInfoNoDtype { name: String, index: usize, absolute_index: usize, column_name_from: ColumnNameFrom, } // Allows us to easily compare ourselves to a column index or name impl PartialEq for ColumnInfoNoDtype { fn eq(&self, other: &IdxOrName) -> bool { match other { IdxOrName::Idx(index) => index == &self.index, IdxOrName::Name(name) => name == &self.name, } } } impl ColumnInfoNoDtype { pub(super) fn new( name: String, index: usize, absolute_index: usize, column_name_from: ColumnNameFrom, ) -> Self { Self { name, index, absolute_index, column_name_from, } } pub(super) fn with_name(mut self, name: String) -> Self { self.name = name; self } pub(super) fn name(&self) -> &str { &self.name } pub(super) fn absolute_index(&self) -> usize { self.absolute_index } fn dtype_info( &self, data: &D, start_row: usize, end_row: usize, specified_dtypes: Option<&DTypes>, dtype_coercion: &DTypeCoercion, whitespace_as_null: bool, ) -> FastExcelResult<(DType, DTypeFrom)> { specified_dtypes .and_then(|dtypes| { match 
dtypes { DTypes::All(dtype) => Some((*dtype, DTypeFrom::ProvidedForAll)), DTypes::Map(dtypes) => { // if we have dtypes, look the dtype up by index, and fall back on a lookup by name // (done in this order because copying an usize is cheaper than cloning a string) if let Some(dtype) = dtypes.get(&self.absolute_index().into()) { Some((*dtype, DTypeFrom::ProvidedByIndex)) } else { dtypes .get(&self.name.clone().into()) .map(|dtype| (*dtype, DTypeFrom::ProvidedByName)) } } } }) .map(FastExcelResult::Ok) // If we could not look up a dtype, guess it from the data .unwrap_or_else(|| { data.dtype_for_column( start_row, end_row, self.index, dtype_coercion, whitespace_as_null, ) .map(|dtype| (dtype, DTypeFrom::Guessed)) }) } pub(super) fn finish( self, data: &D, start_row: usize, end_row: usize, specified_dtypes: Option<&DTypes>, dtype_coercion: &DTypeCoercion, whitespace_as_null: bool, ) -> FastExcelResult { let (dtype, dtype_from) = self .dtype_info( data, start_row, end_row, specified_dtypes, dtype_coercion, whitespace_as_null, ) .with_context(|| format!("could not determine dtype for column {}", self.name))?; Ok(ColumnInfo::new( self.name, self.index, self.absolute_index, self.column_name_from, dtype, dtype_from, )) } } pub(crate) trait CalamineDataProvider { fn width(&self) -> usize; fn get_as_string(&self, pos: (usize, usize)) -> Option; fn dtype_for_column( &self, start_row: usize, end_row: usize, col: usize, dtype_coercion: &DTypeCoercion, whitespace_as_null: bool, ) -> FastExcelResult; fn start(&self) -> Option<(usize, usize)>; } impl CalamineDataProvider for ExcelSheetData<'_> { fn width(&self) -> usize { self.width() } fn get_as_string(&self, pos: (usize, usize)) -> Option { self.get_as_string(pos) } fn dtype_for_column( &self, start_row: usize, end_row: usize, col: usize, dtype_coercion: &DTypeCoercion, whitespace_as_null: bool, ) -> FastExcelResult { self.dtype_for_column(start_row, end_row, col, dtype_coercion, whitespace_as_null) } fn start(&self) -> 
Option<(usize, usize)> { self.start() } } impl CalamineDataProvider for calamine::Range { fn width(&self) -> usize { self.width() } fn get_as_string(&self, pos: (usize, usize)) -> Option { self.get(pos).and_then(|data| data.as_string()) } fn dtype_for_column( &self, start_row: usize, end_row: usize, col: usize, dtype_coercion: &DTypeCoercion, whitespace_as_null: bool, ) -> FastExcelResult { get_dtype_for_column( self, start_row, end_row, col, dtype_coercion, whitespace_as_null, ) } fn start(&self) -> Option<(usize, usize)> { self.start().map(|(r, c)| (r as usize, c as usize)) } } fn column_info_from_header( data: &D, selected_columns: &SelectedColumns, header: &Header, ) -> FastExcelResult> { let width = data.width(); let (_, col_off) = data.start().unwrap_or((0, 0)); match header { Header::None => Ok((0..width) .map(|col_idx| { ColumnInfoNoDtype::new( format!("__UNNAMED__{col_idx}"), col_idx, col_off + col_idx, ColumnNameFrom::Generated, ) }) .collect()), Header::At(row_idx) => Ok((0..width) .map(|col_idx| { data.get_as_string((*row_idx, col_idx)) .map(|col_name| { // Remove null bytes from column names to avoid CString panics in Arrow FFI. // // Excel strings (especially UTF-16 in .xls) may contain embedded nulls (`\0`) after // conversion to Rust `String`. Arrow’s C FFI uses `CString::new()`, which fails on // null bytes, causing panics. // // This strips nulls while keeping the readable content. 
let sanitized_col_name = col_name.replace('\0', ""); ColumnInfoNoDtype::new( sanitized_col_name, col_idx, col_off + col_idx, ColumnNameFrom::LookedUp, ) }) .unwrap_or_else(|| { ColumnInfoNoDtype::new( format!("__UNNAMED__{col_idx}"), col_idx, col_off + col_idx, ColumnNameFrom::Generated, ) }) }) .collect()), Header::With(names) => { if let SelectedColumns::Selection(column_selection) = selected_columns { if column_selection.len() != names.len() { return Err(FastExcelErrorKind::InvalidParameters( "column_names and use_columns must have the same length when a header is provided".to_string(), ) .into()); } let selected_indices = column_selection .iter() .map(|idx_or_name| { match idx_or_name { IdxOrName::Idx(idx) => Ok(*idx), IdxOrName::Name(name) => Err(FastExcelErrorKind::InvalidParameters( format!("use_columns can only contain integers when used with columns_names, got \"{name}\"") ) .into()), } }) .collect::>>()?; Ok((0..width) .map(|col_idx| { let absolute_col_idx = col_idx + col_off; let provided_name_opt = if let Some(pos_in_names) = selected_indices .iter() .position(|idx| *idx == absolute_col_idx) { names.get(pos_in_names).cloned() } else { None }; match provided_name_opt { Some(provided_name) => ColumnInfoNoDtype::new( provided_name, col_idx, col_off + col_idx, ColumnNameFrom::Provided, ), None => ColumnInfoNoDtype::new( format!("__UNNAMED__{col_idx}"), col_idx, col_off + col_idx, ColumnNameFrom::Generated, ), } }) .collect()) } else { let nameless_start_idx = names.len(); Ok(names .iter() .enumerate() .map(|(col_idx, name)| { ColumnInfoNoDtype::new( name.to_owned(), col_idx, col_off + col_idx, ColumnNameFrom::Provided, ) }) .chain((nameless_start_idx..width).map(|col_idx| { ColumnInfoNoDtype::new( format!("__UNNAMED__{col_idx}"), col_idx, col_off + col_idx, ColumnNameFrom::Generated, ) })) .collect()) } } } } /// Loads available columns and sets aliases in case of name conflicts pub(crate) fn build_available_columns_info( data: &D, selected_columns: 
&SelectedColumns, header: &Header, ) -> FastExcelResult> { column_info_from_header(data, selected_columns, header).map(set_aliases_for_columns_info) } fn set_aliases_for_columns_info(columns_info: Vec) -> Vec { let mut aliased_column_names = Vec::with_capacity(columns_info.len()); columns_info .into_iter() .map(|mut column_info_builder| { // Setting the right alias for every column let alias = alias_for_name(column_info_builder.name(), &aliased_column_names); if alias != column_info_builder.name() { column_info_builder = column_info_builder.with_name(alias.clone()); } aliased_column_names.push(alias); column_info_builder }) .collect() } fn alias_for_name(name: &str, existing_names: &[String]) -> String { #[inline] fn rec(name: &str, existing_names: &[String], depth: usize) -> String { let alias = if depth == 0 { name.to_owned() } else { format!("{name}_{depth}") }; match existing_names .iter() .any(|existing_name| existing_name == &alias) { true => rec(name, existing_names, depth + 1), false => alias, } } rec(name, existing_names, 0) } /// Turns `ColumnInfoNoDtype` into `ColumnInfo`. This will determine the right dtype when needed pub(crate) fn finalize_column_info( available_columns_info: Vec, data: &D, start_row: usize, end_row: usize, specified_dtypes: Option<&DTypes>, dtype_coercion: &DTypeCoercion, whitespace_as_null: bool, ) -> FastExcelResult> { available_columns_info .into_iter() .map(|column_info_builder| { column_info_builder.finish( data, start_row, end_row, specified_dtypes, dtype_coercion, whitespace_as_null, ) }) .collect() } #[derive(Debug)] pub(crate) enum AvailableColumns { Pending, Loaded(Vec), } impl AvailableColumns { pub(crate) fn as_loaded(&self) -> FastExcelResult<&[ColumnInfo]> { match self { AvailableColumns::Loaded(column_infos) => Ok(column_infos), AvailableColumns::Pending => Err(FastExcelErrorKind::Internal(format!( "Expected available columns to be loaded, got {self:?}. 
\ This is a bug, please report it to the fastexcel repository" )) .into()), } } } ================================================ FILE: src/types/excelsheet/column_info/python.rs ================================================ use arrow_schema::Field; use pyo3::{PyResult, pymethods}; use crate::{ error::py_errors::IntoPyResult, types::excelsheet::column_info::{ColumnInfo, ColumnInfoNoDtype}, }; impl From<&ColumnInfo> for Field { fn from(col_info: &ColumnInfo) -> Self { Field::new(&col_info.name, (&col_info.dtype).into(), true) } } #[pymethods] impl ColumnInfo { /// Creates a new ColumnInfo object. /// /// - `name`: `str`. The name of the column /// - `index`: `int`. The index of the column. Must be >=0 /// - `absolute_index`: `int`. The absolute index of the column. Must be >=0 /// - `column_name_from`: `fastexcel.ColumnNameFrom`. The origin of the column name /// - `dtype`: `fastexcel.DType`. The dtype of the column /// - `dtype_from`: `fastexcel.DTypeFrom`. The origin of the dtype for the column #[new] pub(crate) fn py_new( name: String, index: usize, absolute_index: usize, column_name_from: &str, dtype: &str, dtype_from: &str, ) -> PyResult { Ok(Self::new( name, index, absolute_index, column_name_from.parse().into_pyresult()?, dtype.parse().into_pyresult()?, dtype_from.parse().into_pyresult()?, )) } /// `fastexcel.DType`. The dtype of the column #[getter(dtype)] fn get_dtype(&self) -> String { self.dtype.to_string() } #[getter("name")] /// `str`. The name of the column pub fn py_name(&self) -> &str { &self.name } #[getter("index")] /// `int`. The index of the column pub fn py_index(&self) -> usize { self.index } #[getter("absolute_index")] /// `int`. The absolute index of the column pub fn py_absolute_index(&self) -> usize { self.absolute_index } /// `fastexcel.ColumnNameFrom`. How the name of the column was determined. 
/// /// One of three possible values: /// - `"provided"`: The column name was provided via the `column_names` parameter /// - `"looked_up"`: The column name was looked up from the data found in the sheet /// - `"generated"`: The column name was generated from the column index, either because /// `header_row` was `None`, or because it could not be looked up #[getter(column_name_from)] fn get_colum_name_from(&self) -> String { self.column_name_from.to_string() } /// `fastexcel.DTypeFrom`. How the dtype of the column was determined. /// /// One of four possible values: /// - `"provided_for_all"`: The dtype was specified for all columns /// - `"provided_by_index"`: The dtype was specified via the column index /// - `"provided_by_name"`: The dtype was specified via the column name /// - `"guessed"`: The dtype was determined from the content of the column #[getter(dtype_from)] fn get_dtype_from(&self) -> String { self.dtype_from.to_string() } pub fn __repr__(&self) -> String { format!( "ColumnInfo(name=\"{name}\", index={index}, absolute_index={absolute_index}, dtype=\"{dtype}\", dtype_from=\"{dtype_from}\", column_name_from=\"{column_name_from}\" )", name = self.name, index = self.index, absolute_index = self.absolute_index, dtype = self.dtype, dtype_from = self.dtype_from, column_name_from = self.column_name_from ) } pub fn __eq__(&self, other: &Self) -> bool { self == other } } #[pymethods] impl ColumnInfoNoDtype { #[getter("name")] /// `str`. The name of the column pub fn py_name(&self) -> &str { &self.name } #[getter("index")] /// `int`. The index of the column pub fn py_index(&self) -> usize { self.index } #[getter("absolute_index")] /// `int`.
The absolute index of the column pub fn py_absolute_index(&self) -> usize { self.absolute_index } } ================================================ FILE: src/types/excelsheet/mod.rs ================================================ pub(crate) mod column_info; #[cfg(feature = "polars")] mod polars; #[cfg(feature = "python")] mod python; pub(crate) mod table; #[cfg(feature = "python")] use std::sync::Arc; use std::{cmp, collections::HashSet, fmt::Debug, str::FromStr}; use calamine::{CellType, Range, Sheet as CalamineSheet, SheetVisible as CalamineSheetVisible}; use column_info::{AvailableColumns, ColumnInfoNoDtype}; #[cfg(feature = "polars")] use polars_core::frame::DataFrame; #[cfg(feature = "python")] use pyo3::{Py, PyAny, Python, pyclass}; use self::column_info::{ColumnInfo, build_available_columns_info, finalize_column_info}; use crate::utils::schema::get_schema_sample_rows; use crate::{ LoadSheetOrTableOptions, data::{ExcelSheetData, FastExcelColumn}, error::{ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult}, types::{dtype::DTypes, idx_or_name::IdxOrName}, }; #[cfg(feature = "python")] pub(crate) use python::{CellError, CellErrors}; #[derive(Debug)] pub(crate) enum Header { None, At(usize), With(Vec), } impl Header { pub(crate) fn new(header_row: Option, column_names: Option>) -> Self { match column_names { Some(headers) => Header::With(headers), None => match header_row { Some(row) => Header::At(row), None => Header::None, }, } } pub(crate) fn offset(&self) -> usize { match self { Header::At(index) => index + 1, Header::None => 0, Header::With(_) => 0, } } } #[derive(Debug, Clone)] #[cfg_attr(not(feature = "python"), derive(PartialEq, Eq))] pub(crate) struct Pagination { skip_rows: SkipRows, n_rows: Option, } /// How rows should be skipped. #[derive(Debug, Default, Clone)] #[cfg_attr(not(feature = "python"), derive(PartialEq, Eq))] pub enum SkipRows { /// Skip a fixed number of rows. Simple(usize), /// Skip rows based on a list of row indices. 
List(HashSet),
    #[cfg(feature = "python")]
    Callable(Arc>),
    /// Skip empty rows at the beginning of the file (default).
    #[default]
    SkipEmptyRowsAtBeginning,
}

// NOTE(review): the generic arguments below (`Option<usize>`, `Vec<IdxOrName>`, `Py<PyAny>`, ...)
// were missing from the extracted text and have been restored from how the values are used in
// this file. Confirm them against the repository before merging.

impl SkipRows {
    /// Returns the number of leading rows to skip when it is a plain, statically known count.
    ///
    /// [`SkipRows::Simple`] yields its count, [`SkipRows::SkipEmptyRowsAtBeginning`] yields
    /// `Some(0)`, and every other variant yields `None`.
    pub(crate) fn simple_offset(&self) -> Option<usize> {
        match self {
            SkipRows::Simple(offset) => Some(*offset),
            // Let calamine's FirstNonEmptyRow handle it
            SkipRows::SkipEmptyRowsAtBeginning => Some(0),
            _ => None,
        }
    }
}

impl Pagination {
    /// Builds a `Pagination`, validating `skip_rows` against the height of `range`.
    ///
    /// # Errors
    ///
    /// Returns `InvalidParameters` when `skip_rows` is [`SkipRows::Simple`] and asks to skip more
    /// rows than `range` contains. Other `skip_rows` variants are not validated here.
    pub(crate) fn try_new<CT: calamine::CellType>(
        skip_rows: SkipRows,
        n_rows: Option<usize>,
        range: &Range<CT>,
    ) -> FastExcelResult<Self> {
        let max_height = range.height();
        // Only validate for simple skip_rows case
        if let SkipRows::Simple(skip_count) = &skip_rows {
            if max_height < *skip_count {
                return Err(FastExcelErrorKind::InvalidParameters(format!(
                    "Too many rows skipped. Max height is {max_height}"
                ))
                .into());
            }
        }
        Ok(Self { skip_rows, n_rows })
    }

    /// Number of rows skipped up front; `0` when `skip_rows` has no simple offset.
    pub(crate) fn offset(&self) -> usize {
        self.skip_rows.simple_offset().unwrap_or(0)
    }

    /// Maximum number of rows to load, if one was requested.
    pub(crate) fn n_rows(&self) -> Option<usize> {
        self.n_rows
    }

    /// The row-skipping strategy this pagination was built with.
    pub(crate) fn skip_rows(&self) -> &SkipRows {
        &self.skip_rows
    }
}

/// The columns to load from a sheet or table.
#[derive(Default)]
pub enum SelectedColumns {
    /// Every available column.
    #[default]
    All,
    /// An explicit list of column indices and/or names.
    Selection(Vec<IdxOrName>),
    /// A Python callable, called once per available column, returning whether to keep it.
    #[cfg(feature = "python")]
    DynamicSelection(Py<PyAny>),
    /// A selection containing open-ended ranges, resolved once the available columns are known.
    DeferredSelection(Vec<DeferredColumnSelection>),
}

/// One element of a [`SelectedColumns::DeferredSelection`].
#[derive(Debug, Clone, PartialEq)]
pub enum DeferredColumnSelection {
    Fixed(IdxOrName),
    /// start column index, end is determined by sheet width
    OpenEndedRange(usize),
    /// end column index, start is 0
    FromBeginningRange(usize),
}

impl std::fmt::Debug for SelectedColumns {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::All => write!(f, "All"),
            Self::Selection(selection) => write!(f, "Selection({selection:?})"),
            #[cfg(feature = "python")]
            Self::DynamicSelection(func) => {
                // Print the address of the Python object itself, not of the Rust handle, so the
                // output identifies the callable regardless of which handle points to it.
                let addr = func.as_ptr() as usize;
                write!(f, "DynamicSelection({addr})")
            }
            Self::DeferredSelection(deferred) => write!(f, "DeferredSelection({deferred:?})"),
        }
    }
}

impl PartialEq for SelectedColumns {
    fn eq(&self, other: &Self) -> bool {
        match (self, other) {
            (Self::All, Self::All) => true,
            (Self::Selection(selection), Self::Selection(other_selection)) => {
                selection == other_selection
            }
            // Two dynamic selections are equal when they wrap the very same Python object.
            // Comparing the addresses of the `Py` handles would make two handles to the same
            // callable compare unequal.
            #[cfg(feature = "python")]
            (Self::DynamicSelection(f1), Self::DynamicSelection(f2)) => {
                f1.as_ptr() == f2.as_ptr()
            }
            (Self::DeferredSelection(deferred1), Self::DeferredSelection(deferred2)) => {
                deferred1 == deferred2
            }
            _ => false,
        }
    }
}

/// Resolves a deferred selection into concrete column indices / names.
///
/// `max_col_index` is the index of the last available column: open-ended ranges run up to it
/// (inclusive) and from-beginning ranges are clamped to it. An open-ended range starting after
/// `max_col_index` resolves to no columns.
pub(crate) fn deferred_selection_to_concrete(
    deferred_selection: &[DeferredColumnSelection],
    max_col_index: usize,
) -> Vec<IdxOrName> {
    let mut resolved_indices = Vec::new();

    for deferred in deferred_selection {
        match deferred {
            DeferredColumnSelection::Fixed(idx_or_name) => {
                resolved_indices.push(idx_or_name.clone());
            }
            DeferredColumnSelection::OpenEndedRange(start_idx) => {
                // Add all columns from start_idx to the end
                resolved_indices.extend((*start_idx..=max_col_index).map(IdxOrName::Idx));
            }
            DeferredColumnSelection::FromBeginningRange(end_idx) => {
                // Add all columns from 0 to end_idx (inclusive)
                let actual_end = (*end_idx).min(max_col_index);
                resolved_indices.extend((0..=actual_end).map(IdxOrName::Idx));
            }
        }
    }

    resolved_indices
}

impl SelectedColumns {
    /// Filters (and, for explicit selections, reorders) `available_columns` according to `self`.
    ///
    /// # Errors
    ///
    /// * `ColumnNotFound` when an explicitly selected index or name matches no available column.
    /// * `InvalidParameters` when a dynamic selection callable fails or does not return a bool.
    pub(super) fn select_columns(
        &self,
        available_columns: Vec<ColumnInfoNoDtype>,
    ) -> FastExcelResult<Vec<ColumnInfoNoDtype>> {
        match self {
            SelectedColumns::All => Ok(available_columns),
            SelectedColumns::Selection(selection) => {
                let selected_indices: Vec<usize> = selection
                    .iter()
                    .map(|selected_column| {
                        let position = match selected_column {
                            IdxOrName::Idx(index) => available_columns
                                .iter()
                                // Sheets have absolute column names (A, B, C, ...)
                                .position(|col_info| col_info.absolute_index() == *index),
                            IdxOrName::Name(name) => available_columns
                                .iter()
                                .position(|col_info| col_info.name() == name.as_str()),
                        };
                        position
                            .ok_or_else(|| {
                                FastExcelErrorKind::ColumnNotFound(selected_column.clone()).into()
                            })
                            .with_context(|| {
                                format!("available columns are: {available_columns:?}")
                            })
                    })
                    .collect::<FastExcelResult<_>>()?;

                // We need to sort `available_columns` based on the order of the provided
                // selection. First, we associate every element in the Vec with its position in
                // the selection, and we filter out unselected columns
                let mut cols: Vec<(usize, ColumnInfoNoDtype)> = available_columns
                    .into_iter()
                    .enumerate()
                    .filter_map(|(idx, elem)| {
                        selected_indices
                            .iter()
                            .position(|selected_idx| *selected_idx == idx)
                            .map(|position| (position, elem))
                    })
                    .collect();

                // Then, we sort the columns based on their position in the selection
                cols.sort_by_key(|(pos, _elem)| *pos);

                // And finally, we drop the positions
                Ok(cols.into_iter().map(|(_pos, elem)| elem).collect())
            }
            #[cfg(feature = "python")]
            SelectedColumns::DynamicSelection(use_col_func) => Python::attach(|py| {
                available_columns
                    .into_iter()
                    .filter_map(
                        |col_info| match use_col_func.call1(py, (col_info.clone(),)) {
                            Err(err) => Some(Err(FastExcelErrorKind::InvalidParameters(format!(
                                "`use_columns` callable could not be called ({err})"
                            ))
                            .into())),
                            Ok(should_use_col) => match should_use_col.extract::<bool>(py) {
                                Err(_) => Some(Err(FastExcelErrorKind::InvalidParameters(
                                    "`use_columns` callable should return a boolean".to_string(),
                                )
                                .into())),
                                Ok(true) => Some(Ok(col_info)),
                                Ok(false) => None,
                            },
                        },
                    )
                    .collect()
            }),
            SelectedColumns::DeferredSelection(deferred_selection) => {
                let max_col_index = available_columns
                    .last()
                    .map_or(0, |col| col.absolute_index());
                let concrete_selection = SelectedColumns::Selection(
                    deferred_selection_to_concrete(deferred_selection, max_col_index),
                );
                concrete_selection.select_columns(available_columns)
            }
        }
    }

    const ALPHABET: [char; 26] = [
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
        'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    ];

    /// Converts an upper-case column name (`"A"`, `"AB"`, ...) into its 0-based column index.
    ///
    /// # Errors
    ///
    /// Returns `InvalidParameters` when `col` is empty, contains a character outside `A..=Z`, or
    /// is so long that its index does not fit in a `usize`.
    fn col_idx_for_col_as_letter(col: &str) -> FastExcelResult<usize> {
        use FastExcelErrorKind::InvalidParameters;

        if col.is_empty() {
            return Err(InvalidParameters(
                "a column should have at least one character, got none".to_string(),
            )
            .into());
        }

        // The column name comes from user input, so the arithmetic below must not overflow
        // silently (release) or panic (debug) on absurdly long names.
        let too_long = || {
            FastExcelError::from(InvalidParameters(format!(
                "column name is too long to be converted to an index: {col}"
            )))
        };

        col.chars()
            // iterating over all chars reversed, to have a power based on their rank
            .rev()
            .enumerate()
            // Parses every char, checks its position and adds its numeric equivalent based on
            // its rank. For example, AB becomes 27 (26 + 1)
            .try_fold(0usize, |acc, (rank, col_chr)| {
                let pos_in_alphabet = Self::ALPHABET
                    .iter()
                    .position(|chr| *chr == col_chr)
                    .ok_or_else(|| {
                        FastExcelError::from(InvalidParameters(format!(
                            "Char is not a valid column name: {col_chr}"
                        )))
                    })?;

                let value = match rank {
                    // in case it's the last char, just return its position
                    0 => pos_in_alphabet,
                    // otherwise, 26^rank * (position + 1)
                    // For example, CBA is 2080:
                    // A -> 0
                    // B -> 52 (26^1 * (1 + 1))
                    // C -> 2028 (26^2 * (2 + 1))
                    _ => u32::try_from(rank)
                        .ok()
                        .and_then(|exponent| 26usize.checked_pow(exponent))
                        .and_then(|weight| weight.checked_mul(pos_in_alphabet + 1))
                        .ok_or_else(too_long)?,
                };

                acc.checked_add(value).ok_or_else(too_long)
            })
    }

    /// Checks that `start` comes strictly before `end` and returns the inclusive index range.
    ///
    /// `col_range` is only used to build the error message.
    fn ordered_index_range(
        start: usize,
        end: usize,
        col_range: &str,
    ) -> FastExcelResult<std::ops::RangeInclusive<usize>> {
        use FastExcelErrorKind::InvalidParameters;

        match start.cmp(&end) {
            cmp::Ordering::Less => Ok(start..=end),
            cmp::Ordering::Greater => Err(InvalidParameters(format!(
                "end of range is before start: \"{col_range}\""
            ))
            .into()),
            cmp::Ordering::Equal => {
                Err(InvalidParameters(format!("empty range: \"{col_range}\"")).into())
            }
        }
    }

    /// Resolves a closed letter range (`"B:E"`) into the list of 0-based indices it covers.
    ///
    /// # Errors
    ///
    /// Returns `InvalidParameters` when the range does not have exactly two elements, when an
    /// element is not a valid column name, when the end is empty (open-ended ranges must go
    /// through `col_selection_for_letter_range`), or when `start >= end`.
    fn col_indices_for_letter_range(col_range: &str) -> FastExcelResult<Vec<usize>> {
        use FastExcelErrorKind::InvalidParameters;

        let col_elements = col_range.split(':').collect::<Vec<_>>();
        let [start_element, end_element] = col_elements.as_slice() else {
            return Err(InvalidParameters(format!(
                "expected range to contain exactly 2 elements, got {n_elements}: \"{col_range}\"",
                n_elements = col_elements.len()
            ))
            .into());
        };

        let start = Self::col_idx_for_col_as_letter(start_element)
            .with_context(|| format!("invalid start element for range \"{col_range}\""))?;

        // Check if this is an open-ended range (empty end element)
        if end_element.is_empty() {
            // For open-ended ranges, we can't return concrete indices yet
            // This will be handled differently in the parsing logic
            return Err(InvalidParameters(format!(
                "open-ended range detected: \"{col_range}\". This should be handled by col_selection_for_letter_range"
            ))
            .into());
        }

        let end = Self::col_idx_for_col_as_letter(end_element)
            .with_context(|| format!("invalid end element for range \"{col_range}\""))?;

        Self::ordered_index_range(start, end, col_range).map(Iterator::collect)
    }

    /// Resolves a letter range that may be open on one side (`"B:"`, `":E"`, `"B:E"`) into
    /// deferred column selections.
    ///
    /// # Errors
    ///
    /// Returns `InvalidParameters` when the range does not have exactly two elements, when both
    /// elements are empty, when an element is not a valid column name, or when `start >= end`.
    fn col_selection_for_letter_range(
        col_range: &str,
    ) -> FastExcelResult<Vec<DeferredColumnSelection>> {
        use FastExcelErrorKind::InvalidParameters;

        let col_elements = col_range.split(':').collect::<Vec<_>>();
        let [start_element, end_element] = col_elements.as_slice() else {
            return Err(InvalidParameters(format!(
                "expected range to contain exactly 2 elements, got {n_elements}: \"{col_range}\"",
                n_elements = col_elements.len()
            ))
            .into());
        };

        // Check if this is a from-beginning range (empty start element)
        if start_element.is_empty() {
            if end_element.is_empty() {
                return Err(InvalidParameters(format!(
                    "cannot have both start and end empty in range: \"{col_range}\""
                ))
                .into());
            }
            let end = Self::col_idx_for_col_as_letter(end_element)
                .with_context(|| format!("invalid end element for range \"{col_range}\""))?;
            return Ok(vec![DeferredColumnSelection::FromBeginningRange(end)]);
        }

        let start = Self::col_idx_for_col_as_letter(start_element)
            .with_context(|| format!("invalid start element for range \"{col_range}\""))?;

        // Check if this is an open-ended range (empty end element)
        if end_element.is_empty() {
            return Ok(vec![DeferredColumnSelection::OpenEndedRange(start)]);
        }

        let end = Self::col_idx_for_col_as_letter(end_element)
            .with_context(|| format!("invalid end element for range \"{col_range}\""))?;

        Self::ordered_index_range(start, end, col_range).map(|indices| {
            indices
                .map(|idx| DeferredColumnSelection::Fixed(IdxOrName::Idx(idx)))
                .collect()
        })
    }
}

impl FromStr for SelectedColumns {
    type Err = FastExcelError;

    /// Parses a comma-separated, case-insensitive list of column letters and letter ranges
    /// (e.g. `"A,C:E"`, `"B:"`, `":D"`).
    ///
    /// When at least one range is open on one side the result is a
    /// [`SelectedColumns::DeferredSelection`] kept in input order. Otherwise it is a
    /// [`SelectedColumns::Selection`] of sorted, de-duplicated indices.
    fn from_str(s: &str) -> FastExcelResult<Self> {
        let uppercase_s = s.to_uppercase();
        let parts: Vec<&str> = uppercase_s.split(',').collect();

        // A part starting or ending with ':' necessarily contains one
        let has_open_ended = parts
            .iter()
            .any(|part| part.starts_with(':') || part.ends_with(':'));

        if has_open_ended {
            // Use deferred selection logic
            let mut deferred_selections = Vec::new();
            for part in &parts {
                if part.contains(':') {
                    deferred_selections.extend(Self::col_selection_for_letter_range(part)?);
                } else {
                    let idx = Self::col_idx_for_col_as_letter(part)?;
                    deferred_selections.push(DeferredColumnSelection::Fixed(IdxOrName::Idx(idx)));
                }
            }
            Ok(Self::DeferredSelection(deferred_selections))
        } else {
            // Use the original immediate resolution logic for backwards compatibility
            let mut unique_col_indices: HashSet<usize> = HashSet::new();
            for col_or_range in &parts {
                if col_or_range.contains(':') {
                    unique_col_indices.extend(Self::col_indices_for_letter_range(col_or_range)?);
                } else {
                    unique_col_indices.insert(Self::col_idx_for_col_as_letter(col_or_range)?);
                }
            }

            let mut sorted_col_indices: Vec<usize> = unique_col_indices.into_iter().collect();
            sorted_col_indices.sort_unstable();

            Ok(Self::Selection(
                sorted_col_indices.into_iter().map(IdxOrName::Idx).collect(),
            ))
        }
    }
}

/// Visibility of a sheet.
#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum SheetVisible { Visible, Hidden, VeryHidden, } impl From for SheetVisible { fn from(value: CalamineSheetVisible) -> Self { match value { CalamineSheetVisible::Visible => SheetVisible::Visible, CalamineSheetVisible::Hidden => SheetVisible::Hidden, CalamineSheetVisible::VeryHidden => SheetVisible::VeryHidden, } } } /// A single sheet in an Excel file. #[derive(Debug)] #[cfg_attr(feature = "python", pyclass(name = "_ExcelSheet"))] pub struct ExcelSheet { sheet_meta: CalamineSheet, header: Header, pagination: Pagination, data: ExcelSheetData<'static>, height: Option, total_height: Option, width: Option, limit: usize, opts: LoadSheetOrTableOptions, selected_columns: Vec, available_columns: AvailableColumns, } impl ExcelSheet { pub(crate) fn data(&self) -> &ExcelSheetData<'_> { &self.data } pub(crate) fn try_new( sheet_meta: CalamineSheet, data: ExcelSheetData<'static>, opts: LoadSheetOrTableOptions, ) -> FastExcelResult { let header = Header::new(opts.data_header_row(), opts.column_names.clone()); let available_columns_info = build_available_columns_info(&data, &opts.selected_columns, &header)?; let selected_columns_info = opts .selected_columns .select_columns(available_columns_info)?; let pagination = match &data { ExcelSheetData::Owned(range) => { Pagination::try_new(opts.skip_rows.clone(), opts.n_rows, range)? } ExcelSheetData::Ref(range) => { Pagination::try_new(opts.skip_rows.clone(), opts.n_rows, range)? 
} }; let mut sheet = ExcelSheet { sheet_meta, header, pagination, data, opts, height: None, total_height: None, width: None, // Will be replaced limit: 0, available_columns: AvailableColumns::Pending, // Empty vec as It'll be replaced selected_columns: Vec::with_capacity(0), }; sheet.limit = sheet.compute_limit(); // Finalizing column info (figure out dtypes for every column) let row_limit = sheet.schema_sample_rows(); let selected_columns = finalize_column_info( selected_columns_info, &sheet.data, sheet.offset(), row_limit, sheet.opts.dtypes.as_ref(), &sheet.opts.dtype_coercion, sheet.opts.whitespace_as_null, )?; sheet.selected_columns = selected_columns; Ok(sheet) } fn ensure_available_columns_loaded(&mut self) -> FastExcelResult<()> { let available_columns = match &self.available_columns { AvailableColumns::Pending => { let available_columns_info = build_available_columns_info( &self.data, &self.opts.selected_columns, &self.header, )?; let final_info = finalize_column_info( available_columns_info, self.data(), self.offset(), self.limit(), self.opts.dtypes.as_ref(), &self.opts.dtype_coercion, self.opts.whitespace_as_null, )?; AvailableColumns::Loaded(final_info) } AvailableColumns::Loaded(_) => return Ok(()), }; self.available_columns = available_columns; Ok(()) } fn load_available_columns(&mut self) -> FastExcelResult<&[ColumnInfo]> { self.ensure_available_columns_loaded()?; self.available_columns.as_loaded() } fn compute_limit(&self) -> usize { let upper_bound = if self.opts.skip_whitespace_tail_rows { self.data.height_without_tail_whitespace() } else { self.data.height() }; if let Some(n_rows) = self.pagination.n_rows { let limit = self.offset() + n_rows; if limit < upper_bound { return limit; } } upper_bound } pub(crate) fn limit(&self) -> usize { self.limit } pub(crate) fn schema_sample_rows(&self) -> usize { get_schema_sample_rows(self.opts.schema_sample_rows, self.offset(), self.limit()) } pub fn width(&mut self) -> usize { self.width.unwrap_or_else(|| { 
let width = self.data.width(); self.width = Some(width); width }) } pub fn height(&mut self) -> usize { self.height.unwrap_or_else(|| { use crate::data::generate_row_selector; let height = generate_row_selector(self.pagination.skip_rows(), self.offset(), self.limit()) .map(|selector| selector.len()) .unwrap_or_else(|_| self.limit() - self.offset()); self.height = Some(height); height }) } pub fn total_height(&mut self) -> usize { self.total_height.unwrap_or_else(|| { let total_height = self.data.height() - self.header.offset(); self.total_height = Some(total_height); total_height }) } pub fn offset(&self) -> usize { self.header.offset() + self.pagination.offset() } pub fn selected_columns(&self) -> &Vec { &self.selected_columns } pub fn available_columns(&mut self) -> FastExcelResult> { self.load_available_columns().map(|cols| cols.to_vec()) } pub fn specified_dtypes(&self) -> Option<&DTypes> { self.opts.dtypes.as_ref() } pub fn name(&self) -> &str { &self.sheet_meta.name } pub fn visible(&self) -> SheetVisible { self.sheet_meta.visible.into() } pub fn to_columns(&self) -> FastExcelResult> { self.selected_columns .iter() .map(|column_info| { let offset = self.offset(); let limit = self.limit(); let whitespace_as_null = self.opts.whitespace_as_null; match self.data() { ExcelSheetData::Owned(range) => FastExcelColumn::try_from_column_info( column_info, range, offset, limit, whitespace_as_null, ), ExcelSheetData::Ref(range) => FastExcelColumn::try_from_column_info( column_info, range, offset, limit, whitespace_as_null, ), } }) .collect() } #[cfg(feature = "polars")] pub fn to_polars(&self) -> FastExcelResult { let pl_columns = self.to_columns()?.into_iter().map(Into::into).collect(); DataFrame::new_infer_height(pl_columns).map_err(|err| { FastExcelErrorKind::Internal(format!("could not create DataFrame: {err:?}")).into() }) } } #[cfg(feature = "__pyo3-tests")] #[cfg(test)] mod tests { use super::*; use pretty_assertions::assert_eq; use pyo3::{ prelude::PyListMethods, 
types::{PyList, PyString}, }; use rstest::rstest; #[test] fn selected_columns_from_none() { assert_eq!( TryInto::::try_into(None).unwrap(), SelectedColumns::All ) } #[test] fn selected_columns_from_list_of_valid_ints() { Python::attach(|py| { let py_list = PyList::new(py, vec![0, 1, 2]).expect("could not create PyList"); assert_eq!( TryInto::::try_into(Some(py_list.as_ref())).unwrap(), SelectedColumns::Selection([0, 1, 2].into_iter().map(IdxOrName::Idx).collect()) ) }); } #[test] fn selected_columns_from_list_of_valid_strings() { Python::attach(|py| { let py_list = PyList::new(py, vec!["foo", "bar"]).expect("could not create PyList"); assert_eq!( TryInto::::try_into(Some(py_list.as_ref())).unwrap(), SelectedColumns::Selection( ["foo", "bar"] .iter() .map(ToString::to_string) .map(IdxOrName::Name) .collect() ) ) }); } #[test] fn selected_columns_from_list_of_valid_strings_and_ints() { Python::attach(|py| { let py_list = PyList::new(py, vec!["foo", "bar"]).expect("could not create PyList"); py_list.append(42).unwrap(); py_list.append(5).unwrap(); assert_eq!( TryInto::::try_into(Some(py_list.as_ref())).unwrap(), SelectedColumns::Selection(vec![ IdxOrName::Name("foo".to_string()), IdxOrName::Name("bar".to_string()), IdxOrName::Idx(42), IdxOrName::Idx(5) ]) ) }); } #[test] fn selected_columns_from_invalid_ints() { Python::attach(|py| { let py_list = PyList::new(py, vec![0, 2, -1]).expect("could not create PyList"); let err = TryInto::::try_into(Some(py_list.as_ref())).unwrap_err(); assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_))); }); } #[test] fn selected_columns_from_empty_int_list() { Python::attach(|py| { let py_list = PyList::new(py, Vec::::new()).expect("could not create PyList"); let err = TryInto::::try_into(Some(py_list.as_ref())).unwrap_err(); assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_))); }); } #[test] fn selected_columns_from_empty_string_list() { Python::attach(|py| { let py_list = PyList::new(py, 
Vec::::new()).expect("could not create PyList"); let err = TryInto::::try_into(Some(py_list.as_ref())).unwrap_err(); assert!(matches!(err.kind, FastExcelErrorKind::InvalidParameters(_))); }); } #[rstest] // Standard unique columns #[case("A,B,D", vec![0, 1, 3])] // Standard unique columns + range #[case("A,B:E,Y", vec![0, 1, 2, 3, 4, 24])] // Standard unique column + ranges with mixed case #[case("A:c,b:E,w,Y:z", vec![0, 1, 2, 3, 4, 22, 24, 25])] // Ranges beyond Z #[case("A,y:AB", vec![0, 24, 25, 26, 27])] #[case("BB:BE,DDC:DDF", vec![53, 54, 55, 56, 2810, 2811, 2812, 2813])] fn selected_columns_from_valid_ranges(#[case] raw: &str, #[case] expected_indices: Vec) { Python::attach(|py| { let expected_range = SelectedColumns::Selection( expected_indices.into_iter().map(IdxOrName::Idx).collect(), ); let input = PyString::new(py, raw); let range = TryInto::::try_into(Some(input.as_ref())) .expect("expected a valid column selection"); assert_eq!(range, expected_range) }) } #[rstest] #[case("B:")] #[case("A,C:")] #[case("A:")] #[case(":E")] #[case(":C")] #[case(":A")] #[case(":C,E:")] fn selected_columns_from_valid_open_ended_ranges(#[case] raw: &str) { Python::attach(|py| { let input = PyString::new(py, raw); let range = TryInto::::try_into(Some(input.as_ref())) .expect("expected a valid column selection"); assert!(matches!(range, SelectedColumns::DeferredSelection(_))); }) } #[rstest] // Standard unique columns #[case("", "at least one character")] // empty range #[case("a:a,b:d,e", "empty range")] // end before start #[case("b:a", "end of range is before start")] // both start and end empty #[case(":", "cannot have both start and end empty")] // too many elements #[case("a:b:e", "exactly 2 elements, got 3")] fn selected_columns_from_invalid_ranges(#[case] raw: &str, #[case] message: &str) { Python::attach(|py| { let input = PyString::new(py, raw); let err = TryInto::::try_into(Some(input.as_ref())) .expect_err("expected an error"); match err.kind { 
FastExcelErrorKind::InvalidParameters(detail) => { if !detail.contains(message) { panic!("expected \"{detail}\" to contain \"{message}\"") } } _ => panic!("Expected error to be InvalidParameters, got {err:?}"), } }) } } ================================================ FILE: src/types/excelsheet/polars.rs ================================================ use crate::{FastExcelColumn, FastExcelSeries}; use polars_core::{ frame::column::{Column as PolarsColumn, ScalarColumn}, prelude::DataType, scalar::Scalar, }; impl From for PolarsColumn { fn from(column: FastExcelColumn) -> Self { let name = column.name().into(); match column.data { FastExcelSeries::Null => PolarsColumn::Scalar(ScalarColumn::new( name, Scalar::null(DataType::Null), column.len(), )), FastExcelSeries::Bool(values) => PolarsColumn::new(name, values), FastExcelSeries::String(values) => PolarsColumn::new(name, values), FastExcelSeries::Int(values) => PolarsColumn::new(name, values), FastExcelSeries::Float(values) => PolarsColumn::new(name, values), FastExcelSeries::Datetime(values) => PolarsColumn::new(name, values), FastExcelSeries::Date(values) => PolarsColumn::new(name, values), FastExcelSeries::Duration(values) => PolarsColumn::new(name, values), } } } ================================================ FILE: src/types/excelsheet/python.rs ================================================ use std::{collections::HashSet, sync::Arc}; use arrow_array::{RecordBatch, StructArray}; use arrow_schema::Field; use pyo3::{ Borrowed, Bound, FromPyObject, IntoPyObject, Py, PyAny, PyErr, PyResult, Python, pyclass, pymethods, types::{PyAnyMethods, PyCapsule, PyList, PyListMethods, PyString, PyTuple}, }; use pyo3_arrow::ffi::{to_array_pycapsules, to_schema_pycapsule}; use crate::{ ExcelSheet, data::{ ExcelSheetData, record_batch_from_data_and_columns_with_skip_rows, selected_columns_to_schema, }, error::{ ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, py_errors::IntoPyResult, }, types::{ 
dtype::DTypes, excelsheet::{SelectedColumns, SheetVisible, SkipRows, column_info::ColumnInfo}, idx_or_name::IdxOrName, }, }; impl TryFrom<&Bound<'_, PyList>> for SelectedColumns { type Error = FastExcelError; fn try_from(py_list: &Bound<'_, PyList>) -> FastExcelResult { use FastExcelErrorKind::InvalidParameters; if py_list.is_empty() { Err(InvalidParameters("list of selected columns is empty".to_string()).into()) } else if let Ok(selection) = py_list.extract::>() { Ok(Self::Selection(selection)) } else { Err( InvalidParameters(format!("expected list[int] | list[str], got {py_list:?}")) .into(), ) } } } impl TryFrom>> for SelectedColumns { type Error = FastExcelError; fn try_from(py_any_opt: Option<&Bound<'_, PyAny>>) -> FastExcelResult { match py_any_opt { None => Ok(Self::All), Some(py_any) => { // Not trying to downcast to PyNone here as we assume that this would result in // py_any_opt being None if let Ok(py_str) = py_any.extract::() { py_str.parse() } else if let Ok(py_list) = py_any.cast::() { py_list.try_into() } else if let Ok(py_function) = py_any.extract::>() { Ok(Self::DynamicSelection(py_function)) } else { Err(FastExcelErrorKind::InvalidParameters(format!( "unsupported object type {object_type}", object_type = py_any.get_type() )) .into()) } } .with_context(|| { format!("could not determine selected columns from provided object: {py_any}") }), } } } impl<'py> IntoPyObject<'py> for &SheetVisible { type Target = PyString; type Output = Bound<'py, Self::Target>; type Error = FastExcelError; fn into_pyobject(self, py: Python<'py>) -> Result { Ok(PyString::new( py, match self { SheetVisible::Visible => "visible", SheetVisible::Hidden => "hidden", SheetVisible::VeryHidden => "veryhidden", }, )) } } impl SkipRows { pub(crate) fn should_skip_row(&self, row_idx: usize, py: Python) -> FastExcelResult { match self { SkipRows::Simple(offset) => Ok(row_idx < *offset), SkipRows::List(skip_set) => Ok(skip_set.contains(&row_idx)), SkipRows::Callable(func) => { let 
result = func.call1(py, (row_idx,)).map_err(|e| { FastExcelErrorKind::InvalidParameters(format!( "Error calling skip_rows function for row {row_idx}: {e}" )) })?; result.extract::(py).map_err(|e| { FastExcelErrorKind::InvalidParameters(format!( "skip_rows callable must return bool, got error: {e}" )) .into() }) } SkipRows::SkipEmptyRowsAtBeginning => { // This is handled by calamine's FirstNonEmptyRow in the header logic // For array creation, we don't need additional filtering Ok(false) } } } } #[derive(Debug, Clone)] #[pyclass(skip_from_py_object)] pub(crate) struct CellError { /// `(int, int)`. The original row and column of the error #[pyo3(get)] pub position: (usize, usize), /// `int`. The row offset #[pyo3(get)] pub row_offset: usize, /// `str`. The error message #[pyo3(get)] pub detail: String, } #[pymethods] impl CellError { #[getter] pub fn offset_position(&self) -> (usize, usize) { let (row, col) = self.position; (row - self.row_offset, col) } pub fn __repr__(&self) -> String { let (row, col) = self.position; let (offset_row, offset_col) = self.offset_position(); format!( "CellError(position=({row}, {col}), offset_position=({offset_row}, {offset_col}), row_offset={row_offset}, detail={detail:?})", row_offset = self.row_offset, detail = &self.detail, ) } } #[pyclass] pub(crate) struct CellErrors { pub errors: Vec, } #[pymethods] impl CellErrors { #[getter] pub fn errors<'p>(&'p self, _py: Python<'p>) -> Vec { self.errors.clone() } pub fn __repr__(&self) -> String { let errors_repr: Vec = self.errors.iter().map(|e| e.__repr__()).collect(); format!("CellErrors(errors=[{}])", errors_repr.join(", ")) } } impl<'a, 'py> FromPyObject<'a, 'py> for SkipRows { type Error = PyErr; fn extract(obj: Borrowed<'a, 'py, PyAny>) -> Result { // Handle None case if obj.is_none() { return Ok(SkipRows::SkipEmptyRowsAtBeginning); } // Try to extract as int first if let Ok(skip_count) = obj.extract::() { return Ok(SkipRows::Simple(skip_count)); } // Try to extract as list of 
integers if let Ok(skip_list) = obj.extract::>() { let skip_set: HashSet = skip_list.into_iter().collect(); return Ok(SkipRows::List(skip_set)); } // Check if it's callable if obj.hasattr("__call__").unwrap_or(false) { return Ok(SkipRows::Callable(Arc::new(obj.to_owned().into()))); } Err(FastExcelErrorKind::InvalidParameters( "skip_rows must be int, list of int, callable, or None".to_string(), ) .into()) .into_pyresult() } } impl TryFrom<&ExcelSheet> for RecordBatch { type Error = FastExcelError; fn try_from(sheet: &ExcelSheet) -> FastExcelResult { let offset = sheet.offset(); let limit = sheet.limit(); match &sheet.data { ExcelSheetData::Owned(range) => record_batch_from_data_and_columns_with_skip_rows( &sheet.selected_columns, range, sheet.pagination.skip_rows(), offset, limit, sheet.opts.whitespace_as_null, ), ExcelSheetData::Ref(range) => record_batch_from_data_and_columns_with_skip_rows( &sheet.selected_columns, range, sheet.pagination.skip_rows(), offset, limit, sheet.opts.whitespace_as_null, ), } .with_context(|| format!("could not convert sheet {} to RecordBatch", sheet.name())) } } // NOTE: These proxy python implems are required because `#[getter]` does not play well with `cfg_attr`: // * https://github.com/PyO3/pyo3/issues/1003 // * https://github.com/PyO3/pyo3/issues/780 #[pymethods] impl ExcelSheet { #[getter("width")] pub fn py_width(&mut self) -> usize { self.width() } #[getter("height")] pub fn py_height(&mut self) -> usize { self.height() } #[getter("total_height")] pub fn py_total_height(&mut self) -> usize { self.total_height() } #[getter("offset")] pub fn py_offset(&self) -> usize { self.offset() } #[getter("selected_columns")] pub fn py_selected_columns(&self) -> Vec { self.selected_columns().to_owned() } #[pyo3(name = "available_columns")] pub fn py_available_columns(&mut self) -> FastExcelResult> { self.available_columns() } #[getter("specified_dtypes")] pub fn py_specified_dtypes(&self) -> Option<&DTypes> { self.specified_dtypes() } 
#[getter("name")] pub fn py_name(&self) -> &str { self.name() } #[getter("visible")] pub fn py_visible<'py>(&'py self, py: Python<'py>) -> FastExcelResult> { let visible: SheetVisible = self.visible(); (&visible).into_pyobject(py) } #[cfg(feature = "pyarrow")] pub fn to_arrow<'py>(&self, py: Python<'py>) -> PyResult> { use pyo3::IntoPyObjectExt; use crate::error::py_errors::IntoPyResult; py.detach(|| RecordBatch::try_from(self)) .with_context(|| { format!( "could not create RecordBatch from sheet \"{}\"", self.name() ) }) .and_then(|rb| { use arrow_pyarrow::ToPyArrow; rb.to_pyarrow(py) .map_err(|err| FastExcelErrorKind::ArrowError(err.to_string()).into()) }) .with_context(|| { format!( "could not convert RecordBatch to pyarrow for sheet \"{}\"", self.name() ) }) .into_pyresult() .and_then(|obj| obj.into_bound_py_any(py)) } #[cfg(feature = "pyarrow")] pub fn to_arrow_with_errors<'py>(&self, py: Python<'py>) -> PyResult> { use arrow_pyarrow::IntoPyArrow; use pyo3::IntoPyObjectExt; use crate::data::record_batch_from_data_and_columns_with_errors; let offset = self.offset(); let limit = self.limit(); let (rb, errors) = py .detach(|| { record_batch_from_data_and_columns_with_errors( &self.selected_columns, self.data(), offset, limit, self.opts.whitespace_as_null, ) }) .with_context(|| { format!( "could not create RecordBatch from sheet \"{}\"", self.name() ) })?; let rb = rb .into_pyarrow(py) .map_err(|err| FastExcelErrorKind::ArrowError(err.to_string()).into()) .with_context(|| { format!( "could not convert RecordBatch to pyarrow for sheet \"{}\"", self.name() ) })?; (rb, errors).into_bound_py_any(py) } /// Export the schema as an [`ArrowSchema`] [`PyCapsule`]. /// /// /// /// [`ArrowSchema`]: arrow_array::ffi::FFI_ArrowSchema /// [`PyCapsule`]: pyo3::types::PyCapsule pub fn __arrow_c_schema__<'py>(&self, py: Python<'py>) -> PyResult> { let schema = selected_columns_to_schema(&self.selected_columns); Ok(to_schema_pycapsule(py, &schema)?) 
} /// Export the schema and data as a pair of [`ArrowSchema`] and [`ArrowArray`] [`PyCapsules`] /// /// The optional `requested_schema` parameter allows for potential schema conversion. /// /// /// /// [`ArrowSchema`]: arrow_array::ffi::FFI_ArrowSchema /// [`ArrowArray`]: arrow_array::ffi::FFI_ArrowArray /// [`PyCapsules`]: pyo3::types::PyCapsule pub fn __arrow_c_array__<'py>( &self, py: Python<'py>, requested_schema: Option>, ) -> PyResult> { let record_batch = RecordBatch::try_from(self) .with_context(|| { format!( "could not create RecordBatch from sheet \"{}\"", self.name() ) }) .into_pyresult()?; let field = Field::new_struct("", record_batch.schema_ref().fields().clone(), false); let array = Arc::new(StructArray::from(record_batch)); Ok(to_array_pycapsules( py, field.into(), array.as_ref(), requested_schema, )?) } pub fn __repr__(&self) -> String { format!("ExcelSheet<{}>", self.name()) } } ================================================ FILE: src/types/excelsheet/table.rs ================================================ use crate::error::{FastExcelErrorKind, FastExcelResult}; use calamine::{Data, Sheets, Table}; use std::io::{Read, Seek}; pub(crate) fn extract_table_names<'a, RS: Read + Seek>( sheets: &'a mut Sheets, sheet_name: Option<&str>, ) -> FastExcelResult> { match sheets { Sheets::Xlsx(xlsx) => { // Internally checks if tables already loaded; is fast xlsx.load_tables()?; match sheet_name { None => Ok(xlsx.table_names()), Some(sn) => Ok(xlsx.table_names_in_sheet(sn)), } } _ => Err(FastExcelErrorKind::Internal( "Currently only XLSX files are supported for tables".to_string(), ) .into()), } } pub(crate) fn extract_table_range( name: &str, sheets: &mut Sheets, ) -> FastExcelResult> { match sheets { Sheets::Xlsx(xlsx) => { // Internally checks if tables already loaded; is fast xlsx.load_tables()?; let table_result = xlsx.table_by_name(name); let table = table_result?; Ok(table) } _ => Err(FastExcelErrorKind::Internal( "Currently only XLSX files are 
supported for tables".to_string(), ) .into()), } } ================================================ FILE: src/types/exceltable/mod.rs ================================================ #[cfg(feature = "python")] mod python; use calamine::{Data, Range, Table}; #[cfg(feature = "polars")] use polars_core::frame::DataFrame; #[cfg(feature = "python")] use pyo3::pyclass; use crate::{ FastExcelColumn, FastExcelErrorKind, IdxOrName, LoadSheetOrTableOptions, SelectedColumns, data::height_without_tail_whitespace, error::{ErrorContext, FastExcelResult}, types::{ dtype::DTypes, excelsheet::{ Header, Pagination, column_info::{ AvailableColumns, ColumnInfo, build_available_columns_info, finalize_column_info, }, deferred_selection_to_concrete, }, }, utils::schema::get_schema_sample_rows, }; /// A single table in an Excel file. #[derive(Debug)] #[cfg_attr(feature = "python", pyclass(name = "_ExcelTable"))] pub struct ExcelTable { name: String, sheet_name: String, selected_columns: Vec, available_columns: AvailableColumns, table: Table, header: Header, pagination: Pagination, opts: LoadSheetOrTableOptions, height: Option, total_height: Option, width: Option, limit: usize, } impl ExcelTable { fn extract_selected_columns_and_table_columns( table: &Table, selected_columns: &[IdxOrName], ) -> FastExcelResult<(Vec, Vec)> { let table_columns: Vec = table.columns().into(); let column_offset = table.data().start().map_or(0, |(_row, col)| col as usize); let selected_column_indices = selected_columns .iter() .map(|idx_or_name| match idx_or_name { IdxOrName::Idx(idx) => Ok(*idx), IdxOrName::Name(name) => table_columns .iter() .enumerate() .find_map(|(idx, col_name)| { (col_name.as_str() == name.as_str()).then_some(idx + column_offset) }) .ok_or_else(|| FastExcelErrorKind::ColumnNotFound(name.clone().into()).into()) .with_context(|| format!("available columns are: {table_columns:?}")), }) .collect::>>()?; let table_columns = table_columns .into_iter() .enumerate() .filter_map(|(idx, col_name)| { 
selected_column_indices .contains(&(idx + column_offset)) .then_some(col_name) }) .collect(); let selected_columns = selected_column_indices .into_iter() .map(Into::into) .collect(); Ok((table_columns, selected_columns)) } /// Builds a `Header` for a table. This might update the column selection, if provided fn build_header_and_update_selection( table: &Table, opts: LoadSheetOrTableOptions, ) -> FastExcelResult<(Header, LoadSheetOrTableOptions)> { Ok(match (&opts.column_names, opts.header_row) { (None, None) => { // If there is a column selection, we need to convert all elements to column // indices. This is required because we will be providing the header, and it // it is required to use an index-based selection when custom column names are provided match &opts.selected_columns { SelectedColumns::Selection(selected_columns) => { let (table_columns, selected_columns) = Self::extract_selected_columns_and_table_columns( table, selected_columns, )?; let opts = opts.selected_columns(SelectedColumns::Selection(selected_columns)); (Header::With(table_columns), opts) } SelectedColumns::DeferredSelection(deferred_selection) => { let concrete_columns = deferred_selection_to_concrete( deferred_selection, table.data().end().map_or(0, |(_row, col)| col as usize), ); let (table_columns, selected_columns) = Self::extract_selected_columns_and_table_columns( table, &concrete_columns, )?; let opts = opts.selected_columns(SelectedColumns::Selection(selected_columns)); (Header::With(table_columns), opts) } _ => (Header::With(table.columns().into()), opts), } } (None, Some(row)) => (Header::At(row), opts), (Some(column_names), _) => (Header::With(column_names.clone()), opts), }) } pub(crate) fn try_new( table: Table, opts: LoadSheetOrTableOptions, ) -> FastExcelResult { let pagination = Pagination::try_new(opts.skip_rows.clone(), opts.n_rows, table.data())?; let (header, opts) = Self::build_header_and_update_selection(&table, opts)?; let available_columns_info = 
build_available_columns_info(table.data(), &opts.selected_columns, &header)?; let selected_columns_info = opts .selected_columns .select_columns(available_columns_info)?; let mut excel_table = ExcelTable { name: table.name().to_owned(), sheet_name: table.sheet_name().to_owned(), available_columns: AvailableColumns::Pending, // Empty vec as it'll be replaced selected_columns: Vec::with_capacity(0), table, header, pagination, opts, height: None, total_height: None, width: None, // Will be replaced limit: 0, }; excel_table.limit = excel_table.compute_limit(); let row_limit = get_schema_sample_rows( excel_table.opts.schema_sample_rows, excel_table.offset(), excel_table.limit(), ); // Finalizing column info let selected_columns = finalize_column_info( selected_columns_info, excel_table.data(), excel_table.offset(), row_limit, excel_table.opts.dtypes.as_ref(), &excel_table.opts.dtype_coercion, excel_table.opts.whitespace_as_null, )?; // Figure out dtype for every column excel_table.selected_columns = selected_columns; Ok(excel_table) } pub(crate) fn data(&self) -> &Range { self.table.data() } fn ensure_available_columns_loaded(&mut self) -> FastExcelResult<()> { let available_columns = match &self.available_columns { AvailableColumns::Pending => { let available_columns_info = build_available_columns_info( self.table.data(), &self.opts.selected_columns, &self.header, )?; let final_info = finalize_column_info( available_columns_info, self.data(), self.offset(), self.limit(), self.opts.dtypes.as_ref(), &self.opts.dtype_coercion, self.opts.whitespace_as_null, )?; AvailableColumns::Loaded(final_info) } AvailableColumns::Loaded(_) => return Ok(()), }; self.available_columns = available_columns; Ok(()) } fn load_available_columns(&mut self) -> FastExcelResult<&[ColumnInfo]> { self.ensure_available_columns_loaded()?; self.available_columns.as_loaded() } pub fn offset(&self) -> usize { self.header.offset() + self.pagination.offset() } fn compute_limit(&self) -> usize { let 
upper_bound = if self.opts.skip_whitespace_tail_rows { height_without_tail_whitespace(self.data()).unwrap_or_else(|| self.data().height()) } else { self.data().height() }; if let Some(n_rows) = self.pagination.n_rows() { let limit = self.offset() + n_rows; if limit < upper_bound { return limit; } } upper_bound } pub fn limit(&self) -> usize { self.limit } pub fn selected_columns(&self) -> Vec { self.selected_columns.clone() } pub fn available_columns(&mut self) -> FastExcelResult> { self.load_available_columns().map(|cols| cols.to_vec()) } pub fn specified_dtypes(&self) -> Option<&DTypes> { self.opts.dtypes.as_ref() } pub fn width(&mut self) -> usize { self.width.unwrap_or_else(|| { let width = self.data().width(); self.width = Some(width); width }) } pub fn height(&mut self) -> usize { self.height.unwrap_or_else(|| { let height = self.limit() - self.offset(); self.height = Some(height); height }) } pub fn total_height(&mut self) -> usize { self.total_height.unwrap_or_else(|| { let total_height = self.data().height() - self.header.offset(); self.total_height = Some(total_height); total_height }) } pub fn name(&self) -> &str { &self.name } pub fn sheet_name(&self) -> &str { &self.sheet_name } pub fn to_columns(&self) -> FastExcelResult> { self.selected_columns .iter() .map(|column_info| { FastExcelColumn::try_from_column_info( column_info, self.table.data(), self.offset(), self.limit(), self.opts.whitespace_as_null, ) }) .collect() } #[cfg(feature = "polars")] pub fn to_polars(&self) -> FastExcelResult { use crate::error::FastExcelErrorKind; let pl_columns = self.to_columns()?.into_iter().map(Into::into).collect(); DataFrame::new_infer_height(pl_columns).map_err(|err| { FastExcelErrorKind::Internal(format!("could not create DataFrame: {err:?}")).into() }) } } ================================================ FILE: src/types/exceltable/python.rs ================================================ use std::sync::Arc; use arrow_array::{RecordBatch, StructArray}; use 
arrow_schema::Field; #[cfg(feature = "pyarrow")] use pyo3::PyAny; use pyo3::{ Bound, PyResult, Python, pymethods, types::{PyCapsule, PyTuple}, }; use pyo3_arrow::ffi::{to_array_pycapsules, to_schema_pycapsule}; use crate::{ ExcelTable, data::{record_batch_from_data_and_columns_with_skip_rows, selected_columns_to_schema}, error::{ErrorContext, FastExcelError, FastExcelResult, py_errors::IntoPyResult}, types::{dtype::DTypes, excelsheet::column_info::ColumnInfo}, }; impl TryFrom<&ExcelTable> for RecordBatch { type Error = FastExcelError; fn try_from(table: &ExcelTable) -> FastExcelResult { record_batch_from_data_and_columns_with_skip_rows( &table.selected_columns, table.data(), table.pagination.skip_rows(), table.offset(), table.limit(), table.opts.whitespace_as_null, ) .with_context(|| { format!( "could not convert table {table} in sheet {sheet} to RecordBatch", table = &table.name, sheet = &table.sheet_name ) }) } } // NOTE: These proxy python implems are required because `#[getter]` does not play well with `cfg_attr`: // * https://github.com/PyO3/pyo3/issues/1003 // * https://github.com/PyO3/pyo3/issues/780 #[pymethods] impl ExcelTable { #[getter("name")] pub fn py_name(&self) -> &str { &self.name } #[getter("sheet_name")] pub fn py_sheet_name(&self) -> &str { &self.sheet_name } #[getter("offset")] pub fn py_offset(&self) -> usize { self.offset() } #[getter("limit")] pub fn py_limit(&self) -> usize { self.limit() } #[getter("selected_columns")] pub fn py_selected_columns(&self) -> Vec { self.selected_columns() } #[pyo3(name = "available_columns")] pub fn py_available_columns(&mut self) -> FastExcelResult> { self.available_columns() } #[getter("specified_dtypes")] pub fn py_specified_dtypes(&self) -> Option<&DTypes> { self.specified_dtypes() } #[getter("width")] pub fn py_width(&mut self) -> usize { self.width() } #[getter("height")] pub fn py_height(&mut self) -> usize { self.height() } #[getter("total_height")] pub fn py_total_height(&mut self) -> usize { 
/// Converts the table to a pyarrow object.
///
/// A `RecordBatch` is first built from the table, then exported through
/// `ToPyArrow::to_pyarrow`.
///
/// # Errors
///
/// Returns an error when the `RecordBatch` cannot be created, or an `ArrowError` when the
/// export to pyarrow fails. Both are annotated with the table name (and sheet name for the
/// outer context).
#[cfg(feature = "pyarrow")]
pub fn to_arrow<'py>(&self, py: Python<'py>) -> FastExcelResult<Bound<'py, PyAny>> {
    RecordBatch::try_from(self)
        .with_context(|| {
            // `self.name` is the table's name, so the message must refer to a table
            // (consistent with `__arrow_c_array__`)
            format!(
                "could not create RecordBatch from table \"{}\"",
                self.name
            )
        })
        .and_then(|rb| {
            use arrow_pyarrow::ToPyArrow;

            use crate::error::FastExcelErrorKind;

            rb.to_pyarrow(py)
                .map_err(|err| FastExcelErrorKind::ArrowError(err.to_string()).into())
        })
        .with_context(|| {
            format!(
                "could not convert RecordBatch to pyarrow for table \"{table}\" in sheet \"{sheet}\"",
                table = self.name,
                sheet = self.sheet_name
            )
        })
}
/// Identifies a column either by its index or by its name.
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub enum IdxOrName {
    Idx(usize),
    Name(String),
}

impl IdxOrName {
    /// Returns a human-readable fragment describing the column, meant to be embedded in
    /// messages: `at index N` for an index, `with name "NAME"` for a name.
    pub(crate) fn format_message(&self) -> String {
        match self {
            IdxOrName::Name(name) => format!("with name \"{name}\""),
            IdxOrName::Idx(idx) => format!("at index {idx}"),
        }
    }
}

/// An integer refers to a column by index.
impl From<usize> for IdxOrName {
    fn from(column_index: usize) -> Self {
        IdxOrName::Idx(column_index)
    }
}

/// An owned string refers to a column by name, without copying it.
impl From<String> for IdxOrName {
    fn from(column_name: String) -> Self {
        IdxOrName::Name(column_name)
    }
}

/// A string slice refers to a column by name; the name is copied into an owned `String`.
impl From<&str> for IdxOrName {
    fn from(column_name: &str) -> Self {
        IdxOrName::Name(String::from(column_name))
    }
}
/// Determines how many rows should be used for schema sampling, based on the provided parameter,
/// and the sheet's offset and limit.
///
/// The returned value is an (exclusive) row bound: `offset + sample_rows`, capped at `limit`.
/// When `sample_rows` is `None`, the entire range up to `limit` is sampled.
///
/// Note that here, the limit should be retrieved from the sheet's `limit()` method, and must not
/// be out of the sheet's bounds
pub(crate) fn get_schema_sample_rows(
    sample_rows: Option<usize>,
    offset: usize,
    limit: usize,
) -> usize {
    // Checking how many rows we want to use to determine the dtype for a column. If sample_rows
    // is not provided, we sample limit rows, i.e on the entire column.
    // The addition saturates: a huge requested sample size must be clamped to the limit below
    // rather than overflow (panic in debug builds, wrap-around in release builds).
    let sample_rows = offset.saturating_add(sample_rows.unwrap_or(limit));
    // If sample_rows is higher than the sheet's limit, use the limit instead
    min(sample_rows, limit)
}
def main():
    """Read the Excel file given on the command line and load its table or its sheets.

    Optionally prints the table names first. Then, for each requested iteration, either
    loads the requested table and prints it as a polars DataFrame, or loads every sheet
    (eagerly or lazily followed by a conversion to arrow). The column selection is only
    applied when loading sheets.
    """
    args = get_args()
    excel_file = fastexcel.read_excel(args.file)
    # No --column (or an empty list) means "no column selection"
    use_columns = args.column if args.column else None

    if args.print_tables:
        table_names = excel_file.table_names()
        message = (
            f"Available tables are {', '.join(table_names)}"
            if len(table_names) > 0
            else "No tables found"
        )
        print(message)

    for _ in range(args.iterations):
        if args.table:
            tbl = excel_file.load_table(args.table)
            print(f"Found table {args.table}:")
            print(tbl.to_polars())
            continue

        for sheet_name in excel_file.sheet_names:
            if args.eager:
                excel_file.load_sheet_eager(sheet_name, use_columns=use_columns)
                continue
            excel_file.load_sheet(sheet_name, use_columns=use_columns).to_arrow()
name: "User Id".into(), index: 0, absolute_index: 0, dtype: fastexcel::DType::Float, column_name_from: fastexcel::ColumnNameFrom::Provided, dtype_from: fastexcel::DTypeFrom::Guessed, }, fastexcel::ColumnInfo { name: "FirstName".into(), index: 1, absolute_index: 1, dtype: fastexcel::DType::String, column_name_from: fastexcel::ColumnNameFrom::Provided, dtype_from: fastexcel::DTypeFrom::Guessed, }, fastexcel::ColumnInfo { name: "__UNNAMED__2".into(), index: 2, absolute_index: 2, dtype: fastexcel::DType::String, column_name_from: fastexcel::ColumnNameFrom::Generated, dtype_from: fastexcel::DTypeFrom::Guessed, }, fastexcel::ColumnInfo { name: "__UNNAMED__3".into(), index: 3, absolute_index: 3, dtype: fastexcel::DType::DateTime, column_name_from: fastexcel::ColumnNameFrom::Generated, dtype_from: fastexcel::DTypeFrom::Guessed, }, ]; assert_eq!(available_columns, expected_available_columns); let selected_columns_info = table.selected_columns(); let expected_selected_columns = vec![ fastexcel::ColumnInfo { name: "User Id".into(), index: 0, absolute_index: 0, dtype: fastexcel::DType::Float, column_name_from: fastexcel::ColumnNameFrom::Provided, dtype_from: fastexcel::DTypeFrom::Guessed, }, fastexcel::ColumnInfo { name: "FirstName".into(), index: 1, absolute_index: 1, dtype: fastexcel::DType::String, column_name_from: fastexcel::ColumnNameFrom::Provided, dtype_from: fastexcel::DTypeFrom::Guessed, }, ]; assert_eq!(selected_columns_info, expected_selected_columns); let expected_columns = fe_columns!( "User Id" => [1.0, 2.0, 5.0], "FirstName" => ["Peter", "John", "Hans"], ); let table_columns = table .to_columns() .context("could not convert table to columns")?; assert_eq!(table_columns, expected_columns); #[cfg(feature = "polars")] { use polars_core::df; let expected_df = df!( "User Id" => [1.0, 2.0, 5.0], "FirstName" => ["Peter", "John", "Hans"], )?; let df = table .to_polars() .context("could not convert table to polars dataframe")?; assert!(df.equals_missing(&expected_df)) } 
Ok(()) } #[rstest] fn test_use_columns_with_table_and_provided_columns( mut reader: fastexcel::ExcelReader, ) -> Result<()> { let selected_columns = SelectedColumns::Selection(vec![0.into(), 2.into()]); let opts = LoadSheetOrTableOptions::new_for_table() .column_names(vec!["user_id", "last_name"]) .selected_columns(selected_columns); let mut table = reader .load_table("users", opts) .context("Failed to load table")?; assert_eq!(table.name(), "users"); assert_eq!(table.width(), 4); assert_eq!(table.height(), 3); let available_columns = table .available_columns() .context("could not obtain available columns for table")?; let expected_available_columns = vec![ fastexcel::ColumnInfo { name: "user_id".into(), index: 0, absolute_index: 0, dtype: fastexcel::DType::Float, column_name_from: fastexcel::ColumnNameFrom::Provided, dtype_from: fastexcel::DTypeFrom::Guessed, }, fastexcel::ColumnInfo { name: "__UNNAMED__1".into(), index: 1, absolute_index: 1, dtype: fastexcel::DType::String, column_name_from: fastexcel::ColumnNameFrom::Generated, dtype_from: fastexcel::DTypeFrom::Guessed, }, fastexcel::ColumnInfo { name: "last_name".into(), index: 2, absolute_index: 2, dtype: fastexcel::DType::String, column_name_from: fastexcel::ColumnNameFrom::Provided, dtype_from: fastexcel::DTypeFrom::Guessed, }, fastexcel::ColumnInfo { name: "__UNNAMED__3".into(), index: 3, absolute_index: 3, dtype: fastexcel::DType::DateTime, column_name_from: fastexcel::ColumnNameFrom::Generated, dtype_from: fastexcel::DTypeFrom::Guessed, }, ]; assert_eq!(available_columns, expected_available_columns); let selected_columns_info = table.selected_columns(); let expected_selected_columns = vec![ fastexcel::ColumnInfo { name: "user_id".into(), index: 0, absolute_index: 0, dtype: fastexcel::DType::Float, column_name_from: fastexcel::ColumnNameFrom::Provided, dtype_from: fastexcel::DTypeFrom::Guessed, }, fastexcel::ColumnInfo { name: "last_name".into(), index: 2, absolute_index: 2, dtype: 
fastexcel::DType::String, column_name_from: fastexcel::ColumnNameFrom::Provided, dtype_from: fastexcel::DTypeFrom::Guessed, }, ]; assert_eq!(selected_columns_info, expected_selected_columns); let expected_columns = fe_columns!( "user_id" => [1.0, 2.0, 5.0], "last_name" => ["Müller", "Meier", "Fricker"], ); let table_columns = table .to_columns() .context("could not convert table to columns")?; assert_eq!(table_columns, expected_columns); #[cfg(feature = "polars")] { use polars_core::df; let expected_df = df!( "user_id" => [1.0, 2.0, 5.0], "last_name" => ["Müller", "Meier", "Fricker"], )?; let df = table .to_polars() .context("could not convert table to polars dataframe")?; assert!(df.equals_missing(&expected_df)) } Ok(()) } #[fixture] fn reader_with_offset() -> fastexcel::ExcelReader { fastexcel::read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx")) .expect("could not read excel file") } #[rstest] fn test_use_column_range_with_offset_with_table_and_specified_dtypes( mut reader_with_offset: fastexcel::ExcelReader, ) -> Result<()> { let dtypes_map: HashMap = [ (IdxOrName::Idx(3), DType::Int), (IdxOrName::Name("Column at E5".to_owned()), DType::String), ] .into_iter() .collect(); let selected_columns_closed = "D:E" .parse::() .context("could not parse column selection")?; let opts_closed_range = LoadSheetOrTableOptions::new_for_table() .selected_columns(selected_columns_closed) .with_dtypes(DTypes::Map(dtypes_map.clone())); let table_closed = reader_with_offset .load_table("TableAtD5", opts_closed_range) .context("Failed to load table with closed range")?; let selected_columns_open_ended = "D:" .parse::() .context("could not parse column selection")?; let opts_open_ended_range = LoadSheetOrTableOptions::new_for_table() .selected_columns(selected_columns_open_ended) .with_dtypes(DTypes::Map(dtypes_map.clone())); let table_open_ended = reader_with_offset .load_table("TableAtD5", opts_open_ended_range) .context("Failed to load table with open-ended range")?; 
assert_eq!(table_closed.name(), "TableAtD5"); assert_eq!(table_open_ended.name(), "TableAtD5"); let expected_selected_columns = vec![ fastexcel::ColumnInfo { name: "Column at D5".to_owned(), index: 0, absolute_index: 3, dtype: fastexcel::DType::Int, column_name_from: fastexcel::ColumnNameFrom::Provided, dtype_from: fastexcel::DTypeFrom::ProvidedByIndex, }, fastexcel::ColumnInfo { name: "Column at E5".to_owned(), index: 1, absolute_index: 4, dtype: fastexcel::DType::String, column_name_from: fastexcel::ColumnNameFrom::Provided, dtype_from: fastexcel::DTypeFrom::ProvidedByName, }, ]; assert_eq!(table_closed.selected_columns(), expected_selected_columns); assert_eq!( table_open_ended.selected_columns(), expected_selected_columns ); let expected_columns = fe_columns!( "Column at D5" => [1_i64, 2, 3, 4], "Column at E5" => ["4", "5", "6", "8"], ); assert_eq!( table_closed .to_columns() .context("could not convert table to columns")?, expected_columns ); assert_eq!( table_open_ended .to_columns() .context("could not convert table to columns")?, expected_columns ); #[cfg(feature = "polars")] { use polars_core::df; let expected_df = df!( "Column at D5" => [1_i64, 2, 3, 4], "Column at E5" => ["4", "5", "6", "8"], )?; let df_closed = table_closed .to_polars() .context("could not convert table to polars dataframe")?; let df_open_ended = table_open_ended .to_polars() .context("could not convert table to polars dataframe")?; assert!(df_closed.equals_missing(&expected_df)); assert!(df_open_ended.equals_missing(&expected_df)); } Ok(()) } /// This test ensures that index-based selection is correctly resolved when used with an offset /// table: the selected indices should be absolute, and it should be able to handle both index-based /// and name-based selection. 
#[rstest] fn test_use_column_names_with_offset_table_by_index_and_name( mut reader_with_offset: fastexcel::ExcelReader, ) -> Result<()> { let selected_columns = SelectedColumns::Selection(vec![ IdxOrName::Name("Column at D5".to_string()), IdxOrName::Idx(4), ]); let opts = LoadSheetOrTableOptions::new_for_table().selected_columns(selected_columns); let table = reader_with_offset .load_table("TableAtD5", opts) .context("Failed to load table")?; assert_eq!(table.name(), "TableAtD5"); let expected_selected_columns = vec![ fastexcel::ColumnInfo { name: "Column at D5".to_owned(), index: 0, absolute_index: 3, dtype: fastexcel::DType::Float, column_name_from: fastexcel::ColumnNameFrom::Provided, dtype_from: fastexcel::DTypeFrom::Guessed, }, fastexcel::ColumnInfo { name: "Column at E5".to_owned(), index: 1, absolute_index: 4, dtype: fastexcel::DType::Float, column_name_from: fastexcel::ColumnNameFrom::Provided, dtype_from: fastexcel::DTypeFrom::Guessed, }, ]; let selected_columns_info = table.selected_columns(); assert_eq!(selected_columns_info, expected_selected_columns); let expected_columns = fe_columns!( "Column at D5" => [1.0, 2.0, 3.0, 4.0], "Column at E5" => [4.0, 5.0, 6.0, 8.0], ); let table_columns = table .to_columns() .context("could not convert table to columns")?; assert_eq!(table_columns, expected_columns); #[cfg(feature = "polars")] { use polars_core::df; let expected_df = df!( "Column at D5" => [1.0, 2.0, 3.0, 4.0], "Column at E5" => [4.0, 5.0, 6.0, 8.0], )?; let df = table .to_polars() .context("could not convert table to polars dataframe")?; assert!(df.equals_missing(&expected_df)) } Ok(()) } #[rstest] fn test_use_column_range_with_offset_with_sheet_and_specified_dtypes( mut reader_with_offset: fastexcel::ExcelReader, ) -> Result<()> { // Create dtypes map: {7: "int", "Column at I10": "string"} // Note: Column H is at index 7, Column I is at index 8, Column K is at index 10 let dtypes_map: HashMap = [ (IdxOrName::Idx(7), DType::Int), 
(IdxOrName::Name("Column at I10".to_owned()), DType::String), ] .into_iter() .collect(); let selected_columns_closed = "H:K" .parse::() .context("could not parse column selection")?; let opts_closed_range = LoadSheetOrTableOptions::new_for_sheet() .header_row(9) .selected_columns(selected_columns_closed) .with_dtypes(DTypes::Map(dtypes_map.clone())); let sheet_closed = reader_with_offset .load_sheet("without-table".into(), opts_closed_range) .context("Failed to load sheet with closed range")?; let selected_columns_open_ended = "H:" .parse::() .context("could not parse column selection")?; let opts_open_ended_range = LoadSheetOrTableOptions::new_for_sheet() .header_row(9) .selected_columns(selected_columns_open_ended) .with_dtypes(DTypes::Map(dtypes_map.clone())); let sheet_open_ended = reader_with_offset .load_sheet("without-table".into(), opts_open_ended_range) .context("Failed to load sheet with open-ended range")?; assert_eq!(sheet_closed.name(), "without-table"); assert_eq!(sheet_open_ended.name(), "without-table"); let expected_selected_columns = vec![ fastexcel::ColumnInfo { name: "Column at H10".to_owned(), index: 0, absolute_index: 7, dtype: fastexcel::DType::Int, column_name_from: fastexcel::ColumnNameFrom::LookedUp, dtype_from: fastexcel::DTypeFrom::ProvidedByIndex, }, fastexcel::ColumnInfo { name: "Column at I10".to_owned(), index: 1, absolute_index: 8, dtype: fastexcel::DType::String, column_name_from: fastexcel::ColumnNameFrom::LookedUp, dtype_from: fastexcel::DTypeFrom::ProvidedByName, }, fastexcel::ColumnInfo { name: "__UNNAMED__2".to_owned(), index: 2, absolute_index: 9, dtype: fastexcel::DType::String, column_name_from: fastexcel::ColumnNameFrom::Generated, dtype_from: fastexcel::DTypeFrom::Guessed, }, fastexcel::ColumnInfo { name: "Column at K10".to_owned(), index: 3, absolute_index: 10, dtype: fastexcel::DType::Float, column_name_from: fastexcel::ColumnNameFrom::LookedUp, dtype_from: fastexcel::DTypeFrom::Guessed, }, ]; 
/// Loading the only sheet of the fixture by name or by index must yield the same sheet, with
/// the expected dimensions and contents.
#[test]
fn test_single_sheet() -> Result<()> {
    let fixture = path_for_fixture("fixture-single-sheet.xlsx");
    let mut reader = fastexcel::read_excel(fixture).context("could not read excel file")?;

    assert_eq!(reader.sheet_names(), vec!["January"]);

    let by_name_opts = LoadSheetOrTableOptions::new_for_sheet();
    let mut sheet_by_name = reader
        .load_sheet("January".into(), by_name_opts)
        .context("could not load sheet by name")?;
    let by_idx_opts = LoadSheetOrTableOptions::new_for_sheet();
    let mut sheet_by_idx = reader
        .load_sheet(0.into(), by_idx_opts)
        .context("could not load sheet by index")?;

    // Both handles must describe the same sheet
    assert_eq!(sheet_by_name.name(), sheet_by_idx.name());
    assert_eq!(sheet_by_name.name(), "January");

    assert_eq!(sheet_by_name.height(), sheet_by_idx.height());
    assert_eq!(sheet_by_name.height(), 2);

    assert_eq!(sheet_by_name.width(), sheet_by_idx.width());
    assert_eq!(sheet_by_name.width(), 2);

    let columns_by_name = sheet_by_name
        .to_columns()
        .context("could not convert sheet by name to columns")?;
    let columns_by_idx = sheet_by_idx
        .to_columns()
        .context("could not convert sheet by index to columns")?;
    assert_eq!(&columns_by_name, &columns_by_idx);

    let expected_columns = fe_columns!(
        "Month" => [1.0, 2.0],
        "Year" => [2019.0, 2020.0],
    );
    assert_eq!(&columns_by_name, &expected_columns);

    #[cfg(feature = "polars")]
    {
        let df_by_name = sheet_by_name
            .to_polars()
            .context("could not convert sheet by name to DataFrame")?;
        let df_by_idx = sheet_by_idx
            .to_polars()
            .context("could not convert sheet by index to DataFrame")?;

        let expected_df = df!(
            "Month" => [1.0, 2.0],
            "Year" => [2019.0, 2020.0]
        )
        .context("could not create expected DataFrame")?;

        assert_eq!(&df_by_name, &df_by_idx);
        assert!(df_by_name.equals_missing(&expected_df));
    }

    Ok(())
}
/// The first sheet of the typed fixture must be loaded with one column per dtype (float, bool,
/// datetime, float), the first column having a generated name.
#[test]
fn test_single_sheet_with_types() -> Result<()> {
    let mut excel_reader =
        fastexcel::read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
            .context("could not read excel file")?;

    let mut sheet = excel_reader
        .load_sheet(0.into(), LoadSheetOrTableOptions::new_for_sheet())
        .context("could not load sheet")?;

    assert_eq!(sheet.name(), "Sheet1");
    assert_eq!(sheet.height(), sheet.total_height());
    assert_eq!(sheet.height(), 3);
    assert_eq!(sheet.width(), 4);

    // The sheet is loaded by index here, so the context must not mention "by name"
    let columns = sheet
        .to_columns()
        .context("could not convert sheet to columns")?;

    let naive_date = NaiveDate::from_ymd_opt(2022, 3, 2)
        .expect("2022-03-02 is a valid date")
        .and_hms_opt(5, 43, 4)
        .expect("05:43:04 is a valid time");

    let expected_columns = fe_columns!(
        "__UNNAMED__0" => [0.0, 1.0, 2.0],
        "bools" => [true, false, true],
        "dates" => [naive_date; 3],
        "floats" => [12.35, 42.69, 1234567.0],
    );
    assert_eq!(&columns, &expected_columns);

    #[cfg(feature = "polars")]
    {
        let df = sheet
            .to_polars()
            .context("could not convert sheet to DataFrame")?;
        let expected_df = df!(
            "__UNNAMED__0" => [0.0, 1.0, 2.0],
            "bools" => [true, false, true],
            "dates" => [naive_date; 3],
            "floats" => [12.35, 42.69, 1234567.0],
        )
        .context("could not create expected DataFrame")?;
        assert!(df.equals_missing(&expected_df));
    }

    Ok(())
}
assert!(expected_df_sheet_0.equals_missing(&df_sheet_0)); let expected_df_sheet_1 = df!("Month" => [2.0, 3.0, 4.0], "Year" => [2019.0, 2021.0, 2022.0])?; let df_sheet_1 = sheet_1 .to_polars() .context("could not convert sheet 1 to DataFrame")?; assert!(expected_df_sheet_1.equals_missing(&df_sheet_1)); let expected_df_sheet_unnamed_columns = df!( "col1" => [2.0, 3.0], "__UNNAMED__1" => [1.5, 2.5], "col3" => ["hello", "world"], "__UNNAMED__3" => [-5.0, -6.0], "col5" => ["a", "b"], )?; let df_sheet_unnamed_columns = sheet_unnamed_columns .to_polars() .context("could not convert sheet \"With unnamed columns\" to DataFrame")?; assert!(expected_df_sheet_unnamed_columns.equals_missing(&df_sheet_unnamed_columns)); } Ok(()) } #[test] fn test_sheet_with_header_row_diff_from_zero() -> Result<()> { let mut excel_reader = fastexcel::read_excel(path_for_fixture("fixture-changing-header-location.xlsx")) .context("could not read excel file")?; assert_eq!( excel_reader.sheet_names(), vec!["Sheet1", "Sheet2", "Sheet3"] ); let mut sheet_by_name = excel_reader .load_sheet( "Sheet1".into(), LoadSheetOrTableOptions::new_for_sheet().header_row(1), ) .context("could not load sheet \"Sheet1\" by name")?; let mut sheet_by_idx = excel_reader .load_sheet( 0.into(), LoadSheetOrTableOptions::new_for_sheet().header_row(1), ) .context("could not load sheet 0 by index")?; assert_eq!(sheet_by_name.name(), sheet_by_idx.name()); assert_eq!(sheet_by_name.name(), "Sheet1"); assert_eq!(sheet_by_name.height(), sheet_by_idx.height()); assert_eq!(sheet_by_name.height(), 2); assert_eq!(sheet_by_name.width(), sheet_by_idx.width()); assert_eq!(sheet_by_name.width(), 2); let expected_columns = fe_columns!( "Month" => [1.0, 2.0], "Year" => [2019.0, 2020.0] ); let columns_by_name = sheet_by_name .to_columns() .context("could not convert sheet \"Sheet1\" to columns")?; let columns_by_idx = sheet_by_idx .to_columns() .context("could not convert sheet 0 to columns")?; assert_eq!(&columns_by_name, &columns_by_idx); 
assert_eq!(&columns_by_name, &expected_columns);

    #[cfg(feature = "polars")]
    {
        let df_by_name = sheet_by_name
            .to_polars()
            .context("could not convert sheet \"Sheet1\" to DataFrame")?;
        let df_by_idx = sheet_by_idx
            .to_polars()
            .context("could not convert sheet 0 to DataFrame")?;
        let expected_df = df!(
            "Month" => [1.0, 2.0],
            "Year" => [2019.0, 2020.0]
        )?;
        assert!(df_by_name.equals_missing(&df_by_idx));
        assert!(expected_df.equals_missing(&df_by_name));
    }

    Ok(())
}

/// Loads a single row of `fixture-single-sheet-with-types.xlsx` with `n_rows(1)`,
/// `skip_rows(SkipRows::Simple(1))`, no header row and caller-provided column names,
/// then checks the resulting one-row columns.
#[test]
fn test_sheet_with_pagination_and_without_headers() -> Result<()> {
    let mut excel_reader =
        fastexcel::read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
            .context("could not read excel file")?;

    let opts = LoadSheetOrTableOptions::new_for_sheet()
        .n_rows(1)
        .skip_rows(SkipRows::Simple(1))
        .no_header_row()
        .column_names(["This", "Is", "Amazing", "Stuff"]);

    let mut sheet = excel_reader
        .load_sheet(0.into(), opts)
        .context("could not load sheet 0")?;

    assert_eq!(sheet.name(), "Sheet1");
    assert_eq!(sheet.height(), 1);
    assert_eq!(sheet.width(), 4);

    let naive_dt = NaiveDate::from_ymd_opt(2022, 3, 2)
        .unwrap()
        .and_hms_opt(5, 43, 4)
        .unwrap();

    let expected_columns = fe_columns!(
        "This" => [0.0],
        "Is" => [true],
        "Amazing" => [naive_dt],
        "Stuff" => [12.35],
    );

    let sheet_columns = sheet
        .to_columns()
        .context("could not convert sheet to columns")?;
    assert_eq!(&sheet_columns, &expected_columns);

    #[cfg(feature = "polars")]
    {
        let df = sheet
            .to_polars()
            .context("could not convert sheet to DataFrame")?;
        let expected_df = df!(
            "This" => [0.0],
            "Is" => [true],
            "Amazing" => [naive_dt],
            "Stuff" => [12.35],
        )?;
        assert!(df.equals_missing(&expected_df));
    }

    Ok(())
}

/// Parameterized check of how `header_row` and `skip_rows` combine on `no-header.xlsx`.
///
/// Each case supplies the `header_row` and `skip_rows` values to set on the load options and
/// the columns the first sheet is expected to produce with them.
#[rstest]
#[case(Some(0), SkipRows::SkipEmptyRowsAtBeginning, fe_columns!("a" => ["b", "c", "d", "e", "f"], "0" => [1.0, 2.0, 3.0, 4.0, 5.0]))]
#[case(
    None,
    SkipRows::Simple(0),
    fe_columns!(
        "__UNNAMED__0" => [None, None, Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [None, None, Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )
)]
#[case(
    None,
    SkipRows::SkipEmptyRowsAtBeginning,
    fe_columns!(
        "__UNNAMED__0" => ["a", "b", "c", "d", "e", "f"],
        "__UNNAMED__1" => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
    )
)]
#[case(
    Some(0),
    SkipRows::Simple(0),
    fe_columns!(
        "__UNNAMED__0" => [None, Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [None, Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )
)]
#[case(
    Some(0),
    SkipRows::Simple(1),
    fe_columns!(
        "__UNNAMED__0" => [Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )
)]
#[case(
    None,
    SkipRows::Simple(2),
    fe_columns!(
        "__UNNAMED__0" => [Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )
)]
#[case(
    None,
    SkipRows::Simple(3),
    fe_columns!(
        "__UNNAMED__0" => [Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )
)]
#[case(
    Some(1),
    SkipRows::Simple(0),
    fe_columns!("__UNNAMED__0" => ["a", "b", "c", "d", "e", "f"], "__UNNAMED__1" => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0])
)]
#[case(Some(2), SkipRows::Simple(0), fe_columns!("a" => ["b", "c", "d", "e", "f"], "0" => [1.0, 2.0, 3.0, 4.0, 5.0]))]
#[case(
    Some(2),
    SkipRows::SkipEmptyRowsAtBeginning,
    fe_columns!("a" => ["b", "c", "d", "e", "f"], "0" => [1.0, 2.0, 3.0, 4.0, 5.0])
)]
fn test_header_row_and_skip_rows(
    // Zero-based header row index, or `None` for "no header row"; assigned to
    // `LoadSheetOrTableOptions::header_row` below.
    #[case] header_row: Option<usize>,
    #[case] skip_rows: SkipRows,
    // Fully qualified, like the path used by the `fe_column!` macro that builds these values.
    #[case] expected: Vec<fastexcel::FastExcelColumn>,
) -> Result<()> {
    let mut excel_reader = fastexcel::read_excel(path_for_fixture("no-header.xlsx"))
        .context("could not read excel file")?;

    let mut opts = LoadSheetOrTableOptions::new_for_sheet();
    opts.header_row = header_row;
    opts.skip_rows = skip_rows;

    let sheet = excel_reader
        .load_sheet(0.into(), opts)
        .context("could not load sheet 0")?;

    let sheet_columns = sheet
        .to_columns()
        .context("could not convert sheet to columns")?;
    assert_eq!(&sheet_columns, &expected);

    Ok(())
}

/// Polars flavour of `test_header_row_and_skip_rows`: same cases, but the expected value is a
/// `DataFrame` and the sheet is converted with `to_polars`.
#[cfg(feature = "polars")]
#[rstest]
#[case(Some(0), SkipRows::SkipEmptyRowsAtBeginning, df!("a" => ["b", "c", "d", "e", "f"], "0" => [1.0, 2.0, 3.0, 4.0, 5.0])?)]
#[case(
    None,
    SkipRows::Simple(0),
    df!(
        "__UNNAMED__0" => [None, None, Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [None, None, Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )?
)]
#[case(
    None,
    SkipRows::SkipEmptyRowsAtBeginning,
    df!(
        "__UNNAMED__0" => ["a", "b", "c", "d", "e", "f"],
        "__UNNAMED__1" => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
    )?
)]
#[case(
    Some(0),
    SkipRows::Simple(0),
    df!(
        "__UNNAMED__0" => [None, Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [None, Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )?
)]
#[case(
    Some(0),
    SkipRows::Simple(1),
    df!(
        "__UNNAMED__0" => [Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )?
)]
#[case(
    None,
    SkipRows::Simple(2),
    df!(
        "__UNNAMED__0" => [Some("a"), Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [Some(0.0), Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )?
)]
#[case(
    None,
    SkipRows::Simple(3),
    df!(
        "__UNNAMED__0" => [Some("b"), Some("c"), Some("d"), Some("e"), Some("f")],
        "__UNNAMED__1" => [Some(1.0), Some(2.0), Some(3.0), Some(4.0), Some(5.0)]
    )?
)]
#[case(
    Some(1),
    SkipRows::Simple(0),
    df!("__UNNAMED__0" => ["a", "b", "c", "d", "e", "f"], "__UNNAMED__1" => [0.0, 1.0, 2.0, 3.0, 4.0, 5.0])?
)]
#[case(Some(2), SkipRows::Simple(0), df!("a" => ["b", "c", "d", "e", "f"], "0" => [1.0, 2.0, 3.0, 4.0, 5.0])?)]
#[case(
    Some(2),
    SkipRows::SkipEmptyRowsAtBeginning,
    df!("a" => ["b", "c", "d", "e", "f"], "0" => [1.0, 2.0, 3.0, 4.0, 5.0])?
)]
fn test_header_row_and_skip_rows_polars(
    // Same meaning as in `test_header_row_and_skip_rows`.
    #[case] header_row: Option<usize>,
    #[case] skip_rows: SkipRows,
    #[case] expected: DataFrame,
) -> Result<()> {
    let mut excel_reader = fastexcel::read_excel(path_for_fixture("no-header.xlsx"))
        .context("could not read excel file")?;

    let mut opts = LoadSheetOrTableOptions::new_for_sheet();
    opts.header_row = header_row;
    opts.skip_rows = skip_rows;

    let sheet = excel_reader
        .load_sheet(0.into(), opts)
        .context("could not load sheet 0")?;

    let df = sheet
        .to_polars()
        .context("could not convert sheet to DataFrame")?;
    assert!(df.equals_missing(&expected));

    Ok(())
}

================================================
FILE: tests/sheet_visibility.rs
================================================
#[allow(unused_macros)]
mod utils;

use anyhow::{Context, Result};
use fastexcel::{LoadSheetOrTableOptions, SheetVisible};
use pretty_assertions::assert_matches;

use crate::utils::path_for_fixture;

/// Loads the three sheets of `fixture-sheets-different-visibilities.xlsx` by index and checks
/// that they report `Visible`, `Hidden` and `VeryHidden` respectively.
#[test]
fn sheet_visibility() -> Result<()> {
    let mut reader = fastexcel::read_excel(path_for_fixture(
        "fixture-sheets-different-visibilities.xlsx",
    ))
    .context("could not read excel file")?;

    let sheet_0 = reader.load_sheet(0.into(), LoadSheetOrTableOptions::new_for_sheet())?;
    let sheet_1 = reader.load_sheet(1.into(), LoadSheetOrTableOptions::new_for_sheet())?;
    let sheet_2 = reader.load_sheet(2.into(), LoadSheetOrTableOptions::new_for_sheet())?;

    assert_matches!(sheet_0.visible(), SheetVisible::Visible);
    assert_matches!(sheet_1.visible(), SheetVisible::Hidden);
    assert_matches!(sheet_2.visible(), SheetVisible::VeryHidden);

    Ok(())
}

================================================
FILE: tests/shifted_data.rs
================================================
#[allow(unused_macros)]
mod utils;

use anyhow::{Context, Result};
use fastexcel::LoadSheetOrTableOptions;
use pretty_assertions::assert_eq;
use utils::path_for_fixture;

/// Loads the "without-table" sheet of `sheet-and-table-with-offset.xlsx` and checks the
/// column metadata (relative and absolute indices, dtypes, name origin) it reports.
#[test]
fn test_sheet_with_offset() -> Result<()> {
    let mut reader =
fastexcel::read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
            .context("could not read the excel file")?;
    let mut sheet = reader
        .load_sheet(
            "without-table".into(),
            LoadSheetOrTableOptions::new_for_sheet(),
        )
        .context("could not load sheet \"without-table\"")?;

    let available_columns = sheet
        .available_columns()
        .context("could not obtain available columns for sheet")?;
    // `index` is the position within the loaded data, `absolute_index` the expected position
    // in the worksheet (7 for the column named "Column at H10").
    let expected_column_info = vec![
        fastexcel::ColumnInfo {
            name: "Column at H10".into(),
            index: 0,
            absolute_index: 7,
            dtype: fastexcel::DType::Float,
            dtype_from: fastexcel::DTypeFrom::Guessed,
            column_name_from: fastexcel::ColumnNameFrom::LookedUp,
        },
        fastexcel::ColumnInfo {
            name: "Column at I10".into(),
            index: 1,
            absolute_index: 8,
            dtype: fastexcel::DType::Float,
            dtype_from: fastexcel::DTypeFrom::Guessed,
            column_name_from: fastexcel::ColumnNameFrom::LookedUp,
        },
        fastexcel::ColumnInfo {
            name: "__UNNAMED__2".into(),
            index: 2,
            absolute_index: 9,
            dtype: fastexcel::DType::String,
            dtype_from: fastexcel::DTypeFrom::Guessed,
            column_name_from: fastexcel::ColumnNameFrom::Generated,
        },
        fastexcel::ColumnInfo {
            name: "Column at K10".into(),
            index: 3,
            absolute_index: 10,
            dtype: fastexcel::DType::Float,
            dtype_from: fastexcel::DTypeFrom::Guessed,
            column_name_from: fastexcel::ColumnNameFrom::LookedUp,
        },
    ];
    assert_eq!(available_columns, expected_column_info);

    Ok(())
}

/// Loads the table "TableAtD5" of `sheet-and-table-with-offset.xlsx` and checks the column
/// metadata it reports; the expected column names are marked as `Provided`.
#[test]
fn test_table_with_offset() -> Result<()> {
    let mut reader = fastexcel::read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
        .context("could not read the excel file")?;
    let mut table = reader
        .load_table("TableAtD5", LoadSheetOrTableOptions::new_for_table())
        .context("could not load table \"TableAtD5\"")?;

    let available_columns = table
        .available_columns()
        .context("could not obtain available columns for table")?;
    let expected_column_info = vec![
        fastexcel::ColumnInfo {
            name: "Column at D5".into(),
            index: 0,
            absolute_index: 3,
            dtype: fastexcel::DType::Float,
            dtype_from: fastexcel::DTypeFrom::Guessed,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
        },
        fastexcel::ColumnInfo {
            name: "Column at E5".into(),
            index: 1,
            absolute_index: 4,
            dtype: fastexcel::DType::Float,
            dtype_from: fastexcel::DTypeFrom::Guessed,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
        },
    ];
    assert_eq!(available_columns, expected_column_info);

    Ok(())
}

================================================
FILE: tests/tables.rs
================================================
use anyhow::{Context, Result};
use chrono::NaiveDate;
use fastexcel::LoadSheetOrTableOptions;
use pretty_assertions::assert_eq;
use rstest::{fixture, rstest};

use crate::utils::path_for_fixture;

#[macro_use]
mod utils;

/// rstest fixture: a reader opened on `sheet-with-tables.xlsx`. Panics if the file cannot be
/// read.
#[fixture]
fn reader() -> fastexcel::ExcelReader {
    fastexcel::read_excel(path_for_fixture("sheet-with-tables.xlsx"))
        .expect("could not read excel file")
}

/// Checks `table_names` without a sheet filter, for a sheet expected to contain the "users"
/// table, and for a sheet expected to contain no table.
#[rstest]
#[case::all_sheets(None, vec!["users"])]
#[case::sheet_with_tables(Some("sheet1"), vec!["users"])]
#[case::sheet_without_tables(Some("sheet2"), vec![])]
fn test_table_names(
    mut reader: fastexcel::ExcelReader,
    #[case] sheet_name: Option<&str>,
    #[case] expected: Vec<&str>,
) -> Result<()> {
    let table_names = reader
        .table_names(sheet_name)
        .context("Failed to get table names")?;
    assert_eq!(table_names, expected);
    Ok(())
}

/// Loads the "users" table and checks its metadata, its column info and its data (and, with
/// the `polars` feature, the equivalent `DataFrame`).
#[rstest]
fn test_load_table(mut reader: fastexcel::ExcelReader) -> Result<()> {
    let mut table = reader
        .load_table("users", LoadSheetOrTableOptions::new_for_table())
        .context("Failed to load table")?;

    assert_eq!(table.name(), "users");
    assert_eq!(table.sheet_name(), "sheet1");
    assert!(table.specified_dtypes().is_none());
    assert_eq!(table.total_height(), 3);
    assert_eq!(table.offset(), 0);
    assert_eq!(table.height(), 3);
    assert_eq!(table.width(), 4);

    let available_columns = table
        .available_columns()
        .context("could not obtain available columns for table")?;

    let expected_column_info = vec![
        fastexcel::ColumnInfo {
            name: "User Id".into(),
            index: 0,
            absolute_index: 0,
            dtype: fastexcel::DType::Float,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "FirstName".into(),
            index: 1,
            absolute_index: 1,
            dtype: fastexcel::DType::String,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "LastName".into(),
            index: 2,
            absolute_index: 2,
            dtype: fastexcel::DType::String,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
        fastexcel::ColumnInfo {
            name: "Date".into(),
            index: 3,
            absolute_index: 3,
            dtype: fastexcel::DType::DateTime,
            column_name_from: fastexcel::ColumnNameFrom::Provided,
            dtype_from: fastexcel::DTypeFrom::Guessed,
        },
    ];
    assert_eq!(available_columns, expected_column_info);

    // Expected "Date" values, each at midnight.
    let dates = [
        NaiveDate::from_ymd_opt(2020, 1, 1)
            .unwrap()
            .and_hms_opt(0, 0, 0)
            .unwrap(),
        NaiveDate::from_ymd_opt(2024, 5, 4)
            .unwrap()
            .and_hms_opt(0, 0, 0)
            .unwrap(),
        NaiveDate::from_ymd_opt(2025, 2, 1)
            .unwrap()
            .and_hms_opt(0, 0, 0)
            .unwrap(),
    ];

    let expected_columns = fe_columns!(
        "User Id" => [1.0, 2.0, 5.0],
        "FirstName" => ["Peter", "John", "Hans"],
        "LastName" => ["Müller", "Meier", "Fricker"],
        "Date" => dates.as_slice(),
    );

    let table_columns = table
        .to_columns()
        .context("could not convert table to columns")?;
    assert_eq!(table_columns, expected_columns);

    #[cfg(feature = "polars")]
    {
        use polars_core::df;

        let expected_df = df!(
            "User Id" => [1.0, 2.0, 5.0],
            "FirstName" => ["Peter", "John", "Hans"],
            "LastName" => ["Müller", "Meier", "Fricker"],
            "Date" => dates.as_slice(),
        )?;

        let df = table
            .to_polars()
            .context("could not convert table to polars dataframe")?;

        assert!(df.equals_missing(&expected_df))
    }

    Ok(())
}

================================================
FILE: tests/utils/mod.rs
================================================
/// Returns the path of `fixture_file` inside `tests/fixtures`, rooted at the crate's
/// `CARGO_MANIFEST_DIR` (resolved at compile time).
pub fn path_for_fixture(fixture_file: &str) -> String {
    format!(
        "{}/tests/fixtures/{}",
        env!("CARGO_MANIFEST_DIR"),
        fixture_file
    )
}

// Builds a single `fastexcel::FastExcelColumn` from a name and a vec/array of values.
macro_rules!
fe_column { ($name:expr, $vec_or_arr:expr) => { fastexcel::FastExcelColumn::try_new($name.into(), $vec_or_arr.into(), None) .context("Failed to create column") }; } macro_rules! fe_columns { // (name => []) Any number of times but at least once, optionally followed by a comma ($($name:expr => $vec_or_arr:expr),+ $(,)?) => { vec![ $(fe_column!($name, $vec_or_arr)?),+ ] }; } ================================================ FILE: tests/whitespace.rs ================================================ #[macro_use] mod utils; use anyhow::{Context, Result}; use chrono::{NaiveDate, NaiveDateTime}; use fastexcel::{ExcelReader, LoadSheetOrTableOptions}; use pretty_assertions::assert_eq; use rstest::{fixture, rstest}; use crate::utils::path_for_fixture; #[fixture] fn reader() -> ExcelReader { fastexcel::read_excel(path_for_fixture("sheet-and-table-with-whitespace.xlsx")) .expect("could not read fixture") } const DATES: &[Option] = &[ Some( NaiveDate::from_ymd_opt(2025, 11, 19) .unwrap() .and_hms_opt(14, 34, 2) .unwrap(), ), Some( NaiveDate::from_ymd_opt(2025, 11, 20) .unwrap() .and_hms_opt(14, 56, 34) .unwrap(), ), Some( NaiveDate::from_ymd_opt(2025, 11, 21) .unwrap() .and_hms_opt(15, 19, 6) .unwrap(), ), None, Some( NaiveDate::from_ymd_opt(2025, 11, 22) .unwrap() .and_hms_opt(15, 41, 38) .unwrap(), ), Some( NaiveDate::from_ymd_opt(2025, 11, 23) .unwrap() .and_hms_opt(16, 4, 10) .unwrap(), ), None, None, None, None, ]; #[rstest] fn test_skip_tail_rows_behavior(mut reader: ExcelReader) -> Result<()> { let expected_columns_with_whitespace = fe_columns!( // String because the last row contains a space "Column One" => [Some("1"), Some("2"), Some("3"), None, Some("5"), None, None, None, None, Some(" ")], "Column Two" => [Some("one"), Some("two"), None, Some("four"), Some("five"), None, None, Some(""), None, None], "Column Three" => DATES, ); let expected_columns_without_whitespace = fe_columns!( // Not string rows -> float "Column One" => [Some(1.0), Some(2.0), Some(3.0), None, 
Some(5.0), None], "Column Two" => [Some("one"), Some("two"), None, Some("four"), Some("five"), None], "Column Three" => &DATES[0..6], ); let sheet = reader .load_sheet( "Without Table".into(), LoadSheetOrTableOptions::new_for_sheet(), ) .context(r#"could not load sheet "Without Table""#)?; let sheet_columns = sheet .to_columns() .context("could not convert sheet to columns")?; assert_eq!(sheet_columns, expected_columns_with_whitespace); let table = reader .load_table( "Table_with_whitespace", LoadSheetOrTableOptions::new_for_table(), ) .context(r#"could not load table "Table_with_whitespace""#)?; let table_columns = table .to_columns() .context("could not convert table to columns")?; assert_eq!(table_columns, expected_columns_with_whitespace); let sheet_without_tail_whitespace = reader .load_sheet( "Without Table".into(), LoadSheetOrTableOptions::new_for_sheet().skip_whitespace_tail_rows(true), ) .context(r#"could not load sheet "Without Table""#)?; let sheet_without_tail_whitespace_columns = sheet_without_tail_whitespace .to_columns() .context("could not convert sheet to columns")?; assert_eq!( sheet_without_tail_whitespace_columns, expected_columns_without_whitespace ); let table_without_tail_whitespace = reader .load_table( "Table_with_whitespace", LoadSheetOrTableOptions::new_for_table().skip_whitespace_tail_rows(true), ) .context(r#"could not load table "Table_with_whitespace""#)?; let table_columns_without_tail_whitespace = table_without_tail_whitespace .to_columns() .context("could not convert table to columns")?; assert_eq!( table_columns_without_tail_whitespace, expected_columns_without_whitespace ); Ok(()) } #[rstest] fn test_skip_tail_rows_and_whitespace_as_null_behavior(mut reader: ExcelReader) -> Result<()> { let expected_columns_with_whitespace_as_null = fe_columns!( // All rows should be taken into account but the space in the last row should be considered null "Column One" => [Some(1.0), Some(2.0), Some(3.0), None, Some(5.0), None, None, None, None, 
None], // All rows should be taken into account but the empty string in 8th row should be considered null "Column Two" => [Some("one"), Some("two"), None, Some("four"), Some("five"), None, None, None, None, None], "Column Three" => DATES, ); let expected_columns_without_whitespace = fe_columns!( "Column One" => [Some(1.0), Some(2.0), Some(3.0), None, Some(5.0), None], "Column Two" => [Some("one"), Some("two"), None, Some("four"), Some("five"), None], "Column Three" => &DATES[0..6], ); let sheet = reader .load_sheet( "Without Table".into(), LoadSheetOrTableOptions::new_for_sheet().whitespace_as_null(true), ) .context(r#"could not load sheet "Without Table""#)?; let sheet_columns = sheet .to_columns() .context("could not convert sheet to columns")?; assert_eq!(sheet_columns, expected_columns_with_whitespace_as_null); let table = reader .load_table( "Table_with_whitespace", LoadSheetOrTableOptions::new_for_table().whitespace_as_null(true), ) .context(r#"could not load table "Table_with_whitespace""#)?; let table_columns = table .to_columns() .context("could not convert table to columns")?; assert_eq!(table_columns, expected_columns_with_whitespace_as_null); let sheet_without_tail_whitespace = reader .load_sheet( "Without Table".into(), LoadSheetOrTableOptions::new_for_sheet() .whitespace_as_null(true) .skip_whitespace_tail_rows(true), ) .context(r#"could not load sheet "Without Table""#)?; let sheet_without_tail_whitespace_columns = sheet_without_tail_whitespace .to_columns() .context("could not convert sheet to columns")?; assert_eq!( sheet_without_tail_whitespace_columns, expected_columns_without_whitespace ); let table_without_tail_whitespace = reader .load_table( "Table_with_whitespace", LoadSheetOrTableOptions::new_for_table() .whitespace_as_null(true) .skip_whitespace_tail_rows(true), ) .context(r#"could not load table "Table_with_whitespace""#)?; let table_columns_without_tail_whitespace = table_without_tail_whitespace .to_columns() .context("could not convert 
table to columns")?; assert_eq!( table_columns_without_tail_whitespace, expected_columns_without_whitespace ); Ok(()) }