Repository: ToucanToco/fastexcel
Branch: main
Commit: 98bf33293c85
Files: 99
Total size: 46.9 MB
Directory structure:
gitextract_ze2pys5u/
├── .clippy.toml
├── .github/
│ ├── dependabot.yml
│ └── workflows/
│ ├── CI.yml
│ ├── docs.yml
│ └── release.yml
├── .gitignore
├── .pre-commit-config.yaml
├── Cargo.toml
├── LICENSE
├── Makefile
├── README.md
├── doc-templates/
│ └── module.html.jinja2
├── pyproject.toml
├── python/
│ ├── fastexcel/
│ │ ├── __init__.py
│ │ ├── _fastexcel.pyi
│ │ └── py.typed
│ └── tests/
│ ├── __init__.py
│ ├── benchmarks/
│ │ ├── README.md
│ │ ├── fixtures/
│ │ │ ├── formulas.xlsx
│ │ │ ├── plain_data.xls
│ │ │ └── plain_data.xlsx
│ │ ├── memory.py
│ │ ├── readers.py
│ │ └── speed.py
│ ├── conftest.py
│ ├── test_alias_generation.py
│ ├── test_column_selection.py
│ ├── test_defined_names.py
│ ├── test_dtypes.py
│ ├── test_durations.py
│ ├── test_eagerness.py
│ ├── test_empty.py
│ ├── test_errors.py
│ ├── test_fastexcel.py
│ ├── test_pycapsule.py
│ ├── test_sheet_visibility.py
│ ├── test_shifted_data.py
│ ├── test_tables.py
│ ├── test_whitespace.py
│ └── utils.py
├── scripts/
│ └── update_versions.py
├── src/
│ ├── data/
│ │ ├── cell_extractors.rs
│ │ ├── mod.rs
│ │ ├── python.rs
│ │ └── rust.rs
│ ├── error.rs
│ ├── lib.rs
│ ├── types/
│ │ ├── dtype/
│ │ │ ├── mod.rs
│ │ │ └── python.rs
│ │ ├── excelreader/
│ │ │ ├── mod.rs
│ │ │ └── python.rs
│ │ ├── excelsheet/
│ │ │ ├── column_info/
│ │ │ │ ├── mod.rs
│ │ │ │ └── python.rs
│ │ │ ├── mod.rs
│ │ │ ├── polars.rs
│ │ │ ├── python.rs
│ │ │ └── table.rs
│ │ ├── exceltable/
│ │ │ ├── mod.rs
│ │ │ └── python.rs
│ │ ├── idx_or_name/
│ │ │ ├── mod.rs
│ │ │ └── python.rs
│ │ └── mod.rs
│ └── utils/
│ ├── mod.rs
│ └── schema.rs
├── test.py
└── tests/
├── column_selection.rs
├── fastexcel.rs
├── fixtures/
│ ├── dates.ods
│ ├── decimal-numbers.xlsx
│ ├── div0.xlsx
│ ├── empty.ods
│ ├── empty.xlsx
│ ├── fixture-changing-header-location.xlsx
│ ├── fixture-invalid-cell-value-num.xlsx
│ ├── fixture-invalid-cell-value.xlsx
│ ├── fixture-multi-dtypes-columns.xlsx
│ ├── fixture-multi-sheet.xlsx
│ ├── fixture-sheets-different-visibilities.xlsx
│ ├── fixture-single-sheet-duplicated-columns.xlsx
│ ├── fixture-single-sheet-with-types.xlsx
│ ├── fixture-single-sheet.xlsx
│ ├── fixture-type-errors.xlsx
│ ├── infer-dtypes-fallback.xlsx
│ ├── no-header.xlsx
│ ├── null-bytes-in-columns-names.xls
│ ├── null-column.xlsx
│ ├── sheet-and-table-with-offset.xlsx
│ ├── sheet-and-table-with-whitespace.xlsx
│ ├── sheet-null-strings-empty.xlsx
│ ├── sheet-null-strings.xlsx
│ ├── sheet-with-defined-names.xlsx
│ ├── sheet-with-na.xlsx
│ ├── sheet-with-tables.xlsx
│ └── single-sheet-skip-rows-durations.xlsx
├── sheet_visibility.rs
├── shifted_data.rs
├── tables.rs
├── utils/
│ └── mod.rs
└── whitespace.rs
================================================
FILE CONTENTS
================================================
================================================
FILE: .clippy.toml
================================================
disallowed-macros = [
{ path = "std::assert_ne", reason = "use `pretty_assertions::assert_ne` instead" },
{ path = "std::assert_eq", reason = "use `pretty_assertions::assert_eq` instead" },
{ path = "std::assert_matches", reason = "use `pretty_assertions::assert_matches` instead" },
]
================================================
FILE: .github/dependabot.yml
================================================
version: 2
updates:
# python
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "daily"
labels:
- "dependencies"
- ":snake: python :snake:"
# rust
- package-ecosystem: "cargo"
directory: "/"
schedule:
interval: "daily"
groups:
prod-deps:
dependency-type: "production"
dev-deps:
dependency-type: "development"
labels:
- "dependencies"
- ":crab: rust :crab:"
# actions
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "daily"
================================================
FILE: .github/workflows/CI.yml
================================================
name: CI
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
env:
MIN_PYTHON_VERSION: "3.10"
defaults:
run:
# Prevents windows runners from running on powershell
shell: bash
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "${{ env.MIN_PYTHON_VERSION }}"
- name: Set up rust toolchain
uses: dtolnay/rust-toolchain@stable
with:
components: rustfmt, clippy
- name: Set up rustfmt
run: rustup component add rustfmt
- name: install uv
uses: astral-sh/setup-uv@v7
- name: Install dependencies and lint
run: |
make install
make lint
check-docs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "3.11"
- name: Set up rust toolchain
uses: dtolnay/rust-toolchain@stable
- name: install uv
uses: astral-sh/setup-uv@v7
- name: Check documentation
run: |
make install
make doc
test:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: ["3.10", "3.11", "3.12", "3.13", "3.14", "3.14t"]
os:
- "ubuntu-latest"
- "ubuntu-24.04-arm"
- "macos-14"
- "windows-latest"
# windows-11-arm excluded: pyarrow is not available for Windows ARM64
# https://github.com/apache/arrow/issues/47195
steps:
- uses: actions/checkout@v6
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
- name: Set up rust toolchain
uses: dtolnay/rust-toolchain@stable
- name: install uv
uses: astral-sh/setup-uv@v7
- name: Install dependencies and test
run: |
make install
make test
- name: Test with pandas<3
run: |
uv pip install "pandas<3"
make test-python
check-wheel-build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
# Only testing the build on the smallest supported Python version (used for the abi3 wheels) and on the free-threaded build (3.14t), which presumably needs its own non-abi3 wheel
python-version: ["3.10", "3.14t"]
os: ["ubuntu-latest", "macos-14", "windows-latest"]
architecture: [x86-64, aarch64]
exclude:
# Cross-compiling x86_64 → aarch64 on Windows doesn't work; use windows-11-arm instead
- os: windows-latest
architecture: aarch64
include:
# Windows ARM64: build natively on windows-11-arm (3.11 is the minimum available)
- os: windows-11-arm
python-version: "3.11"
architecture: aarch64
# TODO: re-enable once setup-python supports windows-11-arm + python 3.14t
# (setup-python is currently broken with that combination)
# - os: windows-11-arm
# python-version: "3.14t"
# architecture: aarch64
steps:
- uses: actions/checkout@v6
- uses: dtolnay/rust-toolchain@stable
- name: Set Rust target
id: target
run: |
TARGET=${{
(matrix.os == 'macos-14' && (matrix.architecture == 'aarch64' && 'aarch64-apple-darwin' || 'x86_64-apple-darwin'))
|| (matrix.os == 'ubuntu-latest' && (matrix.architecture == 'aarch64' && 'aarch64-unknown-linux-gnu' || 'x86_64-unknown-linux-gnu'))
|| (matrix.os == 'windows-latest' && 'x86_64-pc-windows-msvc')
|| (matrix.os == 'windows-11-arm' && 'aarch64-pc-windows-msvc')
}}
echo "target=$TARGET" >> $GITHUB_OUTPUT
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
- name: build (fast)
uses: PyO3/maturin-action@v1
with:
manylinux: auto
command: build
args: "-o dist -i python${{ matrix.python-version }}"
target: ${{ steps.target.outputs.target }}
- name: Upload wheels
uses: actions/upload-artifact@v7
with:
name: "wheels-${{ matrix.os }}-python-${{ matrix.python-version }}-${{ matrix.architecture }}"
path: dist
check-wheel-build-musllinux:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10", "3.14t"]
architecture: [x86-64, aarch64]
steps:
- uses: actions/checkout@v6
- uses: dtolnay/rust-toolchain@stable
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
- name: build (fast)
uses: PyO3/maturin-action@v1
with:
manylinux: musllinux_1_2
command: build
args: "-o dist -i python${{ matrix.python-version }}"
target: ${{ matrix.architecture == 'aarch64' && 'aarch64-unknown-linux-musl' || 'x86_64-unknown-linux-musl' }}
- name: Upload wheels
uses: actions/upload-artifact@v7
with:
name: "wheels-linux-musl-python-${{ matrix.python-version }}-${{ matrix.architecture }}"
path: dist
check-sdist-build:
runs-on: "ubuntu-latest"
steps:
- uses: actions/checkout@v6
- uses: dtolnay/rust-toolchain@stable
- name: build sdist
uses: PyO3/maturin-action@v1
with:
manylinux: auto
command: sdist
args: "-o dist"
- name: upload sdist
uses: actions/upload-artifact@v7
with:
name: sdist
path: dist
================================================
FILE: .github/workflows/docs.yml
================================================
name: Docs
on:
push:
branches:
- main
tags:
- 'v*'
workflow_dispatch:
inputs:
version_tag:
description: 'Tag to build docs for (e.g. v0.18.0). Checks out the tag before building.'
required: true
mark_as_stable:
description: 'Mark this version as the stable default (updates root redirect)'
type: boolean
default: false
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Checkout tag (workflow_dispatch)
if: github.event_name == 'workflow_dispatch'
env:
VERSION_TAG: ${{ github.event.inputs.version_tag }}
run: git checkout "refs/tags/$VERSION_TAG"
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "3.11"
- name: Set up rust toolchain
uses: dtolnay/rust-toolchain@stable
- name: install uv
uses: astral-sh/setup-uv@v7
- name: Determine version
id: version
env:
INPUT_VERSION_TAG: ${{ github.event.inputs.version_tag }}
INPUT_MARK_AS_STABLE: ${{ github.event.inputs.mark_as_stable }}
run: |
if [[ "$GITHUB_EVENT_NAME" == "workflow_dispatch" ]]; then
echo "version=$INPUT_VERSION_TAG" >> "$GITHUB_OUTPUT"
echo "is_stable=$INPUT_MARK_AS_STABLE" >> "$GITHUB_OUTPUT"
elif [[ "${GITHUB_REF}" == refs/tags/v* ]]; then
echo "version=${GITHUB_REF#refs/tags/}" >> "$GITHUB_OUTPUT"
echo "is_stable=true" >> "$GITHUB_OUTPUT"
else
echo "version=latest" >> "$GITHUB_OUTPUT"
echo "is_stable=false" >> "$GITHUB_OUTPUT"
fi
- name: Build docs
env:
VERSION: ${{ steps.version.outputs.version }}
run: |
make install
make doc-versioned
- name: Deploy to gh-pages
env:
VERSION: ${{ steps.version.outputs.version }}
IS_STABLE: ${{ steps.version.outputs.is_stable }}
run: |
git config user.name github-actions
git config user.email github-actions@github.com
# Stash built docs
cp -r "docs/$VERSION" /tmp/docs-build
# Switch to the gh-pages branch (it is expected to already exist)
git checkout gh-pages
git merge -m 'Merge main' origin/main
# Place versioned docs
rm -rf "docs/$VERSION"
mv /tmp/docs-build "docs/$VERSION"
# Update versions.json and root redirect
STABLE_FLAG=""
if [[ "$IS_STABLE" == "true" ]]; then
STABLE_FLAG="--stable"
fi
./scripts/update_versions.py \
--version "$VERSION" \
--docs-dir docs \
$STABLE_FLAG
git add -f docs
git commit -m "Update docs ($VERSION)" --allow-empty
git push origin gh-pages
================================================
FILE: .github/workflows/release.yml
================================================
name: Release
on:
push:
# Sequence of patterns matched against refs/tags
tags:
- 'v*' # Push events to tags matching v*, e.g. v1.0, v20.15.10
jobs:
linux:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10", "3.14t"]
architecture: [x86-64, aarch64]
steps:
- uses: actions/checkout@v6
- uses: dtolnay/rust-toolchain@stable
- uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
- name: build (release)
uses: PyO3/maturin-action@v1
with:
manylinux: auto
command: build
args: "--release -o dist -i python${{ matrix.python-version }}"
target: ${{ matrix.architecture == 'aarch64' && 'aarch64-unknown-linux-gnu' || null }}
- name: Upload wheels
uses: actions/upload-artifact@v7
with:
name: "wheels-linux-python-${{ matrix.python-version }}-${{ matrix.architecture }}"
path: dist
linux-musl:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10", "3.14t"]
architecture: [x86-64, aarch64]
steps:
- uses: actions/checkout@v6
- uses: dtolnay/rust-toolchain@stable
- uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
- name: build (release)
uses: PyO3/maturin-action@v1
with:
manylinux: musllinux_1_2
command: build
args: "--release -o dist -i python${{ matrix.python-version }}"
target: ${{ matrix.architecture == 'aarch64' && 'aarch64-unknown-linux-musl' || 'x86_64-unknown-linux-musl' }}
- name: Upload wheels
uses: actions/upload-artifact@v7
with:
name: "wheels-linux-musl-python-${{ matrix.python-version }}-${{ matrix.architecture }}"
path: dist
macos:
runs-on: macos-14
strategy:
matrix:
python-version: ["3.10", "3.14t"]
architecture: [x86-64, aarch64]
steps:
- uses: actions/checkout@v6
- uses: dtolnay/rust-toolchain@stable
- uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
- name: build (release)
uses: PyO3/maturin-action@v1
with:
command: build
args: "--release -o dist -i python${{ matrix.python-version }}"
target: ${{ matrix.architecture == 'aarch64' && 'aarch64-apple-darwin' || 'x86_64-apple-darwin' }}
- name: Upload wheels
uses: actions/upload-artifact@v7
with:
name: "wheels-macos-python-${{ matrix.python-version }}-${{ matrix.architecture }}"
path: dist
windows:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: ["3.10", "3.14t"]
os: [windows-latest]
architecture: [x86-64]
include:
# Windows ARM64: build natively on windows-11-arm (3.11 is the minimum available)
- os: windows-11-arm
python-version: "3.11"
architecture: aarch64
# TODO: re-enable once setup-python supports windows-11-arm + python 3.14t
# (setup-python is currently broken with that combination)
# - os: windows-11-arm
# python-version: "3.14t"
# architecture: aarch64
steps:
- uses: actions/checkout@v6
- uses: dtolnay/rust-toolchain@stable
- uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
- name: build (release)
uses: PyO3/maturin-action@v1
with:
command: build
args: "--release -o dist -i python${{ matrix.python-version }}"
target: ${{ matrix.architecture == 'aarch64' && 'aarch64-pc-windows-msvc' || 'x86_64-pc-windows-msvc' }}
- name: Upload wheels
uses: actions/upload-artifact@v7
with:
name: "wheels-windows-python-${{ matrix.python-version }}-${{ matrix.architecture }}"
path: dist
sdist:
runs-on: "ubuntu-latest"
steps:
- uses: actions/checkout@v6
- uses: dtolnay/rust-toolchain@stable
- uses: actions/setup-python@v6
with:
python-version: "3.10"
- name: build (sdist)
uses: PyO3/maturin-action@v1
with:
manylinux: auto
command: sdist
args: "-o dist"
- name: Upload sdist
uses: actions/upload-artifact@v7
with:
name: sdist
path: dist
# NOTE: Cannot use a matrix here, as we only want a single release
release:
name: Release
runs-on: ubuntu-latest
needs: [linux, linux-musl, macos, windows, sdist]
permissions:
id-token: write # Required for OIDC token exchange with crates.io
contents: write # Required to be able to create a GitHub release
steps:
- uses: actions/checkout@v6
- uses: dtolnay/rust-toolchain@stable
- uses: rust-lang/crates-io-auth-action@v1
id: auth
- name: Download Linux wheels
uses: actions/download-artifact@v8
with:
pattern: "wheels-linux-*"
merge-multiple: true
path: wheels-linux
- name: Download MacOS wheels
uses: actions/download-artifact@v8
with:
pattern: "wheels-macos-*"
merge-multiple: true
path: wheels-macos
- name: Download Windows wheels
uses: actions/download-artifact@v8
with:
pattern: "wheels-windows-*"
merge-multiple: true
path: wheels-windows
- name: Download sdist
uses: actions/download-artifact@v8
with:
name: "sdist"
path: sdist
- name: Publish to PyPI
uses: PyO3/maturin-action@v1
env:
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
with:
command: upload
args: "--skip-existing wheels-linux/*.whl wheels-macos/*.whl wheels-windows/*.whl sdist/*.tar.gz"
- name: Publish to crates.io
run: cargo publish
env:
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
- name: Release
uses: softprops/action-gh-release@v3
with:
generate_release_notes: true
files: |
wheels-linux/*.whl
wheels-macos/*.whl
wheels-windows/*.whl
sdist/*.tar.gz
================================================
FILE: .gitignore
================================================
/target
bigfile.*
__pycache__
*.pyc
*.so
*.dat
.DS_Store
.python-version
pyrightconfig.json
.venv
docs
.vscode
.idea
.benchmarks
notebooks
/python/tests/fixtures/~$*.xlsx
.zed
dist
================================================
FILE: .pre-commit-config.yaml
================================================
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.2.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- repo: http://github.com/doublify/pre-commit-rust
rev: v1.0
hooks:
- id: cargo-check
- repo: local
hooks:
- id: lint
name: Lint
entry: make lint
types_or: [python, rust]
language: system
pass_filenames: false
- id: format
name: Format
entry: make format
types_or: [python, rust]
language: system
pass_filenames: false
================================================
FILE: Cargo.toml
================================================
[package]
name = "fastexcel"
version = "0.20.2"
description = "A fast excel reader for Rust and Python"
rust-version = "1.85.0"
edition = "2024"
license = "MIT"
homepage = "https://github.com/ToucanToco/fastexcel"
repository = "https://github.com/ToucanToco/fastexcel.git"
readme = "README.md"
include = [
"/pyproject.toml",
"/README.md",
"/LICENSE",
"/Makefile",
"/src",
"/python/fastexcel",
"!__pycache__",
"!*.pyc",
"!*.so",
]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
name = "fastexcel"
crate-type = ["cdylib", "rlib"]
[dependencies]
arrow-array = { version = "^58", features = ["ffi"], optional = true }
arrow-pyarrow = { version = "^58", optional = true }
arrow-schema = { version = "^58", optional = true }
calamine = { version = "^0.35.0", features = ["chrono"] }
chrono = { version = "^0.4.40", default-features = false }
log = "^0.4"
polars-core = { version = ">=0.53", features = [
"dtype-date",
"dtype-datetime",
"dtype-duration",
], optional = true }
pyo3 = { version = "^0.28", features = ["abi3-py310"], optional = true }
pyo3-arrow = { version = "^0.17", default-features = false, optional = true }
pyo3-log = { version = "^0.13.3", optional = true }
[dev-dependencies]
anyhow = "1.0.102"
pretty_assertions = { version = "^1.4.1", features = ["unstable"] }
rstest = { version = "^0.26.1", default-features = false }
# NOTE: This is a hack to bypass pyo3 limitations when testing:
# https://pyo3.rs/v0.22.3/faq.html#i-cant-run-cargo-test-or-i-cant-build-in-a-cargo-workspace-im-having-linker-issues-like-symbol-not-found-or-undefined-reference-to-_pyexc_systemerror
[features]
default = []
__arrow = ["dep:arrow-schema", "dep:arrow-array"]
python = ["__arrow", "dep:pyo3", "dep:pyo3-log", "dep:pyo3-arrow"]
extension-module = ["pyo3/extension-module"]
polars = ["dep:polars-core"]
pyarrow = ["dep:arrow-pyarrow", "python"]
# Private features for internal usage, should not be used directly as they may
# change without notice
__pyo3-tests = [
# feature for tests only. This makes Python::with_gil auto-initialize Python
# interpreters, which allows us to instantiate Python objects in tests
# (see https://pyo3.rs/v0.22.3/features#auto-initialize)
"pyo3/auto-initialize",
"pyarrow",
]
__rust-tests-standalone = []
__rust-tests-polars = ["polars"]
# Private feature for maturin usage, should not be used directly
__maturin = ["extension-module", "pyarrow"]
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2024 ToucanToco
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: Makefile
================================================
.DEFAULT_GOAL := all
sources = python/fastexcel python/tests
export CARGO_TERM_COLOR=$(shell (test -t 0 && echo always) || echo auto)
.PHONY: .uv ## Check that uv is installed
.uv:
@uv -V || echo 'Please install uv: https://docs.astral.sh/uv/getting-started/installation/'
.PHONY: install ## Install the package & dependencies with debug build
install: .uv
uv sync --frozen --group all
uv run maturin develop --uv -E pyarrow,pandas,polars
.PHONY: install-prod ## Install the package & dependencies with release build
install-prod: .uv
uv sync --frozen --group all
uv run maturin develop --uv --release -E pyarrow,pandas,polars
.PHONY: setup-dev ## First-time setup: install + pre-commit hooks
setup-dev: install
uv run pre-commit install --install-hooks
.PHONY: rebuild-lockfiles ## Rebuild lockfiles from scratch, updating all dependencies
rebuild-lockfiles: .uv
uv lock --upgrade
cargo update
.PHONY: build-dev ## Build the development version of the package
build-dev:
uv run maturin build
.PHONY: build-wheel ## Build production wheel and install it
build-wheel:
@rm -rf target/wheels/
uv run maturin build --release
@wheel=$$(ls target/wheels/*.whl); uv pip install --force-reinstall "$$wheel[pandas,polars]"
.PHONY: lint-python ## Lint python source files
lint-python:
uv run ruff check $(sources)
uv run ruff format --check $(sources)
uv run mypy $(sources)
.PHONY: lint-rust ## Lint rust source files
lint-rust:
cargo fmt --all -- --check
# Rust
cargo clippy --tests -- -D warnings
# Python-related code
cargo clippy --features __maturin,__pyo3-tests --tests -- -D warnings
# Rust+polars
cargo clippy --features polars --tests -- -D warnings
.PHONY: lint ## Lint rust and python source files
lint: lint-python lint-rust
.PHONY: format-python ## Auto-format python source files
format-python:
uv run ruff check --fix $(sources)
uv run ruff format $(sources)
.PHONY: format-rust ## Auto-format rust source files
format-rust:
cargo fmt --all
cargo clippy --all-features --tests --fix --lib -p fastexcel --allow-dirty --allow-staged
.PHONY: format ## Auto-format python and rust source files
format: format-rust format-python
.PHONY: test-python ## Run python tests
test-python: install
uv run pytest
.PHONY: test-rust-pyo3 ## Run PyO3 rust tests
test-rust-pyo3:
# --lib to skip integration tests
cargo test --no-default-features --features __pyo3-tests --lib
.PHONY: test-rust-standalone ## Run standalone rust tests
test-rust-standalone:
cargo test --no-default-features --features __rust-tests-standalone
.PHONY: test-rust-polars ## Run polars rust tests
test-rust-polars:
cargo test --no-default-features --features __rust-tests-polars
.PHONY: test-rust ## Run rust tests
test-rust: test-rust-pyo3 test-rust-standalone test-rust-polars
.PHONY: test ## Run all tests
test: test-rust test-python
.PHONY: doc-serve ## Serve documentation with live reload
doc-serve: build-dev
uv run pdoc --template-directory doc-templates python/fastexcel
.PHONY: doc ## Build documentation
doc: build-dev
uv run pdoc --template-directory doc-templates -o docs/latest python/fastexcel
uv run scripts/update_versions.py --version latest --docs-dir docs
cargo doc --no-deps --lib -p fastexcel --features polars
.PHONY: doc-versioned ## Build versioned documentation (CI usage: VERSION=v0.19.0 [STABLE=1] make doc-versioned)
doc-versioned: build-dev
@test -n "$(VERSION)" || (echo "ERROR: VERSION is not set. Usage: VERSION=v0.19.0 [STABLE=1] make doc-versioned" && exit 1)
uv run pdoc --template-directory doc-templates -o docs/$(VERSION) python/fastexcel
uv run scripts/update_versions.py --version $(VERSION) --docs-dir docs $(if $(filter 1,$(STABLE)),--stable,)
.PHONY: all ## Run the standard set of checks performed in CI
all: format build-dev lint test
.PHONY: benchmarks ## Run benchmarks
benchmarks: build-wheel
uv run pytest ./python/tests/benchmarks/speed.py
.PHONY: clean ## Clear local caches and build artifacts
clean:
rm -rf `find . -name __pycache__`
rm -f `find . -type f -name '*.py[co]' `
rm -f `find . -type f -name '*~' `
rm -f `find . -type f -name '.*~' `
rm -rf .cache
rm -rf htmlcov
rm -rf .pytest_cache
rm -rf *.egg-info
rm -f .coverage
rm -f .coverage.*
rm -rf build
rm -rf perf.data*
rm -rf python/fastexcel/*.so
.PHONY: help ## Display this message
help:
@grep -E \
'^.PHONY: .*?## .*$$' $(MAKEFILE_LIST) | \
sort | \
awk 'BEGIN {FS = ".PHONY: |## "}; {printf "\033[36m%-19s\033[0m %s\n", $$2, $$3}'
================================================
FILE: README.md
================================================
# `fastexcel`
A fast excel file reader for Python and Rust.
Docs:
* [Python](https://fastexcel.toucantoco.dev/).
* [Rust](https://docs.rs/fastexcel).
## Stability
The Python library is considered production-ready. The API is mostly stable, and we avoid breaking changes as much as
possible. v1.0.0 will be released once the [milestone](https://github.com/ToucanToco/fastexcel/milestone/2) is reached.
> ⚠️ The free-threaded build is still considered experimental.
The Rust crate is still experimental, and breaking changes are to be expected.
## Installation
```bash
# Lightweight installation (no PyArrow dependency)
pip install fastexcel
# With Polars support only (no PyArrow needed)
pip install fastexcel[polars]
# With Pandas support (includes PyArrow)
pip install fastexcel[pandas]
# With PyArrow support
pip install fastexcel[pyarrow]
# With all integrations
pip install fastexcel[pandas,polars]
```
## Quick Start
### Modern usage (recommended)
FastExcel supports the [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) for zero-copy data exchange with libraries like Polars, without requiring pyarrow as a dependency.
Use fastexcel with any Arrow-compatible library without requiring pyarrow.
```python
import fastexcel
# Load an Excel file
reader = fastexcel.read_excel("data.xlsx")
sheet = reader.load_sheet(0) # Load first sheet
# Use with Polars (zero-copy, no pyarrow needed)
import polars as pl
df = pl.DataFrame(sheet) # Direct PyCapsule interface
print(df)
# Or use the to_polars() method (also via PyCapsule)
df = sheet.to_polars()
print(df)
# Or access the raw Arrow data via PyCapsule interface
schema = sheet.__arrow_c_schema__()
array_data = sheet.__arrow_c_array__()
```
### Traditional usage (with pandas/pyarrow)
```python
import fastexcel
reader = fastexcel.read_excel("data.xlsx")
sheet = reader.load_sheet(0)
# Convert to pandas (requires `pandas` extra)
df = sheet.to_pandas()
# Or get pyarrow RecordBatch directly
record_batch = sheet.to_arrow()
```
### Working with tables
```python
import fastexcel
import polars as pl
reader = fastexcel.read_excel("data.xlsx")
# List available tables
tables = reader.table_names()
print(f"Available tables: {tables}")
# Load a specific table
table = reader.load_table("MyTable")
df = pl.DataFrame(table) # Zero-copy via PyCapsule, no pyarrow needed
```
## Key Features
- **Zero-copy data exchange** via [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html)
- **Flexible dependencies** - use with Polars (no PyArrow needed) or Pandas (includes PyArrow)
- **Seamless Polars integration** - `pl.DataFrame(sheet)` and `sheet.to_polars()` work without PyArrow via PyCapsule interface
- **High performance** - written in Rust with [calamine](https://github.com/tafia/calamine) and [Apache Arrow](https://arrow.apache.org/)
- **Memory efficient** - lazy loading and optional eager evaluation
- **Type safety** - automatic type inference with manual override options
## Contributing & Development
### Prerequisites
You'll need:
1. **[Rust](https://rustup.rs/)** - Rust stable or nightly
2. **[uv](https://docs.astral.sh/uv/getting-started/installation/)** - Fast Python package manager (will install Python 3.10+ automatically)
3. **[git](https://git-scm.com/)** - For version control
4. **[make](https://www.gnu.org/software/make/)** - For running development commands
**Python Version Management:**
uv handles Python installation automatically. To use a specific Python version:
```bash
uv python install 3.13 # Install Python 3.13
uv python pin 3.13 # Pin project to Python 3.13
```
### Quick Start
```bash
# Clone the repository (or from your fork)
git clone https://github.com/ToucanToco/fastexcel.git
cd fastexcel
# First-time setup: install dependencies, build debug version, and setup pre-commit hooks
make setup-dev
```
Verify your installation by running:
```bash
make
```
This runs a full development cycle: formatting, building, linting, and testing.
### Development Commands
Run `make help` to see all available commands, or use these common ones:
```bash
make all # full dev cycle: format, build, lint, test
make install # install with debug build (daily development)
make install-prod # install with release build (benchmarking)
make test # to run the tests
make lint # to run the linter
make format # to format python and rust code
make doc-serve # to serve the documentation locally
```
### Useful Resources
* [`python/fastexcel/_fastexcel.pyi`](./python/fastexcel/_fastexcel.pyi) - Python API types
* [`python/tests/`](./python/tests) - Comprehensive usage examples
## Benchmarking
For benchmarking, use `make benchmarks` which automatically builds an optimised wheel.
This is required for profiling, as dev mode builds are much slower.
### Speed benchmarks
```bash
make benchmarks
```
### Memory profiling
```bash
mprof run -T 0.01 python python/tests/benchmarks/memory.py python/tests/benchmarks/fixtures/plain_data.xls
```
## Creating a release
1. Create a PR containing a commit that only updates the version in `Cargo.toml`.
2. Once it is approved, squash and merge it into main.
3. Tag the squashed commit with the new version (e.g. `v0.20.2`), and push the tag.
4. The `release` GitHub action will take care of the rest.
## Dev tips
* Use `cargo check` to verify that your rust code compiles, no need to go through `maturin` every time
* `cargo clippy` = 💖
* Be careful with arrow constructors: they tend to allocate a lot
* [`mprof`](https://github.com/pythonprofilers/memory_profiler) and `time` go a long way for perf checks,
no need to go fancy right from the start
================================================
FILE: doc-templates/module.html.jinja2
================================================
{% extends "default/module.html.jinja2" %}
{% block nav_title %}
{{ super() }}
{% endblock %}
================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["maturin>=1.7.0,<2.0"]
build-backend = "maturin"
[project]
name = "fastexcel"
description = "A fast excel file reader for Python, written in Rust"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.10"
classifiers = [
"Development Status :: 5 - Production/Stable",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Rust",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Free Threading :: 1 - Unstable"
]
dependencies = ["typing-extensions>=4.0.0; python_version<'3.10'"]
dynamic = ["version"]
[project.optional-dependencies]
pyarrow = ["pyarrow>=8.0.0"]
pandas = ["pandas>=1.4.4", "pyarrow>=8.0.0"]
polars = ["polars>=1"]
[dependency-groups]
dev = ["maturin>=1.7.0,<2.0"]
testing = [
{ include-group = "dev" },
"pytest>=7.1.3",
"pytest-benchmark>=4.0.0,<6",
"pytest-mock>=3.1",
"pyarrow>=8.0.0",
"pandas>=1.4.4",
"polars>=0.16.14",
"openpyxl>=3.1.2,<4",
"xlrd>=2.0.1,<3",
]
linting = [
{ include-group = "dev" },
"mypy>=2,<3",
"pre-commit>=2.20.0,<5",
"ruff>=0.15",
]
docs = [{ include-group = "dev" }, "pdoc"]
all = [
{ include-group = "testing" },
{ include-group = "linting" },
{ include-group = "docs" },
]
[project.urls]
"Source Code" = "https://github.com/ToucanToco/fastexcel"
# Link to the issue tracker rather than to the repository landing page
Issues = "https://github.com/ToucanToco/fastexcel/issues"
[tool.maturin]
python-source = "python"
module-name = "fastexcel._fastexcel"
features = ["__maturin"]
[tool.mypy]
python_version = "3.10"
follow_imports = "silent"
ignore_missing_imports = true
# A few custom options
show_error_codes = true
warn_no_return = true
warn_unused_configs = true
warn_unused_ignores = true
[tool.pytest.ini_options]
testpaths = "python/tests"
log_cli = true
log_cli_level = "INFO"
[tool.ruff]
line-length = 100
target-version = "py310"
[tool.ruff.lint]
# Enable pycodestyle (`E`), Pyflakes (`F`), isort (`I`), flake8-quotes (`Q`), `FA102` and pyupgrade (`UP`) rules.
select = ["E", "F", "I", "Q", "FA102", "UP"]
[tool.uv]
# this ensures that `uv run` doesn't actually build the package; a `make`
# command is needed to build
package = false
required-version = '>=0.8.4'
================================================
FILE: python/fastexcel/__init__.py
================================================
from __future__ import annotations
import typing
from collections.abc import Callable
from typing import TYPE_CHECKING, Literal, TypeAlias
if TYPE_CHECKING:
import pandas as pd
import polars as pl
import pyarrow as pa
from os.path import expanduser
from pathlib import Path
import importlib.util

# `find_spec` does not raise when a top-level module is missing: it returns `None`.
# Its return value must therefore be inspected, otherwise the flag would always be `True`
# and the helpful `ImportError` raised by the `to_arrow*` methods would never trigger.
try:
    _PYARROW_AVAILABLE = importlib.util.find_spec("pyarrow") is not None
except (ImportError, ValueError):
    # `find_spec` raises `ValueError` when `sys.modules["pyarrow"].__spec__` is `None`;
    # `ImportError` is kept as a safety net. In both cases pyarrow is treated as unavailable.
    _PYARROW_AVAILABLE = False
from ._fastexcel import (
ArrowError,
CalamineCellError,
CalamineError,
CannotRetrieveCellDataError,
CellError,
CellErrors,
ColumnInfo,
ColumnInfoNoDtype,
ColumnNotFoundError,
DefinedName,
FastExcelError,
InvalidParametersError,
SheetNotFoundError,
UnsupportedColumnTypeCombinationError,
__version__,
_ExcelReader,
_ExcelSheet,
_ExcelTable,
)
from ._fastexcel import read_excel as _read_excel
# Names of the dtypes accepted by the `dtypes` parameters and reported by `ColumnInfo.dtype`
DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
# Mapping from a column name or a column index to the dtype to use for that column
DTypeMap: TypeAlias = "dict[str | int, DType]"
# Possible values of `ColumnInfo.column_name_from`
ColumnNameFrom: TypeAlias = Literal["provided", "looked_up", "generated"]
# Possible values of `ColumnInfo.dtype_from`
DTypeFrom: TypeAlias = Literal[
"provided_for_all", "provided_by_index", "provided_by_name", "guessed"
]
# Possible values of `ExcelSheet.visible`
SheetVisible: TypeAlias = Literal["visible", "hidden", "veryhidden"]
class ExcelSheet:
    """Wrapper exposing a single sheet of an Excel file"""

    def __init__(self, sheet: _ExcelSheet) -> None:
        # Every accessor below forwards to this underlying sheet object
        self._sheet = sheet

    @property
    def name(self) -> str:
        """Name of the sheet"""
        return self._sheet.name

    @property
    def width(self) -> int:
        """Width of the sheet"""
        return self._sheet.width

    @property
    def height(self) -> int:
        """Height of the sheet, once `skip_rows` and `nrows` have been applied"""
        return self._sheet.height

    @property
    def total_height(self) -> int:
        """Total height of the sheet"""
        return self._sheet.total_height

    @property
    def selected_columns(self) -> list[ColumnInfo]:
        """Columns selected for this sheet"""
        return self._sheet.selected_columns

    def available_columns(self) -> list[ColumnInfo]:
        """Columns available in this sheet"""
        return self._sheet.available_columns()

    @property
    def specified_dtypes(self) -> DTypeMap | None:
        """Dtypes that were specified for this sheet, if any"""
        return self._sheet.specified_dtypes

    @property
    def visible(self) -> SheetVisible:
        """Visibility of the sheet"""
        return self._sheet.visible

    def to_arrow(self) -> pa.RecordBatch:
        """Converts the sheet to a pyarrow `RecordBatch`

        Requires the `pyarrow` extra to be installed.

        :raises ImportError: if `pyarrow` is not available.
        """
        if _PYARROW_AVAILABLE:
            return self._sheet.to_arrow()
        raise ImportError(
            "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'"
        )

    def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors | None]:
        """Converts the sheet to a pyarrow `RecordBatch` with error information.

        Stores the positions of any values that cannot be parsed as the specified type and were
        therefore converted to None. The second element of the returned tuple is `None` when no
        such error was recorded.

        Requires the `pyarrow` extra to be installed.

        :raises ImportError: if `pyarrow` is not available.
        """
        if not _PYARROW_AVAILABLE:
            raise ImportError(
                "pyarrow is required for to_arrow_with_errors(). Install with: pip install 'fastexcel[pyarrow]'"  # noqa: E501
            )
        batch, errors = self._sheet.to_arrow_with_errors()
        # An empty error list is reported as `None` rather than as an empty container
        return (batch, errors if errors.errors else None)

    def to_pandas(self) -> pd.DataFrame:
        """Converts the sheet to a Pandas `DataFrame`.

        Requires the `pandas` extra to be installed.
        """
        # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__
        # which we don't implement. Using pyarrow conversion for now.
        # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html)
        record_batch = self.to_arrow()
        return record_batch.to_pandas()

    def to_polars(self) -> pl.DataFrame:
        """Converts the sheet to a Polars `DataFrame`.

        Uses the Arrow PyCapsule Interface for zero-copy data exchange.
        Requires the `polars` extra to be installed.
        """
        # Imported lazily so that polars stays an optional dependency
        from polars import DataFrame

        return DataFrame(self)

    def __arrow_c_schema__(self) -> object:
        """Export the schema as an `ArrowSchema` `PyCapsule`.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export

        The Arrow PyCapsule Interface enables zero-copy data exchange with
        Arrow-compatible libraries without requiring PyArrow as a dependency.
        """
        return self._sheet.__arrow_c_schema__()

    def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
        """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.

        The optional `requested_schema` parameter allows for potential schema conversion.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export

        The Arrow PyCapsule Interface enables zero-copy data exchange with
        Arrow-compatible libraries without requiring PyArrow as a dependency.
        """
        return self._sheet.__arrow_c_array__(requested_schema)

    def __repr__(self) -> str:
        # The representation is the one of the underlying sheet object
        return self._sheet.__repr__()
class ExcelTable:
    """A class representing a single table in an Excel file"""

    def __init__(self, table: _ExcelTable) -> None:
        # Every accessor below forwards to this underlying table object
        self._table = table

    @property
    def name(self) -> str:
        """The name of the table"""
        return self._table.name

    @property
    def sheet_name(self) -> str:
        """The name of the sheet this table belongs to"""
        return self._table.sheet_name

    @property
    def width(self) -> int:
        """The table's width"""
        return self._table.width

    @property
    def height(self) -> int:
        """The table's height"""
        return self._table.height

    @property
    def total_height(self) -> int:
        """The table's total height"""
        return self._table.total_height

    @property
    def offset(self) -> int:
        """The table's offset before data starts"""
        return self._table.offset

    @property
    def selected_columns(self) -> list[ColumnInfo]:
        """The table's selected columns"""
        return self._table.selected_columns

    def available_columns(self) -> list[ColumnInfo]:
        """The columns available for the given table"""
        return self._table.available_columns()

    @property
    def specified_dtypes(self) -> DTypeMap | None:
        """The dtypes specified for the table"""
        return self._table.specified_dtypes

    def to_arrow(self) -> pa.RecordBatch:
        """Converts the table to a pyarrow `RecordBatch`

        Requires the `pyarrow` extra to be installed.

        :raises ImportError: if `pyarrow` is not available.
        """
        if not _PYARROW_AVAILABLE:
            raise ImportError(
                "pyarrow is required for to_arrow(). Install with: pip install 'fastexcel[pyarrow]'"
            )
        return self._table.to_arrow()

    def to_pandas(self) -> pd.DataFrame:
        """Converts the table to a Pandas `DataFrame`.

        Requires the `pandas` extra to be installed.
        """
        # Note: pandas PyCapsule interface requires __dataframe__ or __arrow_c_stream__
        # which we don't implement. Using pyarrow conversion for now.
        # (see https://pandas.pydata.org/docs/reference/api/pandas.api.interchange.from_dataframe.html)
        return self.to_arrow().to_pandas()

    def to_polars(self) -> pl.DataFrame:
        """Converts the table to a Polars `DataFrame`.

        Uses the Arrow PyCapsule Interface for zero-copy data exchange.
        Requires the `polars` extra to be installed.
        """
        # Imported lazily so that polars stays an optional dependency
        import polars as pl

        return pl.DataFrame(self)

    def __arrow_c_schema__(self) -> object:
        """Export the schema as an `ArrowSchema` `PyCapsule`.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export

        The Arrow PyCapsule Interface enables zero-copy data exchange with
        Arrow-compatible libraries without requiring PyArrow as a dependency.
        """
        return self._table.__arrow_c_schema__()

    def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
        """Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.

        The optional `requested_schema` parameter allows for potential schema conversion.

        https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export

        The Arrow PyCapsule Interface enables zero-copy data exchange with
        Arrow-compatible libraries without requiring PyArrow as a dependency.
        """
        return self._table.__arrow_c_array__(requested_schema)

    def __repr__(self) -> str:
        # Delegate to the underlying table, like `ExcelSheet` and `ExcelReader` do for theirs
        return self._table.__repr__()
class ExcelReader:
    """A class representing an open Excel file and allowing to read its sheets"""

    def __init__(self, reader: _ExcelReader) -> None:
        # Every method below forwards to this underlying reader object
        self._reader = reader

    @property
    def sheet_names(self) -> list[str]:
        """The list of sheet names"""
        return self._reader.sheet_names

    @typing.overload
    def load_sheet(
        self,
        idx_or_name: int | str,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: Literal[False] = ...,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> ExcelSheet: ...

    @typing.overload
    def load_sheet(
        self,
        idx_or_name: int | str,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: Literal[True] = ...,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> pa.RecordBatch: ...

    def load_sheet(
        self,
        idx_or_name: int | str,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: bool = False,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> ExcelSheet | pa.RecordBatch:
        """Loads a sheet by index or name.

        :param idx_or_name: The index (starting at 0) or the name of the sheet to load.
        :param header_row: The index of the row containing the column labels, default index is 0.
                           If `None`, the sheet does not have any column labels.
                           Any rows before the `header_row` will be automatically skipped.
        :param column_names: Overrides headers found in the document.
                             If `column_names` is used, `header_row` will be ignored.
        :param n_rows: Specifies how many rows should be loaded.
                       If `None`, all rows are loaded
        :param skip_rows: Specifies which rows should be skipped after the `header_row`.
                          Any rows before the `header_row` are automatically skipped.
                          It means row indices are relative to data rows, not the sheet!
                          Can be one of:
                          - `int`: Skip this many rows after the header row
                          - `list[int]`: Skip specific row indices (0-based relative to data rows)
                          - `Callable[[int], bool]`: Function that receives row index (0-based
                            relative to data rows) and returns True to skip the row
                          - `None`: If `header_row` is None, skips empty rows at beginning
        :param schema_sample_rows: Specifies how many rows should be used to determine
                                   the dtype of a column. Cannot be 0. A specific dtype can be
                                   enforced for some or all columns through the `dtypes` parameter.
                                   If `None`, all rows will be used.
        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
                               will try to coerce different dtypes in a column to the same one,
                               whereas `strict` will raise an error in case a column contains
                               several dtypes. Note that this only applies to columns whose dtype
                               is guessed, i.e. not specified via `dtypes`.
        :param use_columns: Specifies the columns to use. Can either be:
                            - `None` to select all columns
                            - A list of strings and ints, the column names and/or indices
                              (starting at 0)
                            - A string, a comma separated list of Excel column letters and column
                              ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
                              `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges
                              (e.g. `"B:"` to select all columns from B onwards) and from-beginning
                              ranges (e.g. `":C"` to select columns from A to C). These can be
                              combined for "except" patterns (e.g. `":C,E:"` to select everything
                              except column D)
                            - A callable, a function that takes a column and returns a boolean
                              indicating whether the column should be used
        :param dtypes: An optional dtype (for all columns)
                       or dict of dtypes with keys as column indices or names.
        :param eager: Specifies whether the sheet should be loaded eagerly.
                      `False` (default) will load the sheet lazily using the `PyCapsule` interface,
                      whereas `True` will load it eagerly via `pyarrow`.

                      Eager loading requires the `pyarrow` extra to be installed.
        :param skip_whitespace_tail_rows: Skip rows at the end of the sheet
                                          containing only whitespace and null values.
        :param whitespace_as_null: Consider cells containing only whitespace as null values.
        """
        sheet_or_rb = self._reader.load_sheet(
            idx_or_name=idx_or_name,
            header_row=header_row,
            column_names=column_names,
            skip_rows=skip_rows,
            n_rows=n_rows,
            schema_sample_rows=schema_sample_rows,
            dtype_coercion=dtype_coercion,
            use_columns=use_columns,
            dtypes=dtypes,
            eager=eager,
            skip_whitespace_tail_rows=skip_whitespace_tail_rows,
            whitespace_as_null=whitespace_as_null,
        )
        # Eager loading already yields a record batch; lazy loading yields a sheet object
        # that gets wrapped in the public `ExcelSheet` class.
        return sheet_or_rb if eager else ExcelSheet(sheet_or_rb)

    def table_names(self, sheet_name: str | None = None) -> list[str]:
        """The list of table names.

        Will return an empty list if no tables are found.

        :param sheet_name: If given, will limit the list to the given sheet, will be faster
                           too.
        """
        return self._reader.table_names(sheet_name)

    def defined_names(self) -> list[DefinedName]:
        """The list of defined names (named ranges) in the workbook.

        Returns a list of DefinedName objects with 'name' and 'formula' attributes.
        The formula is a string representation of the range or expression.

        Will return an empty list if no defined names are found.
        """
        return self._reader.defined_names()

    @typing.overload
    def load_table(
        self,
        name: str,
        *,
        header_row: int | None = None,
        column_names: list[str] | None = None,
        skip_rows: int | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: Literal[False] = ...,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> ExcelTable: ...

    @typing.overload
    def load_table(
        self,
        name: str,
        *,
        header_row: int | None = None,
        column_names: list[str] | None = None,
        skip_rows: int | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: Literal[True] = ...,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> pa.RecordBatch: ...

    def load_table(
        self,
        name: str,
        *,
        header_row: int | None = None,
        column_names: list[str] | None = None,
        skip_rows: int | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        eager: bool = False,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> ExcelTable | pa.RecordBatch:
        """Loads a table by name.

        :param name: The name of the table to load.
        :param header_row: The index of the row containing the column labels.
                           If `None`, the table's column names will be used.
                           Any rows before the `header_row` will be automatically skipped.
        :param column_names: Overrides headers found in the document.
                             If `column_names` is used, `header_row` will be ignored.
        :param n_rows: Specifies how many rows should be loaded.
                       If `None`, all rows are loaded
        :param skip_rows: Specifies how many rows should be skipped after the `header_row`.
                          Any rows before the `header_row` are automatically skipped.
                          If `header_row` is `None`, it skips the number of rows from the
                          start of the sheet.
        :param schema_sample_rows: Specifies how many rows should be used to determine
                                   the dtype of a column. Cannot be 0. A specific dtype can be
                                   enforced for some or all columns through the `dtypes` parameter.
                                   If `None`, all rows will be used.
        :param dtype_coercion: Specifies how type coercion should behave. `coerce` (the default)
                               will try to coerce different dtypes in a column to the same one,
                               whereas `strict` will raise an error in case a column contains
                               several dtypes. Note that this only applies to columns whose dtype
                               is guessed, i.e. not specified via `dtypes`.
        :param use_columns: Specifies the columns to use. Can either be:
                            - `None` to select all columns
                            - A list of strings and ints, the column names and/or indices
                              (starting at 0)
                            - A string, a comma separated list of Excel column letters and column
                              ranges (e.g. `"A:E"` or `"A,C,E:F"`, which would result in
                              `A,B,C,D,E` and `A,C,E,F`). Also supports open-ended ranges
                              (e.g. `"B:"` to select all columns from B onwards) and from-beginning
                              ranges (e.g. `":C"` to select columns from A to C). These can be
                              combined for "except" patterns (e.g. `":C,E:"` to select everything
                              except column D)
                            - A callable, a function that takes a column and returns a boolean
                              indicating whether the column should be used
        :param dtypes: An optional dtype (for all columns)
                       or dict of dtypes with keys as column indices or names.
        :param eager: Specifies whether the table should be loaded eagerly.
                      `False` (default) will load the table lazily using the `PyCapsule` interface,
                      whereas `True` will load it eagerly via `pyarrow`.

                      Eager loading requires the `pyarrow` extra to be installed.
        :param skip_whitespace_tail_rows: Skip rows at the end of the table
                                          containing only whitespace and null values.
        :param whitespace_as_null: Consider cells containing only whitespace as null values.
        """
        # The two branches only differ by the literal passed as `eager`: the underlying
        # `load_table` is only typed for literal `True` / `False`, so it is called once per case.
        if eager:
            return self._reader.load_table(
                name=name,
                header_row=header_row,
                column_names=column_names,
                skip_rows=skip_rows,
                n_rows=n_rows,
                schema_sample_rows=schema_sample_rows,
                dtype_coercion=dtype_coercion,
                use_columns=use_columns,
                dtypes=dtypes,
                eager=True,
                skip_whitespace_tail_rows=skip_whitespace_tail_rows,
                whitespace_as_null=whitespace_as_null,
            )
        else:
            return ExcelTable(
                self._reader.load_table(
                    name=name,
                    header_row=header_row,
                    column_names=column_names,
                    skip_rows=skip_rows,
                    n_rows=n_rows,
                    schema_sample_rows=schema_sample_rows,
                    dtype_coercion=dtype_coercion,
                    use_columns=use_columns,
                    dtypes=dtypes,
                    eager=False,
                    skip_whitespace_tail_rows=skip_whitespace_tail_rows,
                    whitespace_as_null=whitespace_as_null,
                )
            )

    def load_sheet_eager(
        self,
        idx_or_name: int | str,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> pa.RecordBatch:
        """Loads a sheet eagerly by index or name.

        For xlsx files, this will be faster and more memory-efficient, as it will use
        `worksheet_range_ref` under the hood, which returns borrowed types.

        Refer to `load_sheet` for parameter documentation

        Requires the `pyarrow` extra to be installed.
        """
        return self._reader.load_sheet(
            idx_or_name=idx_or_name,
            header_row=header_row,
            column_names=column_names,
            skip_rows=skip_rows,
            n_rows=n_rows,
            schema_sample_rows=schema_sample_rows,
            dtype_coercion=dtype_coercion,
            use_columns=use_columns,
            dtypes=dtypes,
            eager=True,
            skip_whitespace_tail_rows=skip_whitespace_tail_rows,
            whitespace_as_null=whitespace_as_null,
        )

    def load_sheet_by_name(
        self,
        name: str,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> ExcelSheet:
        """Loads a sheet by name.

        Refer to `load_sheet` for parameter documentation
        """
        return self.load_sheet(
            name,
            header_row=header_row,
            column_names=column_names,
            skip_rows=skip_rows,
            n_rows=n_rows,
            schema_sample_rows=schema_sample_rows,
            dtype_coercion=dtype_coercion,
            use_columns=use_columns,
            dtypes=dtypes,
            skip_whitespace_tail_rows=skip_whitespace_tail_rows,
            whitespace_as_null=whitespace_as_null,
        )

    def load_sheet_by_idx(
        self,
        idx: int,
        *,
        header_row: int | None = 0,
        column_names: list[str] | None = None,
        skip_rows: int | list[int] | Callable[[int], bool] | None = None,
        n_rows: int | None = None,
        schema_sample_rows: int | None = 1_000,
        dtype_coercion: Literal["coerce", "strict"] = "coerce",
        use_columns: list[str]
        | list[int]
        | str
        | Callable[[ColumnInfoNoDtype], bool]
        | None = None,
        dtypes: DType | DTypeMap | None = None,
        skip_whitespace_tail_rows: bool = False,
        whitespace_as_null: bool = False,
    ) -> ExcelSheet:
        """Loads a sheet by index.

        Refer to `load_sheet` for parameter documentation
        """
        return self.load_sheet(
            idx,
            header_row=header_row,
            column_names=column_names,
            skip_rows=skip_rows,
            n_rows=n_rows,
            schema_sample_rows=schema_sample_rows,
            dtype_coercion=dtype_coercion,
            use_columns=use_columns,
            dtypes=dtypes,
            skip_whitespace_tail_rows=skip_whitespace_tail_rows,
            whitespace_as_null=whitespace_as_null,
        )

    def __repr__(self) -> str:
        # The representation is the one of the underlying reader object
        return self._reader.__repr__()
def read_excel(source: Path | str | bytes) -> ExcelReader:
    """Opens and loads an excel file.

    A leading `~` in a `str` or `Path` source is expanded to the user's home directory;
    any other kind of source is handed over untouched.

    :param source: The path to a file or its content as bytes
    """
    is_path_like = isinstance(source, (str, Path))
    resolved = expanduser(source) if is_path_like else source
    raw_reader = _read_excel(resolved)
    return ExcelReader(raw_reader)
# Names exported by `from fastexcel import *`, grouped by theme.
__all__ = (
# version
"__version__",
# main entrypoint
"read_excel",
# Python types
"DType",
"DTypeMap",
# Excel reader
"ExcelReader",
# Excel sheet
"ExcelSheet",
# Excel table
"ExcelTable",
# Column metadata
"DTypeFrom",
"ColumnNameFrom",
"ColumnInfo",
# Defined names
"DefinedName",
# Parse error information
"CellError",
"CellErrors",
# Exceptions
"FastExcelError",
"CannotRetrieveCellDataError",
"CalamineCellError",
"CalamineError",
"SheetNotFoundError",
"ColumnNotFoundError",
"ArrowError",
"InvalidParametersError",
"UnsupportedColumnTypeCombinationError",
)
================================================
FILE: python/fastexcel/_fastexcel.pyi
================================================
from __future__ import annotations
import typing
from collections.abc import Callable
from typing import TYPE_CHECKING, Literal
if TYPE_CHECKING:
import pyarrow as pa
# Names of the dtypes accepted by the `dtypes` parameters and returned by `ColumnInfo.dtype`
DType = Literal["null", "int", "float", "string", "boolean", "datetime", "date", "duration"]
# Mapping from a column name or a column index to a dtype
DTypeMap = dict[str | int, DType]
# Possible values of the `column_name_from` properties
ColumnNameFrom = Literal["provided", "looked_up", "generated"]
# Possible values of `ColumnInfo.dtype_from`
DTypeFrom = Literal["provided_for_all", "provided_by_index", "provided_by_name", "guessed"]
# Possible values of `_ExcelSheet.visible`
SheetVisible = Literal["visible", "hidden", "veryhidden"]
class ColumnInfoNoDtype:
"""Column metadata without dtype information.
Exposes the same `name`, `index`, `absolute_index` and `column_name_from` fields as
`ColumnInfo`, but none of its dtype fields. This is the type received by the callables
accepted as `use_columns`.
"""
# All constructor arguments are keyword-only
def __init__(
self,
*,
name: str,
index: int,
absolute_index: int,
column_name_from: ColumnNameFrom,
) -> None: ...
@property
def name(self) -> str: ...
# NOTE(review): presumably `index` is the position among the selected columns while
# `absolute_index` is the position in the sheet — confirm against the implementation
@property
def index(self) -> int: ...
@property
def absolute_index(self) -> int: ...
# One of "provided", "looked_up" or "generated"
@property
def column_name_from(self) -> ColumnNameFrom: ...
class ColumnInfo:
"""Column metadata, including dtype information.
Has the fields of `ColumnInfoNoDtype` plus `dtype` and `dtype_from`.
"""
# All constructor arguments are keyword-only
def __init__(
self,
*,
name: str,
index: int,
absolute_index: int,
column_name_from: ColumnNameFrom,
dtype: DType,
dtype_from: DTypeFrom,
) -> None: ...
@property
def name(self) -> str: ...
# NOTE(review): presumably `index` is the position among the selected columns while
# `absolute_index` is the position in the sheet — confirm against the implementation
@property
def index(self) -> int: ...
@property
def absolute_index(self) -> int: ...
@property
def dtype(self) -> DType: ...
# One of "provided", "looked_up" or "generated"
@property
def column_name_from(self) -> ColumnNameFrom: ...
# One of "provided_for_all", "provided_by_index", "provided_by_name" or "guessed"
@property
def dtype_from(self) -> DTypeFrom: ...
class DefinedName:
"""A defined name, made of a `name` and a `formula`, both strings.
Instances are returned by `_ExcelReader.defined_names`.
"""
# All constructor arguments are keyword-only
def __init__(
self,
*,
name: str,
formula: str,
) -> None: ...
@property
def name(self) -> str: ...
@property
def formula(self) -> str: ...
class CellError:
"""Information about a single cell error, as listed by `CellErrors.errors`."""
# NOTE(review): the order of the two coordinates (row/column) and the way `row_offset`
# relates `position` to `offset_position` are not visible from this stub — confirm
# against the implementation
@property
def position(self) -> tuple[int, int]: ...
@property
def row_offset(self) -> int: ...
@property
def offset_position(self) -> tuple[int, int]: ...
# Textual detail about the error
@property
def detail(self) -> str: ...
def __repr__(self) -> str: ...
class CellErrors:
"""Collection of `CellError`s, as returned by `_ExcelSheet.to_arrow_with_errors`."""
# The individual cell errors
@property
def errors(self) -> list[CellError]: ...
def __repr__(self) -> str: ...
class _ExcelSheet:
"""Low-level sheet object returned by `_ExcelReader.load_sheet` when `eager` is `False`"""
@property
def name(self) -> str:
"""The name of the sheet"""
@property
def width(self) -> int:
"""The sheet's width"""
@property
def height(self) -> int:
"""The sheet's height"""
@property
def total_height(self) -> int:
"""The sheet's total height"""
@property
def offset(self) -> int:
"""The sheet's offset before data starts"""
@property
def selected_columns(self) -> list[ColumnInfo]:
"""The sheet's selected columns"""
def available_columns(self) -> list[ColumnInfo]:
"""The columns available for the given sheet"""
@property
def specified_dtypes(self) -> DTypeMap | None:
"""The dtypes specified for the sheet"""
@property
def visible(self) -> SheetVisible:
"""The visibility of the sheet"""
def to_arrow(self) -> pa.RecordBatch:
"""Converts the sheet to a pyarrow `RecordBatch`
Requires the `pyarrow` extra to be installed.
"""
def to_arrow_with_errors(self) -> tuple[pa.RecordBatch, CellErrors]:
"""Converts the sheet to a pyarrow `RecordBatch` with error information.
Stores the positions of any values that cannot be parsed as the specified type and were
therefore converted to None.
Requires the `pyarrow` extra to be installed.
"""
def __arrow_c_schema__(self) -> object:
"""Export the schema as an `ArrowSchema` `PyCapsule`.
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export
The Arrow PyCapsule Interface enables zero-copy data exchange with
Arrow-compatible libraries without requiring PyArrow as a dependency.
"""
def __arrow_c_array__(self, requested_schema: object = None) -> tuple[object, object]:
"""Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.
The optional `requested_schema` parameter allows for potential schema conversion.
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export
The Arrow PyCapsule Interface enables zero-copy data exchange with
Arrow-compatible libraries without requiring PyArrow as a dependency.
"""
class _ExcelTable:
"""Low-level table object returned by `_ExcelReader.load_table` when `eager` is `False`"""
@property
def name(self) -> str:
"""The name of the table"""
@property
def sheet_name(self) -> str:
"""The name of the sheet this table belongs to"""
@property
def width(self) -> int:
"""The table's width"""
@property
def height(self) -> int:
"""The table's height"""
@property
def total_height(self) -> int:
"""The table's total height"""
@property
def offset(self) -> int:
"""The table's offset before data starts"""
@property
def selected_columns(self) -> list[ColumnInfo]:
"""The table's selected columns"""
def available_columns(self) -> list[ColumnInfo]:
"""The columns available for the given table"""
@property
def specified_dtypes(self) -> DTypeMap | None:
"""The dtypes specified for the table"""
def to_arrow(self) -> pa.RecordBatch:
"""Converts the table to a pyarrow `RecordBatch`
Requires the `pyarrow` extra to be installed.
"""
def __arrow_c_schema__(self) -> object:
"""Export the schema as an `ArrowSchema` `PyCapsule`.
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowschema-export
The Arrow PyCapsule Interface enables zero-copy data exchange with
Arrow-compatible libraries without requiring PyArrow as a dependency.
"""
def __arrow_c_array__(self, requested_schema: object = None) -> tuple[object, object]:
"""Export the schema and data as a pair of `ArrowSchema` and `ArrowArray` `PyCapsules`.
The optional `requested_schema` parameter allows for potential schema conversion.
https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export
The Arrow PyCapsule Interface enables zero-copy data exchange with
Arrow-compatible libraries without requiring PyArrow as a dependency.
"""
class _ExcelReader:
"""A class representing an open Excel file and allowing to read its sheets"""
@typing.overload
def load_sheet(
self,
idx_or_name: str | int,
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int | list[int] | Callable[[int], bool] | None = None,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[False] = ...,
skip_whitespace_tail_rows: bool = False,
whitespace_as_null: bool = False,
) -> _ExcelSheet: ...
@typing.overload
def load_sheet(
self,
idx_or_name: str | int,
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int | list[int] | Callable[[int], bool] | None = None,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[True] = ...,
skip_whitespace_tail_rows: bool = False,
whitespace_as_null: bool = False,
) -> pa.RecordBatch: ...
@typing.overload
def load_sheet(
self,
idx_or_name: str | int,
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int | list[int] | Callable[[int], bool] | None = None,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
eager: bool = False,
skip_whitespace_tail_rows: bool = False,
whitespace_as_null: bool = False,
) -> pa.RecordBatch: ...
@typing.overload
def load_table(
self,
name: str,
*,
header_row: int | None = None,
column_names: list[str] | None = None,
skip_rows: int | list[int] | Callable[[int], bool] | None = None,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[False] = ...,
skip_whitespace_tail_rows: bool = False,
whitespace_as_null: bool = False,
) -> _ExcelTable: ...
@typing.overload
def load_table(
self,
name: str,
*,
header_row: int | None = None,
column_names: list[str] | None = None,
skip_rows: int | list[int] | Callable[[int], bool] | None = None,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
dtype_coercion: Literal["coerce", "strict"] = "coerce",
use_columns: list[str]
| list[int]
| str
| Callable[[ColumnInfoNoDtype], bool]
| None = None,
dtypes: DType | DTypeMap | None = None,
eager: Literal[True] = ...,
skip_whitespace_tail_rows: bool = False,
whitespace_as_null: bool = False,
) -> pa.RecordBatch: ...
@property
def sheet_names(self) -> list[str]: ...
def table_names(self, sheet_name: str | None = None) -> list[str]: ...
def defined_names(self) -> list[DefinedName]: ...
def read_excel(source: str | bytes) -> _ExcelReader:
"""Reads an excel file and returns the low-level `_ExcelReader` for it
`source` is either a `str` or the file's content as `bytes`.
"""
__version__: str
# Exceptions
# `FastExcelError` is the common base class: every other exception declared below derives
# from it, so catching `FastExcelError` catches all of them.
class FastExcelError(Exception): ...
class UnsupportedColumnTypeCombinationError(FastExcelError): ...
class CannotRetrieveCellDataError(FastExcelError): ...
class CalamineCellError(FastExcelError): ...
class CalamineError(FastExcelError): ...
class SheetNotFoundError(FastExcelError): ...
class ColumnNotFoundError(FastExcelError): ...
class ArrowError(FastExcelError): ...
class InvalidParametersError(FastExcelError): ...
================================================
FILE: python/fastexcel/py.typed
================================================
================================================
FILE: python/tests/__init__.py
================================================
================================================
FILE: python/tests/benchmarks/README.md
================================================
# Benchmarks
These benchmarks were generated using `pytest-benchmark`.
> **_NOTE:_** `formulas.xlsx` was found [here](https://foss.heptapod.net/openpyxl/openpyxl/-/issues/494). `plain_data.xls` and `plain_data.xlsx` can be found [here](https://public.opendatasoft.com/explore/dataset/covid-19-pandemic-worldwide-data/export/?disjunctive.zone&disjunctive.category).
Using the following command:
```bash
make benchmarks
```
The results are from my local machine. This is not 100% accurate.
## Speed
### 'xls': 2 tests
|Name (time in ms)|Min|Max|Mean|StdDev|Median|IQR|Outliers|OPS|Rounds|Iterations|
|-----------------|---|---|----|------|------|---|-------|---|-------|----------|
|test_fastexcel_xls|27.0991 (1.0)|33.7495 (1.0)|29.5819 (1.0)|1.6429 (1.0)|29.3559 (1.0)|2.7158 (1.0)|10;0|33.8044 (1.0)|29|1|
|test_xlrd|596.5040 (22.01)|628.7964 (18.63)|612.5730 (20.71)|12.9967 (7.91)|615.1620 (20.96)|20.7911 (7.66)|2;0|1.6325 (0.05)|5|1|
### 'xlsx': 4 tests
|Name (time in ms)|Min|Max|Mean|StdDev|Median|IQR|Outliers|OPS|Rounds|Iterations|
|-----------------|---|---|----|------|------|---|--------|---|------|----------|
|test_fastexcel_xlsx|437.5810 (1.0)|470.7615 (1.0)|457.9611 (1.0)|13.7401 (1.0)|457.7006 (1.0)|21.0743 (1.25)|1;0|2.1836 (1.0)|5|1|
|test_fastexcel_with_formulas|3,106.7454 (7.10)|3,150.2050 (6.69)|3,122.5234 (6.82)|16.6031 (1.21)|3,120.9000 (6.82)|16.8614 (1.0)|1;0|0.3203 (0.15)|5|1|
|test_pyxl|4,780.2341 (10.92)|4,998.7753 (10.62)|4,899.6885 (10.70)|110.4665 (8.04)|4,948.7550 (10.81)|211.6149 (12.55)|2;0|0.2041 (0.09)|5|1|
|test_pyxl_with_formulas|25,312.8494 (57.85)|26,621.4687 (56.55)|25,808.5418 (56.36)|545.0540 (39.67)|25,748.0901 (56.26)|852.3171 (50.55)|1;0|0.0387 (0.02)|5|1|
## Memory usage
| fastexcel memory usage | other memory usage |
|-|-|
| ||
| ||
| ||
================================================
FILE: python/tests/benchmarks/fixtures/formulas.xlsx
================================================
[File too large to display: 46.5 MB]
================================================
FILE: python/tests/benchmarks/memory.py
================================================
import argparse
from enum import Enum
from .readers import fastexcel_read, pyxl_read, xlrd_read
class Engine(str, Enum):
    """Reader backends selectable from the command line.

    Members also subclass ``str``, so each one compares equal to its raw value
    (for example ``Engine.XLRD == "xlrd"``).
    """

    # fastexcel itself
    FASTEXCEL = "fastexcel"
    # xlrd
    XLRD = "xlrd"
    # openpyxl; note that the value is "pyxl"
    OPENPYXL = "pyxl"
def get_args() -> argparse.Namespace:
    """Parse the command-line arguments of the memory benchmark.

    Options:
        -e / --engine: reader to exercise; must be one of the ``Engine`` values
            and defaults to ``Engine.FASTEXCEL``.
        file: path of the workbook to read.

    Returns:
        The parsed namespace, exposing ``engine`` and ``file``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-e",
        "--engine",
        default=Engine.FASTEXCEL,
        # Reject unknown engines up front; otherwise `main` would silently do nothing.
        choices=[engine.value for engine in Engine],
        help="reader implementation to benchmark (default: %(default)s)",
    )
    parser.add_argument("file", help="path of the workbook to read")
    return parser.parse_args()
def main():
    """Read the workbook named on the command line with the selected engine."""
    cli = get_args()
    selected = cli.engine
    workbook_path = cli.file

    # One guard clause per engine; an unrecognised engine falls through without reading.
    if selected == Engine.FASTEXCEL:
        fastexcel_read(workbook_path)
        return
    if selected == Engine.XLRD:
        xlrd_read(workbook_path)
        return
    if selected == Engine.OPENPYXL:
        pyxl_read(workbook_path)
if __name__ == "__main__":
main()
================================================
FILE: python/tests/benchmarks/readers.py
================================================
from fastexcel import read_excel
from openpyxl import load_workbook
from xlrd import open_workbook
def pyxl_read(test_file_path: str):
    """Visit every cell value of every worksheet using openpyxl.

    The workbook is opened in read-only, data-only mode and each value is
    iterated over without being stored.
    """
    wb = load_workbook(test_file_path, read_only=True, keep_links=False, data_only=True)
    try:
        for ws in wb:
            # `ws.values` already yields the rows' values; the former
            # `ws.iter_rows()` assignment was overwritten before use.
            for row in ws.values:
                for _ in row:
                    pass
    finally:
        # A read-only workbook keeps its file handle open until closed explicitly.
        wb.close()
def xlrd_read(test_file_path: str):
    """Visit every cell value of every sheet using xlrd."""
    book = open_workbook(test_file_path)
    for worksheet in book.sheets():
        # Rows are fetched one index at a time, up to the sheet's row count.
        for row_index in range(worksheet.nrows):
            row_values = worksheet.row_values(row_index)
            for _ in row_values:
                pass
def fastexcel_read(test_file_path: str):
    """Load every sheet of the workbook with fastexcel and convert it to Arrow."""
    workbook = read_excel(test_file_path)
    for name in workbook.sheet_names:
        loaded = workbook.load_sheet_by_name(name)
        # The conversion result is discarded.
        loaded.to_arrow()
================================================
FILE: python/tests/benchmarks/speed.py
================================================
"""
Compare read performance with fastexcel, xlrd and different openpyxl options
"""
import pytest
from .readers import fastexcel_read, pyxl_read, xlrd_read
@pytest.fixture
def plain_data_xls():
    """Path of the plain-data ``.xls`` benchmark workbook."""
    fixture_path = "./python/tests/benchmarks/fixtures/plain_data.xls"
    return fixture_path
@pytest.fixture
def plain_data_xlsx():
    """Path of the plain-data ``.xlsx`` benchmark workbook."""
    fixture_path = "./python/tests/benchmarks/fixtures/plain_data.xlsx"
    return fixture_path
@pytest.fixture
def formula_xlsx():
    """Path of the formula-heavy ``.xlsx`` benchmark workbook."""
    fixture_path = "./python/tests/benchmarks/fixtures/formulas.xlsx"
    return fixture_path
@pytest.mark.benchmark(group="xlsx")
def test_pyxl(benchmark, plain_data_xlsx):
    """Benchmark ``pyxl_read`` on the plain-data xlsx workbook."""
    reader, workbook_path = pyxl_read, plain_data_xlsx
    benchmark(reader, workbook_path)
@pytest.mark.benchmark(group="xls")
def test_xlrd(benchmark, plain_data_xls):
    """Benchmark ``xlrd_read`` on the plain-data xls workbook."""
    reader, workbook_path = xlrd_read, plain_data_xls
    benchmark(reader, workbook_path)
@pytest.mark.benchmark(group="xls")
def test_fastexcel_xls(benchmark, plain_data_xls):
    """Benchmark ``fastexcel_read`` on the plain-data xls workbook."""
    reader, workbook_path = fastexcel_read, plain_data_xls
    benchmark(reader, workbook_path)
@pytest.mark.benchmark(group="xlsx")
def test_fastexcel_xlsx(benchmark, plain_data_xlsx):
    """Benchmark ``fastexcel_read`` on the plain-data xlsx workbook."""
    reader, workbook_path = fastexcel_read, plain_data_xlsx
    benchmark(reader, workbook_path)
@pytest.mark.benchmark(group="xlsx")
def test_pyxl_with_formulas(benchmark, formula_xlsx):
    """Benchmark ``pyxl_read`` on the formula workbook."""
    reader, workbook_path = pyxl_read, formula_xlsx
    benchmark(reader, workbook_path)
@pytest.mark.benchmark(group="xlsx")
def test_fastexcel_with_formulas(benchmark, formula_xlsx):
    """Benchmark ``fastexcel_read`` on the formula workbook."""
    reader, workbook_path = fastexcel_read, formula_xlsx
    benchmark(reader, workbook_path)
================================================
FILE: python/tests/conftest.py
================================================
from __future__ import annotations
from datetime import datetime
from typing import Any
import pytest
@pytest.fixture
def expected_data_sheet_null_strings() -> dict[str, list[Any]]:
    """Expected content of the null-strings sheet, keyed by column name.

    Each column holds ten values. Key order is significant: tests derive column
    selections from it.
    """
    first_label = [float(n) for n in range(1, 11)]
    second_label = [letter * 2 for letter in "ABCDEFGHIJ"]
    dates_and_nulls = [
        None,
        None,
        None,
        datetime(2022, 12, 19),
        datetime(2022, 8, 26),
        datetime(2023, 5, 6),
        datetime(2023, 3, 20),
        datetime(2022, 8, 29),
        None,
        None,
    ]
    timestamps_and_nulls = [
        None,
        None,
        datetime(2023, 2, 18, 6, 13, 56, 730000),
        datetime(2022, 9, 20, 20, 0, 7, 50000),
        datetime(2022, 9, 24, 17, 4, 31, 236000),
        None,
        None,
        None,
        datetime(2022, 9, 14, 1, 50, 58, 390000),
        datetime(2022, 10, 21, 17, 20, 12, 223000),
    ]
    ints_and_nulls = [2076.0, 2285.0, 39323.0, None, None, None, 11953.0, None, 30192.0, None]
    floats_and_nulls = [
        141.02023312814603,
        778.0655928608671,
        None,
        497.60307287584106,
        627.446112513911,
        None,
        None,
        None,
        488.3509486743364,
        None,
    ]
    return {
        "FIRST_LABEL": first_label,
        "SECOND_LABEL": second_label,
        "DATES_AND_NULLS": dates_and_nulls,
        "TIMESTAMPS_AND_NULLS": timestamps_and_nulls,
        "INTS_AND_NULLS": ints_and_nulls,
        "FLOATS_AND_NULLS": floats_and_nulls,
    }
================================================
FILE: python/tests/test_alias_generation.py
================================================
from __future__ import annotations
import fastexcel
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from .utils import path_for_fixture
@pytest.mark.parametrize(
    "use_columns", [None, [0, 1, 2], ["col", "col_1", "col_2"], [0, "col_1", 2]]
)
def test_alias_generation_with_use_columns(use_columns: list[str] | list[int] | None) -> None:
    """Duplicated headers get ``_1``/``_2`` aliases, whichever way the columns are selected."""
    fixture_path = path_for_fixture("fixture-single-sheet-duplicated-columns.xlsx")
    excel_reader = fastexcel.read_excel(fixture_path)
    sheet = excel_reader.load_sheet(0, use_columns=use_columns)

    available_names = [col.name for col in sheet.available_columns()]
    assert available_names == ["col", "col_1", "col_2"]

    # pandas
    actual_pd = sheet.to_pandas()
    timestamps = pd.Series(
        [pd.Timestamp("2019-02-01 00:01:02"), pd.Timestamp("2014-01-02 06:01:02")]
    ).astype("datetime64[ms]")
    expected_pd = pd.DataFrame(
        {"col": [1.0, 2.0], "col_1": [2019.0, 2020.0], "col_2": timestamps}
    )
    pd_assert_frame_equal(actual_pd, expected_pd)

    # polars: the datetimes are parsed from strings, then cast to millisecond precision
    actual_pl = sheet.to_polars()
    parse_col_2 = pl.col("col_2").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    expected_pl = pl.DataFrame(
        {
            "col": [1.0, 2.0],
            "col_1": [2019.0, 2020.0],
            "col_2": ["2019-02-01 00:01:02", "2014-01-02 06:01:02"],
        }
    ).with_columns(parse_col_2)
    pl_assert_frame_equal(actual_pl, expected_pl)
================================================
FILE: python/tests/test_column_selection.py
================================================
# ruff: noqa: E501
from __future__ import annotations
import re
from typing import Any
import fastexcel
import numpy as np
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from .utils import path_for_fixture
@pytest.fixture
def excel_reader_single_sheet() -> fastexcel.ExcelReader:
    """Reader opened on ``fixture-single-sheet.xlsx``."""
    workbook_path = path_for_fixture("fixture-single-sheet.xlsx")
    return fastexcel.read_excel(workbook_path)
@pytest.fixture
def expected_column_info() -> list[fastexcel.ColumnInfo]:
    """Column metadata expected for the single-sheet fixture: ``Month`` then ``Year``.

    Both columns are float columns whose name was looked up and whose dtype was guessed.
    """
    return [
        fastexcel.ColumnInfo(
            name=column_name,
            index=position,
            absolute_index=position,
            column_name_from="looked_up",
            dtype="float",
            dtype_from="guessed",
        )
        for position, column_name in enumerate(("Month", "Year"))
    ]
def test_single_sheet_all_columns(
    excel_reader_single_sheet: fastexcel.ExcelReader,
    expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """Omitting ``use_columns`` and passing ``use_columns=None`` both load every column."""
    implicit_sheet = excel_reader_single_sheet.load_sheet(0)
    explicit_sheet = excel_reader_single_sheet.load_sheet(0, use_columns=None)

    assert implicit_sheet.selected_columns == expected_column_info
    assert implicit_sheet.available_columns() == expected_column_info

    data = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}
    want_pd = pd.DataFrame(data)
    want_pl = pl.DataFrame(data)

    # pandas conversions first, then polars, each for both ways of loading
    for loaded in (implicit_sheet, explicit_sheet):
        pd_assert_frame_equal(loaded.to_pandas(), want_pd)
    for loaded in (implicit_sheet, explicit_sheet):
        pl_assert_frame_equal(loaded.to_polars(), want_pl)
def test_single_sheet_subset_by_str(
    excel_reader_single_sheet: fastexcel.ExcelReader,
    expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """A single column can be selected by name, with the sheet given by index or by name."""
    data = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}
    # The explicit annotation is kept for mypy (needed since mypy 1.8, per the original note)
    sheet_refs: list[str | int] = [0, "January"]
    for sheet_ref in sheet_refs:
        for position, column in enumerate(["Month", "Year"]):
            sheet = excel_reader_single_sheet.load_sheet(sheet_ref, use_columns=[column])
            assert sheet.selected_columns == [expected_column_info[position]]
            assert sheet.available_columns() == expected_column_info

            actual_pd = sheet.to_pandas()
            pd_assert_frame_equal(actual_pd, pd.DataFrame({column: data[column]}))
            actual_pl = sheet.to_polars()
            pl_assert_frame_equal(actual_pl, pl.DataFrame({column: data[column]}))
def test_single_sheet_subset_by_index(
    excel_reader_single_sheet: fastexcel.ExcelReader,
    expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """A single column can be selected by index, with the sheet given by index or by name."""
    data = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}
    sheet_refs: list[str | int] = [0, "January"]
    for sheet_ref in sheet_refs:
        for position, column in enumerate(["Month", "Year"]):
            sheet = excel_reader_single_sheet.load_sheet(sheet_ref, use_columns=[position])
            assert sheet.selected_columns == [expected_column_info[position]]
            assert sheet.available_columns() == expected_column_info

            actual_pd = sheet.to_pandas()
            pd_assert_frame_equal(actual_pd, pd.DataFrame({column: data[column]}))
            actual_pl = sheet.to_polars()
            pl_assert_frame_equal(actual_pl, pl.DataFrame({column: data[column]}))
@pytest.fixture
def excel_reader_single_sheet_with_unnamed_columns() -> fastexcel.ExcelReader:
    """Reader opened on ``fixture-multi-sheet.xlsx``."""
    workbook_path = path_for_fixture("fixture-multi-sheet.xlsx")
    return fastexcel.read_excel(workbook_path)
@pytest.fixture
def single_sheet_with_unnamed_columns_expected() -> dict[str, list[Any]]:
    """Expected data for a sheet with unnamed columns, keyed by column name in column order."""
    expected: dict[str, list[Any]] = {}
    expected["col1"] = [2.0, 3.0]
    expected["__UNNAMED__1"] = [1.5, 2.5]
    expected["col3"] = ["hello", "world"]
    expected["__UNNAMED__3"] = [-5.0, -6.0]
    expected["col5"] = ["a", "b"]
    return expected
@pytest.fixture
def sheet_with_unnamed_columns_expected_column_info() -> list[fastexcel.ColumnInfo]:
    """Expected metadata for the five columns of a sheet with unnamed columns.

    ``__UNNAMED__*`` names are marked ``"generated"``, the others ``"looked_up"``;
    every dtype is ``"guessed"``.
    """
    # fmt: off
    col1 = fastexcel.ColumnInfo(name="col1", index=0, absolute_index=0, dtype="float", dtype_from="guessed", column_name_from="looked_up")
    unnamed_1 = fastexcel.ColumnInfo(name="__UNNAMED__1", index=1, absolute_index=1, dtype="float", dtype_from="guessed", column_name_from="generated")
    col3 = fastexcel.ColumnInfo(name="col3", index=2, absolute_index=2, dtype="string", dtype_from="guessed", column_name_from="looked_up")
    unnamed_3 = fastexcel.ColumnInfo(name="__UNNAMED__3", index=3, absolute_index=3, dtype="float", dtype_from="guessed", column_name_from="generated")
    col5 = fastexcel.ColumnInfo(name="col5", index=4, absolute_index=4, dtype="string", dtype_from="guessed", column_name_from="looked_up")
    # fmt: on
    return [col1, unnamed_1, col3, unnamed_3, col5]
def test_single_sheet_with_unnamed_columns(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """Selecting named and unnamed columns by name or by index gives the same result."""
    by_name = ["col1", "col3", "__UNNAMED__3"]
    by_index = [0, 2, 3]
    all_columns = sheet_with_unnamed_columns_expected_column_info
    expected_selection = [all_columns[0], all_columns[2], all_columns[3]]
    expected_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in by_name
    }

    # Name-based selection is checked first, then index-based selection.
    selections: list[list[str] | list[int]] = [by_name, by_index]
    for use_columns in selections:
        sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
            "With unnamed columns", use_columns=use_columns
        )
        assert sheet.selected_columns == expected_selection
        assert sheet.available_columns() == all_columns
        pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_data))
        pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_data))
def test_single_sheet_with_unnamed_columns_and_pagination(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """``n_rows`` and ``skip_rows`` combine with column selection by name or by index."""
    by_name = ["col1", "col3", "__UNNAMED__3"]
    by_index = [0, 2, 3]
    selections: list[list[str] | list[int]] = [by_name, by_index]

    def check_page(expected: dict[str, list[Any]], **paging: Any) -> None:
        # Load the sheet once per selection style and compare against `expected`.
        for use_columns in selections:
            sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
                "With unnamed columns", use_columns=use_columns, **paging
            )
            assert sheet.available_columns() == sheet_with_unnamed_columns_expected_column_info
            pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
            pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))

    selected_data = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in by_name
    }

    # first row only
    check_page({name: values[:1] for name, values in selected_data.items()}, n_rows=1)
    # second row
    check_page({name: values[1:] for name, values in selected_data.items()}, skip_rows=1)
def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """With ``column_names``, ``use_columns`` must hold integers; ``skip_rows`` still applies."""
    name_selection = ["col0", "col2", "col3"]
    index_selection = [0, 2, 3]
    provided_names = [f"col{i}" for i in range(3)]
    all_column_names = ["col0", "__UNNAMED__1", "col1", "col2", "__UNNAMED__4"]
    full_expected: dict[str, list[Any]] = {
        "col0": [2.0, 3.0],
        "col1": ["hello", "world"],
        "col2": [-5.0, -6.0],
    }
    reader = excel_reader_single_sheet_with_unnamed_columns

    # skipping the header row only: selecting by name is rejected
    with pytest.raises(
        fastexcel.InvalidParametersError,
        match='use_columns can only contain integers when used with columns_names, got "col0"',
    ):
        reader.load_sheet(
            "With unnamed columns",
            use_columns=name_selection,
            skip_rows=1,
            column_names=provided_names,
        )

    # (rows to skip, data rows dropped): the header row only, then the header row
    # plus the first data row
    for rows_to_skip, dropped_rows in ((1, 0), (2, 1)):
        sheet = reader.load_sheet(
            "With unnamed columns",
            use_columns=index_selection,
            skip_rows=rows_to_skip,
            column_names=provided_names,
        )
        assert [col.name for col in sheet.available_columns()] == all_column_names
        expected = {name: values[dropped_rows:] for name, values in full_expected.items()}
        pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
        pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
def test_single_sheet_with_unnamed_columns_and_str_range(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """``"A,C:E"`` selects column A plus the closed range C to E."""
    column_spec = "A,C:E"
    kept_names = ["col1", "col3", "__UNNAMED__3", "col5"]
    all_columns = sheet_with_unnamed_columns_expected_column_info
    expected = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in kept_names
    }

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns=column_spec
    )

    assert sheet.selected_columns == [all_columns[0], *all_columns[2:]]
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
def test_single_sheet_with_unnamed_columns_and_open_ended_range(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """``"B:"`` selects every column from B onwards (B, C, D, E - indices 1 to 4)."""
    column_spec = "B:"
    kept_names = ["__UNNAMED__1", "col3", "__UNNAMED__3", "col5"]
    all_columns = sheet_with_unnamed_columns_expected_column_info
    expected = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in kept_names
    }

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns=column_spec
    )

    assert sheet.selected_columns == all_columns[1:]
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
def test_single_sheet_with_unnamed_columns_and_open_ended_range_from_start(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """``"A:"`` selects every column."""
    column_spec = "A:"
    all_columns = sheet_with_unnamed_columns_expected_column_info
    # Nothing is filtered out, so the whole fixture is the expected data.
    expected = single_sheet_with_unnamed_columns_expected

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns=column_spec
    )

    assert sheet.selected_columns == all_columns
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
def test_single_sheet_with_unnamed_columns_and_mixed_open_ended_range(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """``"A,C:"`` selects column A plus everything from C onwards (indices 0, 2, 3, 4)."""
    column_spec = "A,C:"
    kept_names = ["col1", "col3", "__UNNAMED__3", "col5"]
    all_columns = sheet_with_unnamed_columns_expected_column_info
    expected = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in kept_names
    }

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns=column_spec
    )

    assert sheet.selected_columns == all_columns[:1] + all_columns[2:]
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
def test_single_sheet_with_unnamed_columns_and_from_beginning_range(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """``":C"`` selects columns A, B and C (indices 0, 1, 2)."""
    column_spec = ":C"
    kept_names = ["col1", "__UNNAMED__1", "col3"]
    all_columns = sheet_with_unnamed_columns_expected_column_info
    expected = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in kept_names
    }

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns=column_spec
    )

    assert sheet.selected_columns == all_columns[0:3]
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
def test_single_sheet_with_unnamed_columns_and_from_beginning_range_single_column(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """``":A"`` selects column A only (index 0)."""
    column_spec = ":A"
    kept_names = ["col1"]
    all_columns = sheet_with_unnamed_columns_expected_column_info
    expected = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in kept_names
    }

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns=column_spec
    )

    assert sheet.selected_columns == all_columns[:1]
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
def test_single_sheet_with_unnamed_columns_and_complex_mixed_pattern(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
    single_sheet_with_unnamed_columns_expected: dict[str, list[Any]],
    sheet_with_unnamed_columns_expected_column_info: list[fastexcel.ColumnInfo],
) -> None:
    """``"A,:B,D,E:"`` resolves to columns A, B, D and E after deduplication.

    ``A`` appears twice (on its own and through ``:B``) but is selected only once.
    """
    column_spec = "A,:B,D,E:"
    kept_names = ["col1", "__UNNAMED__1", "__UNNAMED__3", "col5"]
    all_columns = sheet_with_unnamed_columns_expected_column_info
    expected = {
        name: values
        for name, values in single_sheet_with_unnamed_columns_expected.items()
        if name in kept_names
    }

    sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
        "With unnamed columns", use_columns=column_spec
    )

    # Indices of columns A, B, D and E
    expected_selection = [all_columns[position] for position in (0, 1, 3, 4)]
    assert sheet.selected_columns == expected_selection
    assert sheet.available_columns() == all_columns
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
def test_single_sheet_invalid_column_indices_negative_integer(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """A negative column index is rejected with ``InvalidParametersError``."""
    error_text = """invalid parameters: expected list[int] | list[str], got [-2]
Context:
0: could not determine selected columns from provided object: [-2]
1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([-2])
"""
    # The message is matched literally, hence the escaping.
    literal_pattern = re.escape(error_text)
    with pytest.raises(fastexcel.InvalidParametersError, match=literal_pattern):
        excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[-2])
def test_single_sheet_invalid_column_indices_empty_list(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """An empty ``use_columns`` list is rejected with ``InvalidParametersError``."""
    error_text = """invalid parameters: list of selected columns is empty
Context:
0: could not determine selected columns from provided object: []
1: expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got Some([])
"""
    # The message is matched literally, hence the escaping.
    literal_pattern = re.escape(error_text)
    with pytest.raises(fastexcel.InvalidParametersError, match=literal_pattern):
        excel_reader_single_sheet_with_unnamed_columns.load_sheet(0, use_columns=[])
def test_single_sheet_invalid_column_indices_column_does_not_exist_str(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """Selecting an unknown column name raises ``ColumnNotFoundError``."""
    # Used as a regular expression: `.*` stands for the list of available columns.
    error_pattern = """column with name \"nope\" not found
Context:
0: available columns are: .*
"""
    reader = excel_reader_single_sheet_with_unnamed_columns
    with pytest.raises(fastexcel.ColumnNotFoundError, match=error_pattern):
        reader.load_sheet(0, use_columns=["nope"])
def test_single_sheet_invalid_column_indices_column_does_not_exist_int(
    excel_reader_single_sheet_with_unnamed_columns: fastexcel.ExcelReader,
) -> None:
    """Selecting an out-of-range column index raises ``ColumnNotFoundError``."""
    # Used as a regular expression: `.*` stands for the list of available columns.
    error_pattern = """column at index 42 not found
Context:
0: available columns are: .*
"""
    reader = excel_reader_single_sheet_with_unnamed_columns
    with pytest.raises(fastexcel.ColumnNotFoundError, match=error_pattern):
        reader.load_sheet(0, use_columns=[42])
def test_use_columns_with_column_names() -> None:
    """Provided ``column_names`` are assigned to the columns picked by ``use_columns``.

    Columns that are not selected keep a generated ``__UNNAMED__<index>`` name.
    """
    workbook_path = path_for_fixture("fixture-single-sheet-with-types.xlsx")
    excel_reader = fastexcel.read_excel(workbook_path)
    sheet = excel_reader.load_sheet(
        0,
        header_row=None,
        skip_rows=1,
        use_columns=[1, 2],
        column_names=["bools_renamed", "dates_renamed"],
    )

    # fmt: off
    expected_columns = [
        fastexcel.ColumnInfo(name="__UNNAMED__0", index=0, absolute_index=0, dtype="float", dtype_from="guessed", column_name_from="generated"),
        fastexcel.ColumnInfo(name="bools_renamed", index=1, absolute_index=1, dtype="boolean", dtype_from="guessed", column_name_from="provided"),
        fastexcel.ColumnInfo(name="dates_renamed", index=2, absolute_index=2, dtype="datetime", dtype_from="guessed", column_name_from="provided"),
        fastexcel.ColumnInfo(name="__UNNAMED__3", index=3, absolute_index=3, dtype="float", dtype_from="guessed", column_name_from="generated"),
    ]
    # fmt: on
    assert sheet.available_columns() == expected_columns

    bools = [True, False, True]

    # pandas
    actual_pd = sheet.to_pandas()
    dates_pd = pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype("datetime64[ms]")
    expected_pd = pd.DataFrame({"bools_renamed": bools, "dates_renamed": dates_pd})
    pd_assert_frame_equal(actual_pd, expected_pd)

    # polars: the datetimes are parsed from strings, then cast to millisecond precision
    actual_pl = sheet.to_polars()
    parse_dates = (
        pl.col("dates_renamed").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    )
    expected_pl = pl.DataFrame(
        {"bools_renamed": bools, "dates_renamed": ["2022-03-02 05:43:04"] * 3}
    ).with_columns(parse_dates)
    pl_assert_frame_equal(actual_pl, expected_pl)
def test_use_columns_with_callable() -> None:
    """``use_columns`` accepts a predicate evaluated on each column's info."""
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

    # Without a filter, the available and the selected columns are the same.
    sheet = excel_reader.load_sheet(2)
    available = [(c.name, c.dtype) for c in sheet.available_columns()]
    selected = [(c.name, c.dtype) for c in sheet.selected_columns]
    assert available == selected
    assert selected == [
        ("col1", "float"),
        ("__UNNAMED__1", "float"),
        ("col3", "string"),
        ("__UNNAMED__3", "float"),
        ("col5", "string"),
    ]

    # Filter on the column name
    sheet = excel_reader.load_sheet(2, use_columns=lambda col: col.name.startswith("col"))
    selected = [(c.name, c.dtype) for c in sheet.selected_columns]
    assert selected == [("col1", "float"), ("col3", "string"), ("col5", "string")]

    # Filter on the column index
    sheet = excel_reader.load_sheet(2, use_columns=lambda col: col.index % 2 == 1)
    selected = [(c.name, c.dtype) for c in sheet.selected_columns]
    assert selected == [("__UNNAMED__1", "float"), ("__UNNAMED__3", "float")]
def test_use_columns_with_bad_callable() -> None:
    """A predicate with the wrong arity or a non-boolean result is rejected."""
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

    # The callable takes no argument, so calling it with a column fails.
    not_callable_pattern = re.escape("`use_columns` callable could not be called (TypeError: ")
    with pytest.raises(fastexcel.InvalidParametersError, match=not_callable_pattern):
        excel_reader.load_sheet(
            2,
            use_columns=lambda: True,  # type: ignore
        )

    # The callable returns an integer instead of a boolean.
    not_boolean_pattern = "`use_columns` callable should return a boolean"
    with pytest.raises(fastexcel.InvalidParametersError, match=not_boolean_pattern):
        excel_reader.load_sheet(
            2,
            use_columns=lambda _: 42,  # type: ignore
        )
def test_use_columns_with_eager_loading() -> None:
    """Eager loading honours ``use_columns``, including the requested column order."""
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    months = [1.0, 2.0]
    years = [2019.0, 2020.0]

    # default: both columns, in sheet order
    batch = excel_reader.load_sheet_eager(0)
    assert batch.schema.names == ["Month", "Year"]
    assert batch["Year"].tolist() == years
    assert batch["Month"].tolist() == months

    # changing order
    batch = excel_reader.load_sheet_eager(0, use_columns=["Year", "Month"])
    assert batch.schema.names == ["Year", "Month"]
    assert batch["Year"].tolist() == years
    assert batch["Month"].tolist() == months

    # subset
    batch = excel_reader.load_sheet_eager(0, use_columns=["Year"])
    assert batch.schema.names == ["Year"]
    assert batch["Year"].tolist() == years
    assert "Month" not in (field.name for field in batch.schema)
@pytest.mark.parametrize("excel_file", ["sheet-null-strings.xlsx", "sheet-null-strings-empty.xlsx"])
def test_use_columns_dtypes_eager_loading(
    excel_file: str, expected_data_sheet_null_strings: dict[str, list[Any]]
) -> None:
    """Eager and lazy loading agree on data, dtypes and column order for several selections."""
    datetime_columns = ("DATES_AND_NULLS", "TIMESTAMPS_AND_NULLS")

    # Expected frames, with the datetime columns at millisecond precision
    expected_pl_df = pl.DataFrame(expected_data_sheet_null_strings).with_columns(
        *(pl.col(name).dt.cast_time_unit("ms") for name in datetime_columns)
    )
    expected_pd_df = pd.DataFrame(expected_data_sheet_null_strings)
    for name in datetime_columns:
        expected_pd_df[name] = expected_pd_df[name].dt.as_unit("ms")

    column_names = list(expected_data_sheet_null_strings)
    reversed_names = column_names[::-1]
    # All, odd positions and even positions, in sheet order and then in reverse order
    selections = (
        column_names,
        column_names[1::2],
        column_names[::2],
        reversed_names,
        reversed_names[1::2],
        reversed_names[::2],
    )

    for use_columns in selections:
        excel_reader = fastexcel.read_excel(path_for_fixture(excel_file))

        eager_batch = excel_reader.load_sheet_eager(0, use_columns=use_columns)
        pd_df = eager_batch.to_pandas()
        pl_df = pl.from_arrow(data=eager_batch)
        assert isinstance(pl_df, pl.DataFrame)

        lazy_sheet = excel_reader.load_sheet(0, use_columns=use_columns)
        pl_df_lazy = lazy_sheet.to_polars()
        pd_df_lazy = lazy_sheet.to_pandas()

        # lazy vs eager
        pl_assert_frame_equal(pl_df_lazy, pl_df)
        pd_assert_frame_equal(pd_df_lazy, pd_df)

        # eager vs expected
        pl_assert_frame_equal(expected_pl_df.select(use_columns), pl_df)
        pd_assert_frame_equal(expected_pd_df[use_columns], pd_df)

        # column order follows the selection
        assert pd_df.columns.to_list() == use_columns
        assert pl_df.columns == use_columns
def test_use_columns_with_table() -> None:
    """Load the ``users`` table restricted to two columns selected by name.

    Checks the metadata of every available column, the metadata of the selected columns, and
    the resulting polars and pandas frames.
    """
    reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
    table = reader.load_table("users", use_columns=["User Id", "FirstName"])

    user_id = fastexcel.ColumnInfo(
        name="User Id",
        index=0,
        absolute_index=0,
        dtype="float",
        column_name_from="provided",
        dtype_from="guessed",
    )
    first_name = fastexcel.ColumnInfo(
        name="FirstName",
        index=1,
        absolute_index=1,
        dtype="string",
        column_name_from="provided",
        dtype_from="guessed",
    )
    unnamed_2 = fastexcel.ColumnInfo(
        name="__UNNAMED__2",
        index=2,
        absolute_index=2,
        dtype="string",
        column_name_from="generated",
        dtype_from="guessed",
    )
    unnamed_3 = fastexcel.ColumnInfo(
        name="__UNNAMED__3",
        index=3,
        absolute_index=3,
        dtype="datetime",
        column_name_from="generated",
        dtype_from="guessed",
    )

    assert table.available_columns() == [user_id, first_name, unnamed_2, unnamed_3]
    assert table.selected_columns == [user_id, first_name]

    # The same expected contents are used for both dataframe libraries
    expected_rows = {"User Id": [1.0, 2.0, 5.0], "FirstName": ["Peter", "John", "Hans"]}
    pl_assert_frame_equal(table.to_polars(), pl.DataFrame(expected_rows))
    pd_assert_frame_equal(table.to_pandas(), pd.DataFrame(expected_rows))
def test_use_columns_with_table_and_provided_columns() -> None:
    """Load the ``users`` table selecting columns by index while providing their names.

    The two provided names are expected on the selected columns (indices 0 and 2); the other
    columns are expected to carry generated names.
    """
    reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
    table = reader.load_table("users", use_columns=[0, 2], column_names=["user_id", "last_name"])

    user_id = fastexcel.ColumnInfo(
        name="user_id",
        index=0,
        absolute_index=0,
        dtype="float",
        column_name_from="provided",
        dtype_from="guessed",
    )
    unnamed_1 = fastexcel.ColumnInfo(
        name="__UNNAMED__1",
        index=1,
        absolute_index=1,
        dtype="string",
        column_name_from="generated",
        dtype_from="guessed",
    )
    last_name = fastexcel.ColumnInfo(
        name="last_name",
        index=2,
        absolute_index=2,
        dtype="string",
        column_name_from="provided",
        dtype_from="guessed",
    )
    unnamed_3 = fastexcel.ColumnInfo(
        name="__UNNAMED__3",
        index=3,
        absolute_index=3,
        dtype="datetime",
        column_name_from="generated",
        dtype_from="guessed",
    )

    assert table.available_columns() == [user_id, unnamed_1, last_name, unnamed_3]
    assert table.selected_columns == [user_id, last_name]

    # The same expected contents are used for both dataframe libraries
    expected_rows = {"user_id": [1.0, 2.0, 5.0], "last_name": ["Müller", "Meier", "Fricker"]}
    pl_assert_frame_equal(table.to_polars(), pl.DataFrame(expected_rows))
    pd_assert_frame_equal(table.to_pandas(), pd.DataFrame(expected_rows))
def test_use_column_range_with_offset_without_table() -> None:
    """Select the ``H:I`` column range of the ``without-table`` sheet, header on row 9."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
    sheet = reader.load_sheet("without-table", use_columns="H:I", header_row=9)

    expected_rows = {
        "Column at H10": [1.0, 2.0, 3.0],
        "Column at I10": [4.0, 5.0, 6.0],
    }

    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_rows))
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_rows))
def test_use_column_range_with_offset_with_table() -> None:
    """Select the ``D:E`` column range of the ``with-table`` sheet, header on row 4."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
    sheet = reader.load_sheet("with-table", use_columns="D:E", header_row=4)

    expected_rows = {
        "Column at D5": [1.0, 2.0, 3.0, 4.0],
        "Column at E5": [4.0, 5.0, 6.0, 8.0],
    }

    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_rows))
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_rows))
def test_use_column_names_with_offset_table_by_index_and_name() -> None:
    """Mix name-based and index-based selection on a table that does not start at column A.

    Indices passed in ``use_columns`` are absolute (sheet-wide), so they must resolve to the
    right table column even though the table itself is offset.
    """
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))

    # "Column at D5" sits at table index 0 / absolute index 3,
    # while the integer 4 is the absolute index of column E
    table = reader.load_table("TableAtD5", use_columns=["Column at D5", 4])  # type:ignore[arg-type]

    column_d = fastexcel.ColumnInfo(
        name="Column at D5",
        index=0,
        absolute_index=3,
        dtype="float",
        column_name_from="provided",
        dtype_from="guessed",
    )
    column_e = fastexcel.ColumnInfo(
        name="Column at E5",
        index=1,
        absolute_index=4,
        dtype="float",
        column_name_from="provided",
        dtype_from="guessed",
    )
    assert table.selected_columns == [column_d, column_e]

    expected_rows = {
        "Column at D5": [1.0, 2.0, 3.0, 4.0],
        "Column at E5": [4.0, 5.0, 6.0, 8.0],
    }
    pl_assert_frame_equal(table.to_polars(), pl.DataFrame(expected_rows))
    pd_assert_frame_equal(table.to_pandas(), pd.DataFrame(expected_rows))
def test_use_column_range_with_offset_with_table_and_specified_dtypes() -> None:
    """Closed (``D:E``) and open-ended (``D:``) ranges on an offset table with explicit dtypes.

    One dtype is given by absolute index and one by column name; both range forms must give
    the same column metadata and the same data.
    """
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
    table_closed = reader.load_table(
        "TableAtD5", use_columns="D:E", dtypes={3: "int", "Column at E5": "string"}
    )
    table_open_ended = reader.load_table(
        "TableAtD5", use_columns="D:", dtypes={3: "int", "Column at E5": "string"}
    )
    tables = (table_closed, table_open_ended)

    expected_column_info = [
        fastexcel.ColumnInfo(
            name="Column at D5",
            index=0,
            absolute_index=3,
            dtype="int",
            dtype_from="provided_by_index",
            column_name_from="provided",
        ),
        fastexcel.ColumnInfo(
            name="Column at E5",
            index=1,
            absolute_index=4,
            dtype="string",
            dtype_from="provided_by_name",
            column_name_from="provided",
        ),
    ]
    for table in tables:
        assert table.selected_columns == expected_column_info

    expected_data = {
        # Dtype should be int, looked up by index
        "Column at D5": [1, 2, 3, 4],
        # Dtype should be string, looked up by name
        "Column at E5": ["4", "5", "6", "8"],
    }
    expected_pl_df = pl.DataFrame(expected_data)
    expected_pd_df = pd.DataFrame(expected_data)

    for table in tables:
        pl_assert_frame_equal(table.to_polars(), expected_pl_df)
    for table in tables:
        pd_assert_frame_equal(table.to_pandas(), expected_pd_df)
def test_use_column_range_with_offset_with_sheet_and_specified_dtypes() -> None:
    """Closed (``H:K``) and open-ended (``H:``) ranges on an offset sheet with explicit dtypes.

    One dtype is given by absolute index and one by column name; both range forms must give
    the same column metadata and the same data.
    """
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
    sheet_closed = reader.load_sheet(
        "without-table",
        use_columns="H:K",
        header_row=9,
        dtypes={7: "int", "Column at I10": "string"},
    )
    sheet_open_ended = reader.load_sheet(
        "without-table",
        use_columns="H:",
        header_row=9,
        dtypes={7: "int", "Column at I10": "string"},
    )
    sheets = (sheet_closed, sheet_open_ended)

    def expected_columns(unnamed_column: Any) -> dict[str, Any]:
        """Expected data; only the content of the unnamed column depends on the library."""
        return {
            # Dtype should be int, looked up by index
            "Column at H10": [1, 2, 3],
            # Dtype should be string, looked up by name
            "Column at I10": ["4", "5", "6"],
            "__UNNAMED__2": unnamed_column,
            "Column at K10": [7.0, 8.0, 9.0],
        }

    expected_data_polars = expected_columns(pl.Series([None, None, None], dtype=pl.String))

    # In pandas 3, string columns use nan instead of None for missing values
    pd_version = tuple(int(x) for x in pd.__version__.split(".")[:2])
    if pd_version >= (3, 0):
        na_value = np.nan
    else:
        na_value = None
    expected_data_pandas = expected_columns([na_value, na_value, na_value])

    expected_column_info = [
        fastexcel.ColumnInfo(
            name="Column at H10",
            index=0,
            absolute_index=7,
            dtype="int",
            dtype_from="provided_by_index",
            column_name_from="looked_up",
        ),
        fastexcel.ColumnInfo(
            name="Column at I10",
            index=1,
            absolute_index=8,
            dtype="string",
            dtype_from="provided_by_name",
            column_name_from="looked_up",
        ),
        fastexcel.ColumnInfo(
            name="__UNNAMED__2",
            index=2,
            absolute_index=9,
            dtype="string",
            dtype_from="guessed",
            column_name_from="generated",
        ),
        fastexcel.ColumnInfo(
            name="Column at K10",
            index=3,
            absolute_index=10,
            dtype="float",
            dtype_from="guessed",
            column_name_from="looked_up",
        ),
    ]
    for sheet in sheets:
        assert sheet.selected_columns == expected_column_info

    expected_pl_df = pl.DataFrame(expected_data_polars)
    expected_pd_df = pd.DataFrame(expected_data_pandas)

    for sheet in sheets:
        pl_assert_frame_equal(sheet.to_polars(), expected_pl_df)
    for sheet in sheets:
        pd_assert_frame_equal(sheet.to_pandas(), expected_pd_df, check_dtype=False)
================================================
FILE: python/tests/test_defined_names.py
================================================
import fastexcel
import pytest
from .utils import path_for_fixture
@pytest.mark.parametrize("path", ("sheet-with-defined-names.xlsx",))
def test_defined_names(path: str) -> None:
    """The reader must list the workbook's defined names along with their formulas."""
    reader = fastexcel.read_excel(path_for_fixture(path))

    names_and_formulas = (
        ("AddingValues", "SUM(sheet1!$K$5:$K$6)"),
        ("DefinedRange", "sheet1!$A$5:$D$7"),
        ("NamedConstant", "3.4"),
    )
    expected_defined_names = [
        fastexcel.DefinedName(name=name, formula=formula) for name, formula in names_and_formulas
    ]

    assert reader.defined_names() == expected_defined_names
================================================
FILE: python/tests/test_dtypes.py
================================================
from __future__ import annotations
import logging
from datetime import date, datetime
from typing import Any, Literal
import fastexcel
import numpy as np
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from .utils import get_expected_pandas_dtype, path_for_fixture
@pytest.fixture
def expected_data() -> dict[str, list[Any]]:
    """Expected column contents used by the tests on ``fixture-multi-dtypes-columns.xlsx``."""
    employee_ids = ["123456", "44333", "44333", "87878", "87878"]
    employee_ids += ["US00011", "135967", "IN86868", "IN86868"]
    employee_names = [f"Test{number}" for number in (1, 2, 2, 3, 3, 4, 5, 6, 6)]

    columns: dict[str, list[Any]] = {}
    columns["Employee ID"] = employee_ids
    columns["Employee Name"] = employee_names
    columns["Date"] = [datetime(2023, 7, 21)] * 9
    columns["Details"] = ["Healthcare"] * 7 + ["Something"] * 2
    columns["Asset ID"] = ["84444"] * 7 + ["ABC123"] * 2
    columns["Mixed dates"] = ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3
    columns["Mixed bools"] = ["true"] * 5 + ["false"] * 3 + ["other"]
    return columns
def test_sheet_with_mixed_dtypes(expected_data: dict[str, list[Any]]) -> None:
    """Load the mixed-dtype sheet with default options and compare it to the expected data."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    sheet = reader.load_sheet(0)

    # The "Date" column of the expected frames uses millisecond precision
    expected_pd_df = pd.DataFrame(expected_data).astype({"Date": "datetime64[ms]"})
    pd_assert_frame_equal(sheet.to_pandas(), expected_pd_df)

    expected_pl_df = pl.DataFrame(
        expected_data, schema_overrides={"Date": pl.Datetime(time_unit="ms")}
    )
    pl_assert_frame_equal(sheet.to_polars(), expected_pl_df)
def test_sheet_with_mixed_dtypes_and_sample_rows(expected_data: dict[str, list[Any]]) -> None:
    """Check dtype guessing when only 5 rows are sampled, with and without skipping rows."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))

    # Since we skip rows here, the dtypes should be correctly guessed, even if we only check 5 rows
    skipped_sheet = reader.load_sheet(0, schema_sample_rows=5, skip_rows=5)
    expected_tail = {name: column[5:] for name, column in expected_data.items()}

    expected_tail_pd = pd.DataFrame(expected_tail).astype({"Date": "datetime64[ms]"})
    pd_assert_frame_equal(skipped_sheet.to_pandas(), expected_tail_pd)

    expected_tail_pl = pl.DataFrame(
        expected_tail, schema_overrides={"Date": pl.Datetime(time_unit="ms")}
    )
    pl_assert_frame_equal(skipped_sheet.to_polars(), expected_tail_pl)

    # Guess the sheet's dtypes on 5 rows only
    sampled_sheet = reader.load_sheet(0, schema_sample_rows=5)

    # String fields should not have been loaded
    expected_data.update(
        {
            "Employee ID": [
                123456.0,
                44333.0,
                44333.0,
                87878.0,
                87878.0,
                None,
                135967.0,
                None,
                None,
            ],
            "Asset ID": [84444.0] * 7 + [None] * 2,
            "Mixed dates": [datetime(2023, 7, 21)] * 6 + [None] * 3,
            "Mixed bools": [True] * 5 + [False] * 3 + [None],
        }
    )

    datetime_columns = ("Date", "Mixed dates")
    expected_pd_df = pd.DataFrame(expected_data).astype(
        {name: "datetime64[ms]" for name in datetime_columns}
    )
    pd_assert_frame_equal(sampled_sheet.to_pandas(), expected_pd_df)

    expected_pl_df = pl.DataFrame(
        expected_data,
        schema_overrides={name: pl.Datetime(time_unit="ms") for name in datetime_columns},
    )
    pl_assert_frame_equal(sampled_sheet.to_polars(), expected_pl_df)
@pytest.mark.parametrize("dtype_by_index", (True, False))
@pytest.mark.parametrize(
    "dtype,expected_data,expected_pl_dtype",
    [
        ("int", [123456, 44333, 44333, 87878, 87878], pl.Int64),
        ("float", [123456.0, 44333.0, 44333.0, 87878.0, 87878.0], pl.Float64),
        ("string", ["123456", "44333", "44333", "87878", "87878"], pl.Utf8),
        ("boolean", [True] * 5, pl.Boolean),
        (
            "datetime",
            [datetime(2238, 1, 3)] + [datetime(2021, 5, 17)] * 2 + [datetime(2140, 8, 6)] * 2,
            pl.Datetime,
        ),
        (
            "date",
            [date(2238, 1, 3)] + [date(2021, 5, 17)] * 2 + [date(2140, 8, 6)] * 2,
            pl.Date,
        ),
        # conversion to duration not supported yet
        ("duration", [pd.NaT] * 5, pl.Duration),
    ],
)
def test_sheet_with_mixed_dtypes_specify_dtypes(
    dtype_by_index: bool,
    dtype: fastexcel.DType,
    expected_data: list[Any],
    expected_pl_dtype: pl.DataType,
) -> None:
    """Force a dtype on "Employee ID", addressed either by index or by name."""
    dtypes: fastexcel.DTypeMap
    if dtype_by_index:
        dtypes = {0: dtype}
    else:
        dtypes = {"Employee ID": dtype}

    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    sheet = reader.load_sheet(0, dtypes=dtypes, n_rows=5)
    assert sheet.specified_dtypes == dtypes

    pd_column = sheet.to_pandas()["Employee ID"]
    assert pd_column.dtype == get_expected_pandas_dtype(dtype)
    assert pd_column.to_list() == expected_data

    pl_column = sheet.to_polars()["Employee ID"]
    assert pl_column.dtype == expected_pl_dtype
    # polars reports the missing durations as None rather than NaT
    if dtype != "duration":
        expected_pl_values = expected_data
    else:
        expected_pl_values = [None] * 5
    assert pl_column.to_list() == expected_pl_values
@pytest.mark.parametrize(
    "dtypes,expected,fastexcel_dtype,expected_pl_dtype",
    [
        (None, datetime(2023, 7, 21), "datetime", pl.Datetime),
        ({"Date": "datetime"}, datetime(2023, 7, 21), "datetime", pl.Datetime),
        ({"Date": "date"}, date(2023, 7, 21), "date", pl.Date),
        ({"Date": "string"}, "2023-07-21 00:00:00", "string", pl.Utf8),
        ({2: "datetime"}, datetime(2023, 7, 21), "datetime", pl.Datetime),
        ({2: "date"}, date(2023, 7, 21), "date", pl.Date),
        ({2: "string"}, "2023-07-21 00:00:00", "string", pl.Utf8),
    ],
)
def test_sheet_datetime_conversion(
    dtypes: fastexcel.DTypeMap | None,
    expected: Any,
    fastexcel_dtype: str,
    expected_pl_dtype: pl.DataType,
) -> None:
    """Load the "Date" column as datetime, date or string, addressed by name or by index."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    sheet = reader.load_sheet(0, dtypes=dtypes)
    assert sheet.specified_dtypes == dtypes

    # All 9 rows are expected to hold the same value
    expected_values = [expected] * 9

    pd_column = sheet.to_pandas()["Date"]
    assert pd_column.dtype == get_expected_pandas_dtype(fastexcel_dtype)
    assert pd_column.to_list() == expected_values

    pl_column = sheet.to_polars()["Date"]
    assert pl_column.dtype == expected_pl_dtype
    assert pl_column.to_list() == expected_values
@pytest.mark.parametrize("eager", [True, False])
@pytest.mark.parametrize("dtype_coercion", ["coerce", None])
def test_dtype_coercion_behavior__coerce(
    dtype_coercion: Literal["coerce"] | None, eager: bool
) -> None:
    """With "coerce" (explicit or omitted), the "Mixed dates" column is loaded as strings."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))

    # Only forward the option when it was given, so that the default gets exercised as well
    kwargs = {"dtype_coercion": dtype_coercion} if dtype_coercion else {}
    sheet_or_rb = (
        reader.load_sheet(0, eager=eager, **kwargs)  # type:ignore[call-overload]
    )
    if eager:
        rb = sheet_or_rb
    else:
        rb = sheet_or_rb.to_arrow()

    expected_values = ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3

    pd_column = rb.to_pandas()["Mixed dates"]
    assert pd_column.dtype == get_expected_pandas_dtype("string")
    assert pd_column.to_list() == ["2023-07-21 00:00:00"] * 6 + ["July 23rd"] * 3

    pl_df = pl.from_arrow(data=rb)
    assert isinstance(pl_df, pl.DataFrame)
    pl_column = pl_df["Mixed dates"]
    assert pl_column.dtype == pl.Utf8
    assert pl_column.to_list() == expected_values
@pytest.mark.parametrize("eager", [True, False])
def test_dtype_coercion_behavior__strict_sampling_eveything(eager: bool) -> None:
    """Strict coercion with the default sampling must raise on the mixed-dtype fixture."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))

    expected_error = fastexcel.UnsupportedColumnTypeCombinationError
    with pytest.raises(expected_error, match="type coercion is strict"):
        if not eager:
            reader.load_sheet(0, dtype_coercion="strict").to_arrow()
        else:
            reader.load_sheet_eager(0, dtype_coercion="strict")
@pytest.mark.parametrize("eager", [True, False])
def test_dtype_coercion_behavior__strict_sampling_limit(eager: bool) -> None:
    """Strict coercion with dtypes sampled on 5 rows: later non-conforming cells become null."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    if eager:
        record_batch = reader.load_sheet_eager(0, dtype_coercion="strict", schema_sample_rows=5)
    else:
        record_batch = reader.load_sheet(
            0, dtype_coercion="strict", schema_sample_rows=5
        ).to_arrow()

    pd_df = record_batch.to_pandas()
    pd_mixed_dates = pd_df["Mixed dates"]
    assert pd_mixed_dates.dtype == "datetime64[ms]"
    assert pd_mixed_dates.to_list() == [pd.Timestamp("2023-07-21 00:00:00")] * 6 + [pd.NaT] * 3
    pd_asset_ids = pd_df["Asset ID"]
    assert pd_asset_ids.dtype == "float64"
    assert pd_asset_ids.replace(np.nan, None).to_list() == [84444.0] * 7 + [None] * 2

    pl_df = pl.from_arrow(data=record_batch)
    assert isinstance(pl_df, pl.DataFrame)
    pl_mixed_dates = pl_df["Mixed dates"]
    assert pl_mixed_dates.dtype == pl.Datetime
    assert pl_mixed_dates.to_list() == [datetime(2023, 7, 21)] * 6 + [None] * 3
    pl_asset_ids = pl_df["Asset ID"]
    assert pl_asset_ids.dtype == pl.Float64
    assert pl_asset_ids.to_list() == [84444.0] * 7 + [None] * 2
def test_one_dtype_for_all() -> None:
    """A single dtype passed as ``dtypes`` must be applied to every column of the sheet."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-dtypes-columns.xlsx"))
    sheet = reader.load_sheet(0, dtypes="string")

    column_names = [
        "Employee ID",
        "Employee Name",
        "Date",
        "Details",
        "Asset ID",
        "Mixed dates",
        "Mixed bools",
    ]
    # Every column shares the same metadata apart from its name and position
    expected_columns = [
        fastexcel.ColumnInfo(
            name=column_name,
            index=position,
            absolute_index=position,
            dtype="string",
            dtype_from="provided_for_all",
            column_name_from="looked_up",
        )
        for position, column_name in enumerate(column_names)
    ]

    assert sheet.available_columns() == expected_columns
    assert sheet.to_polars().dtypes == [pl.String] * 7
def test_fallback_infer_dtypes(caplog: pytest.LogCaptureFixture) -> None:
    """A column whose dtype cannot be inferred falls back to string and logs a warning."""
    reader = fastexcel.read_excel(path_for_fixture("infer-dtypes-fallback.xlsx"))
    sheet = reader.load_sheet(0)

    # Ensure a warning message was logged to explain the fallback to string
    expected_record = (
        "fastexcel.types.dtype",
        logging.WARNING,
        "Could not determine dtype for column 1, falling back to string",
    )
    assert caplog.record_tuples == [expected_record]

    id_column = fastexcel.ColumnInfo(
        name="id",
        index=0,
        absolute_index=0,
        dtype="float",
        dtype_from="guessed",
        column_name_from="looked_up",
    )
    label_column = fastexcel.ColumnInfo(
        name="label",
        index=1,
        absolute_index=1,
        dtype="string",
        dtype_from="guessed",
        column_name_from="looked_up",
    )
    assert sheet.available_columns() == [id_column, label_column]
    assert sheet.to_polars().dtypes == [pl.Float64, pl.String]
@pytest.mark.parametrize(
    ("dtype", "expected_data"),
    [
        (
            "int",
            [None] * 2
            + [-1.0, 0.0, 1.0, 0.0, 1.0, 1.0, -1.0, 0.0, 1.0, None, 1.0, 0.0]
            + [None] * 7
            + [0.0],
        ),
        (
            "float",
            [None] * 2
            + [-1.0, 0.0, 1.0, 0.0, 1.0, 1.1, -1.0, 0.0, 1.0, 1.1, 1.0, 0.0]
            + [None] * 7
            + [0.1],
        ),
        (
            "string",
            [
                None,
                "foo",
                "-1",
                "0",
                "1",
                "0",
                "1",
                "1.1",
                "-1",
                "0",
                "1",
                "1.1",
                "true",
                "false",
                "2023-07-21 00:00:00",
                "2023-07-21 12:20:00",
                # calamine reads a time as datetimes here, which seems wrong
                "1899-12-31 12:20:00",
                "07/21/2023",
                "7/21/2023 12:20:00 PM",
                "July 23rd",
                "12:20:00",
                "0.1",
            ],
        ),
        (
            "boolean",
            [None] * 2
            + [True, False, True, False, True, True]
            + [None] * 4
            + [True, False]
            + [None] * 7
            + [True],
        ),
        (
            "datetime",
            [pd.NaT] * 2
            + [
                pd.Timestamp("1899-12-30 00:00:00"),
                pd.Timestamp("1899-12-31 00:00:00"),
                pd.Timestamp("1900-01-01 00:00:00"),
                pd.Timestamp("1899-12-31 00:00:00"),
                pd.Timestamp("1900-01-01 00:00:00"),
                pd.Timestamp("1900-01-01 02:24:00"),
            ]
            + [pd.NaT] * 6
            + [
                pd.Timestamp("2023-7-21 00:00:00"),
                pd.Timestamp("2023-7-21 12:20:00"),
                # calamine currently adds a date to a time, which is
                # questionable
                pd.Timestamp("1899-12-31 12:20:00"),
            ]
            + [pd.NaT] * 4
            + [
                # calamine converts percentages to datetimes (since it does not
                # distinguish from floats), which seems questionable
                pd.Timestamp("1899-12-31 02:24:00")
            ],
        ),
        (
            "date",
            [None] * 2
            + [
                pd.Timestamp("1899-12-30").date(),
                pd.Timestamp("1899-12-31").date(),
                pd.Timestamp("1900-01-01").date(),
                pd.Timestamp("1899-12-31").date(),
                pd.Timestamp("1900-01-01").date(),
                pd.Timestamp("1900-01-01").date(),
            ]
            + [None] * 6
            + [
                pd.Timestamp("2023-7-21").date(),
                pd.Timestamp("2023-7-21").date(),
                # calamine converts any time to 1899-12-31, which is
                # questionable
                pd.Timestamp("1899-12-31").date(),
            ]
            + [None] * 4
            + [
                # calamine converts percentages to dates (since it does not
                # distinguish from floats), which seems questionable
                pd.Timestamp("1899-12-31").date()
            ],
        ),
        (
            "duration",
            [pd.NaT] * 14
            + [
                # dates/datetimes are converted to durations, which seems
                # questionable
                pd.Timedelta(datetime(2023, 7, 21 + 1) - datetime(1899, 12, 31)),
                pd.Timedelta(datetime(2023, 7, 21 + 1, 12, 20, 0) - datetime(1899, 12, 31)),
                pd.Timedelta(hours=12, minutes=20),
            ]
            + [pd.NaT] * 5,
        ),
    ],
)
def test_to_arrow_with_errors(
    dtype: fastexcel.DType,
    expected_data: list[Any],
):
    """Force a dtype on a column of heterogeneous cells and check the values and cell errors.

    Cells that cannot be converted are expected to come back as null and to be reported, in
    order, in the errors returned next to the record batch.
    """
    reader = fastexcel.read_excel(path_for_fixture("fixture-type-errors.xlsx"))
    sheet = reader.load_sheet(0, dtypes={"Column": dtype})
    record_batch, cell_errors = sheet.to_arrow_with_errors()

    pd_column = record_batch.to_pandas()["Column"]
    # For string columns in pandas 3, replace pd.NA with None for comparison
    missing_markers = [np.nan, pd.NA] if dtype == "string" else np.nan
    assert pd_column.replace(missing_markers, None).to_list() == expected_data

    def item_to_polars(item: Any):
        """Translate an expected pandas value into what polars returns for it."""
        if isinstance(item, pd.Timestamp):
            return item.to_pydatetime()
        return None if pd.isna(item) else item

    pl_df = pl.from_arrow(record_batch)
    assert isinstance(pl_df, pl.DataFrame)
    pl_expected_data = [item_to_polars(item) for item in expected_data]
    assert pl_df["Column"].to_list() == pl_expected_data

    # the only empty cell is (0, 0), so all other cells that were read as None
    # should be errors
    expected_error_positions = [
        (row, 0)
        for row, value in enumerate(expected_data)
        if row >= 1 and value in {None, pd.NaT}
    ]
    if expected_error_positions:
        assert cell_errors is not None
        error_positions = [error.offset_position for error in cell_errors.errors]
        assert error_positions == expected_error_positions
def test_guess_dtypes_with_div0_error() -> None:
    """A column containing a division-by-zero error is still guessed as float.

    The erroneous cell is expected to be loaded as a missing value.
    """
    reader = fastexcel.read_excel(path_for_fixture("div0.xlsx"))
    sheet = reader.load_sheet(0)

    # All three columns share the same metadata apart from their name and position
    expected_columns = [
        fastexcel.ColumnInfo(
            name=column_name,
            index=position,
            absolute_index=position,
            dtype="float",
            dtype_from="guessed",
            column_name_from="looked_up",
        )
        for position, column_name in enumerate(["dividend", "divisor", "quotient"])
    ]
    assert sheet.available_columns() == expected_columns

    expected_data = {
        "dividend": [42.0, 43.0, 44.0, 45.0],
        "divisor": [0.0, 1.0, 2.0, 3.0],
        "quotient": [None, 43.0, 22.0, 15.0],
    }
    pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_data))
    pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_data))
================================================
FILE: python/tests/test_durations.py
================================================
from __future__ import annotations
from datetime import date, datetime, timedelta
import fastexcel
import numpy as np
import pandas as pd
import polars as pl
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.datatypes import DataType as PolarsDataType
from polars.datatypes import Date as PlDate
from polars.datatypes import Datetime as PlDateTime
from polars.datatypes import Duration as PlDuration
from polars.datatypes import Utf8 as PlUtf8
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from .utils import get_expected_pandas_dtype, path_for_fixture
def test_sheet_with_different_time_types() -> None:
    """Check dtypes and contents of date, date-string, time and datetime columns of an ODS file."""
    reader = fastexcel.read_excel(path_for_fixture("dates.ods"))
    sheet = reader.load_sheet_by_idx(0)

    pd_df = sheet.to_pandas()
    pl_df = sheet.to_polars()

    ## dtypes
    expected_pd_dtypes = {
        "date": np.dtype("object"),
        "datestr": get_expected_pandas_dtype("string"),
        "time": np.dtype("timedelta64[ms]"),
        "datetime": np.dtype("datetime64[ms]"),
    }
    for column_name, expected_dtype in expected_pd_dtypes.items():
        assert pd_df[column_name].dtype == expected_dtype

    expected_pl_dtypes: dict[str, PolarsDataType] = {
        "date": PlDate(),
        "datestr": PlUtf8(),
        "time": PlDuration(time_unit="ms"),
        "datetime": PlDateTime(time_unit="ms", time_zone=None),
    }
    assert dict(zip(pl_df.columns, pl_df.dtypes)) == expected_pl_dtypes

    ## Contents
    expected_time = pd.Series([pd.to_timedelta("01:02:03")]).astype("timedelta64[ms]")
    expected_datetime = pd.Series([pd.to_datetime("2023-06-01 02:03:04")]).astype(
        "datetime64[ms]"
    )
    expected_pd = pd.DataFrame(
        {
            "date": [date(2023, 6, 1)],
            "datestr": ["2023-06-01T02:03:04+02:00"],
            "time": expected_time,
            "datetime": expected_datetime,
        }
    )
    expected_pl = pl.DataFrame(
        {
            "date": [date(2023, 6, 1)],
            "datestr": ["2023-06-01T02:03:04+02:00"],
            "time": [timedelta(hours=1, minutes=2, seconds=3)],
            "datetime": [datetime(2023, 6, 1, 2, 3, 4)],
        },
        schema=expected_pl_dtypes,
    )

    pd_assert_frame_equal(pd_df, expected_pd)
    pl_assert_frame_equal(pl_df, expected_pl)
def test_sheet_with_offset_header_row_and_durations() -> None:
    """A duration column must load as millisecond durations when the header is on row 10."""
    reader = fastexcel.read_excel(path_for_fixture("single-sheet-skip-rows-durations.xlsx"))
    sheet = reader.load_sheet(0, header_row=10)

    pd_df = sheet.to_pandas()
    pl_df = sheet.to_polars()

    pd_column = pd_df["Tot. Time Away From System"]
    assert pd_column.dtype == np.dtype("timedelta64[ms]")
    assert pd_column.tolist() == [pd.Timedelta("01:18:43"), pd.Timedelta("07:16:51")]

    pl_column = pl_df["Tot. Time Away From System"]
    assert pl_column.dtype == pl.Duration(time_unit="ms")
    assert pl_column.to_list() == [
        timedelta(hours=1, minutes=18, seconds=43),
        timedelta(hours=7, minutes=16, seconds=51),
    ]
================================================
FILE: python/tests/test_eagerness.py
================================================
from datetime import date, datetime, timedelta
import fastexcel
import polars as pl
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from pyarrow import RecordBatch
from .utils import path_for_fixture
def test_load_sheet_eager_single_sheet() -> None:
    """Eager and lazy loading of a single-sheet workbook must give the same frames."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))

    # pandas
    pd_from_eager = reader.load_sheet_eager(0).to_pandas()
    pd_from_lazy = reader.load_sheet(0).to_pandas()
    pd_assert_frame_equal(pd_from_eager, pd_from_lazy)

    # polars
    pl_from_eager = pl.from_arrow(data=reader.load_sheet_eager(0))
    assert isinstance(pl_from_eager, pl.DataFrame)
    pl_from_lazy = reader.load_sheet(0).to_polars()
    pl_assert_frame_equal(pl_from_eager, pl_from_lazy)
def test_multiple_sheets_with_unnamed_columns() -> None:
    """Eager and lazy loading of a sheet with unnamed columns must give the same frames.

    The ``-> None`` return annotation matches the other tests of this module.
    """
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

    eager_pandas = excel_reader.load_sheet_eager("With unnamed columns").to_pandas()
    lazy_pandas = excel_reader.load_sheet("With unnamed columns").to_pandas()
    pd_assert_frame_equal(eager_pandas, lazy_pandas)

    eager_polars = pl.from_arrow(data=excel_reader.load_sheet_eager("With unnamed columns"))
    # pl.from_arrow's result is checked to be a DataFrame before it is compared
    assert isinstance(eager_polars, pl.DataFrame)
    lazy_polars = excel_reader.load_sheet("With unnamed columns").to_polars()
    pl_assert_frame_equal(eager_polars, lazy_polars)
def test_eager_with_an_ods_file_should_return_a_recordbatch() -> None:
    """Eagerly loading a sheet of an ODS file must return a pyarrow ``RecordBatch``."""
    reader = fastexcel.read_excel(path_for_fixture("dates.ods"))

    record_batch = reader.load_sheet_eager(0)
    assert isinstance(record_batch, RecordBatch)

    pl_df = pl.from_arrow(record_batch)
    assert isinstance(pl_df, pl.DataFrame)

    # The datetime and duration columns of the expected frame are cast to milliseconds
    expected_pl_df = pl.DataFrame(
        {
            "date": [date(2023, 6, 1)],
            "datestr": ["2023-06-01T02:03:04+02:00"],
            "time": [timedelta(hours=1, minutes=2, seconds=3)],
            "datetime": [datetime(2023, 6, 1, 2, 3, 4)],
        }
    ).with_columns(*(pl.col(col).dt.cast_time_unit("ms") for col in ("datetime", "time")))
    pl_assert_frame_equal(pl_df, expected_pl_df)
================================================
FILE: python/tests/test_empty.py
================================================
import fastexcel
import pytest
from .utils import path_for_fixture
@pytest.mark.parametrize("path", ("empty.ods", "empty.xlsx"))
def test_empty(path: str) -> None:
    """An empty workbook must load as an empty frame in both pandas and polars."""
    reader = fastexcel.read_excel(path_for_fixture(path))
    sheet = reader.load_sheet_by_idx(0)

    pd_df = sheet.to_pandas()
    assert pd_df.empty
    pl_df = sheet.to_polars()
    assert pl_df.is_empty()
================================================
FILE: python/tests/test_errors.py
================================================
from __future__ import annotations

import re

import fastexcel
import pytest

from .utils import path_for_fixture
def test_cell_error_repr() -> None:
    """Check the ``repr`` of the first cell error reported when forcing the int dtype."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-type-errors.xlsx"))
    sheet = reader.load_sheet(0, dtypes={"Column": "int"})
    _, cell_errors = sheet.to_arrow_with_errors()
    assert cell_errors is not None

    expected_repr = """CellError(position=(2, 0), offset_position=(1, 0), row_offset=1, detail="Expected int but got 'String(\\"foo\\")'")"""  # noqa: E501
    assert repr(cell_errors.errors[0]) == expected_repr
def test_read_excel_bad_type() -> None:
    """Passing something that is neither a string nor bytes must be rejected."""
    with pytest.raises(
        fastexcel.InvalidParametersError, match="source must be a string or bytes"
    ):
        fastexcel.read_excel(42)  # type: ignore[arg-type]
def test_does_not_exist() -> None:
    """Opening a missing file must raise a ``CalamineError`` carrying the full context."""
    expected_lines = [
        "calamine error: Cannot detect file format",
        "Context:",
        "0: Could not open workbook at path_does_not_exist.nope",
        "1: could not load excel file at path_does_not_exist.nope",
    ]
    # `pytest.raises(match=...)` performs a regex search: escape every line so that characters
    # such as "." are matched literally. Whitespace at the start of a line is matched loosely so
    # that the check does not depend on how the context entries are indented.
    expected_pattern = r"\n\s*".join(re.escape(line) for line in expected_lines)

    with pytest.raises(fastexcel.CalamineError, match=expected_pattern) as exc_info:
        fastexcel.read_excel("path_does_not_exist.nope")
    assert exc_info.value.__doc__ == "Generic calamine error"

    # Should also work with the base error type
    with pytest.raises(fastexcel.FastExcelError, match=expected_pattern):
        fastexcel.read_excel("path_does_not_exist.nope")
def test_sheet_idx_not_found_error() -> None:
    """Loading sheet index 42 from a one-sheet workbook raises SheetNotFoundError."""
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    # The text below is passed to pytest.raises(match=...), so it is interpreted as a regular
    # expression rather than compared literally.
    expected_message = """sheet at index 42 not found
Context:
0: Sheet index 42 is out of range. File has 1 sheets."""
    with pytest.raises(fastexcel.SheetNotFoundError, match=expected_message) as exc_info:
        excel_reader.load_sheet(42)
    # The docstring of the raised exception is checked as well.
    assert exc_info.value.__doc__ == "Sheet was not found"
    # Should also work with the base error type
    with pytest.raises(fastexcel.FastExcelError, match=expected_message):
        excel_reader.load_sheet(42)
def test_sheet_name_not_found_error() -> None:
    """Loading an unknown sheet name raises SheetNotFoundError, catchable as FastExcelError."""
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    # The text below is passed to pytest.raises(match=...), so it is interpreted as a regular
    # expression rather than compared literally.
    expected_message = """sheet with name "idontexist" not found
Context:
0: Sheet "idontexist" not found in file. Available sheets: "January"."""
    with pytest.raises(fastexcel.SheetNotFoundError, match=expected_message) as exc_info:
        excel_reader.load_sheet("idontexist")
    assert exc_info.value.__doc__ == "Sheet was not found"
    # Should also work with the base error type (mirrors the lookup-by-index test)
    with pytest.raises(fastexcel.FastExcelError, match=expected_message):
        excel_reader.load_sheet("idontexist")
@pytest.mark.parametrize(
    ("exc_class", "expected_docstring"),
    [
        (fastexcel.FastExcelError, "The base class for all fastexcel errors"),
        (
            fastexcel.UnsupportedColumnTypeCombinationError,
            "Column contains an unsupported type combination",
        ),
        (fastexcel.CannotRetrieveCellDataError, "Data for a given cell cannot be retrieved"),
        (
            fastexcel.CalamineCellError,
            "calamine returned an error regarding the content of the cell",
        ),
        (fastexcel.CalamineError, "Generic calamine error"),
        (fastexcel.ColumnNotFoundError, "Column was not found"),
        (fastexcel.SheetNotFoundError, "Sheet was not found"),
        (fastexcel.ArrowError, "Generic arrow error"),
        (fastexcel.InvalidParametersError, "Provided parameters are invalid"),
    ],
)
def test_docstrings(exc_class: type[Exception], expected_docstring: str) -> None:
    """Each exported exception class carries the expected docstring."""
    actual_docstring = exc_class.__doc__
    assert actual_docstring == expected_docstring
def test_schema_sample_rows_must_be_nonzero() -> None:
    """Both load_sheet and load_table reject schema_sample_rows=0."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    rejection = "schema_sample_rows cannot be 0, as it would prevent dtype inferring"

    with pytest.raises(fastexcel.InvalidParametersError, match=rejection):
        reader.load_sheet(0, schema_sample_rows=0)

    with pytest.raises(fastexcel.InvalidParametersError, match=rejection):
        reader.load_table("my-table", schema_sample_rows=0)
================================================
FILE: python/tests/test_fastexcel.py
================================================
from __future__ import annotations
from collections.abc import Callable
from datetime import datetime
from typing import Any
import fastexcel
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from .utils import path_for_fixture
def test_single_sheet():
    """The single sheet loads identically whether addressed by name or by index."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    assert reader.sheet_names == ["January"]

    by_name = reader.load_sheet("January")
    by_idx = reader.load_sheet(0)

    # Metadata
    assert by_name.name == by_idx.name == "January"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 2

    columns = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    pandas_expected = pd.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pandas_expected)

    polars_expected = pl.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), polars_expected)
def test_single_sheet_bytes():
    """A workbook passed as raw bytes loads identically by sheet name and by index."""
    with open(path_for_fixture("fixture-single-sheet.xlsx"), "rb") as workbook_file:
        reader = fastexcel.read_excel(workbook_file.read())
    assert reader.sheet_names == ["January"]

    by_name = reader.load_sheet("January")
    by_idx = reader.load_sheet(0)

    # Metadata
    assert by_name.name == by_idx.name == "January"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 2

    columns = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    pandas_expected = pd.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pandas_expected)

    polars_expected = pl.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), polars_expected)
def test_single_sheet_with_types():
    """Float, bool and datetime columns of the typed fixture are read with the expected values."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    assert reader.sheet_names == ["Sheet1"]

    loaded = reader.load_sheet(0)
    assert loaded.name == "Sheet1"
    assert loaded.height == loaded.total_height == 3
    assert loaded.width == 4

    pandas_dates = pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype("datetime64[ms]")
    pandas_expected = pd.DataFrame(
        {
            "__UNNAMED__0": [0.0, 1.0, 2.0],
            "bools": [True, False, True],
            "dates": pandas_dates,
            "floats": [12.35, 42.69, 1234567],
        }
    )
    pd_assert_frame_equal(loaded.to_pandas(), pandas_expected)

    # polars side: dates start as strings and are parsed, then aligned on millisecond precision
    parse_dates = pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    polars_expected = pl.DataFrame(
        {
            "__UNNAMED__0": [0.0, 1.0, 2.0],
            "bools": [True, False, True],
            "dates": ["2022-03-02 05:43:04"] * 3,
            "floats": [12.35, 42.69, 1234567],
        }
    ).with_columns(parse_dates)
    pl_assert_frame_equal(loaded.to_polars(), polars_expected)
def test_multiple_sheets():
    """Each sheet of the multi-sheet fixture yields its own data in pandas and in polars."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))
    assert reader.sheet_names == ["January", "February", "With unnamed columns"]

    january = {"Month": [1.0], "Year": [2019.0]}
    february = {"Month": [2.0, 3.0, 4.0], "Year": [2019.0, 2021.0, 2022.0]}
    with_unnamed = {
        "col1": [2.0, 3.0],
        "__UNNAMED__1": [1.5, 2.5],
        "col3": ["hello", "world"],
        "__UNNAMED__3": [-5.0, -6.0],
        "col5": ["a", "b"],
    }

    # pandas
    pd_assert_frame_equal(reader.load_sheet_by_idx(0).to_pandas(), pd.DataFrame(january))
    pd_assert_frame_equal(reader.load_sheet_by_idx(1).to_pandas(), pd.DataFrame(february))
    pd_assert_frame_equal(
        reader.load_sheet_by_name("With unnamed columns").to_pandas(),
        pd.DataFrame(with_unnamed),
    )

    # polars
    pl_assert_frame_equal(reader.load_sheet_by_idx(0).to_polars(), pl.DataFrame(january))
    pl_assert_frame_equal(reader.load_sheet_by_idx(1).to_polars(), pl.DataFrame(february))
    pl_assert_frame_equal(
        reader.load_sheet_by_name("With unnamed columns").to_polars(),
        pl.DataFrame(with_unnamed),
    )
def test_sheets_with_header_line_diff_from_zero():
    """Sheet1 is read with header_row=1, by name and by index, with identical results."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
    assert reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]

    by_name = reader.load_sheet("Sheet1", header_row=1)
    by_idx = reader.load_sheet(0, header_row=1)

    # Metadata
    assert by_name.name == by_idx.name == "Sheet1"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 2

    columns = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    pandas_expected = pd.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pandas_expected)

    polars_expected = pl.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), polars_expected)
def test_sheets_with_no_header():
    """With header_row=None, Sheet2 gets generated `__UNNAMED__<n>` column names."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
    assert reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]

    by_name = reader.load_sheet("Sheet2", header_row=None)
    by_idx = reader.load_sheet(1, header_row=None)

    # Metadata
    assert by_name.name == by_idx.name == "Sheet2"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 3

    columns = {
        "__UNNAMED__0": [1.0, 2.0],
        "__UNNAMED__1": [3.0, 4.0],
        "__UNNAMED__2": [5.0, 6.0],
    }

    pandas_expected = pd.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pandas_expected)

    polars_expected = pl.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), polars_expected)
def test_sheets_with_empty_rows_before_header():
    """Sheet3 loads with default options and exposes the Month/Year data."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
    assert reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]

    by_name = reader.load_sheet("Sheet3")
    by_idx = reader.load_sheet(2)

    # Metadata
    assert by_name.name == by_idx.name == "Sheet3"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 2

    columns = {"Month": [1.0, 2.0], "Year": [2019.0, 2020.0]}

    pandas_expected = pd.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pandas_expected)

    polars_expected = pl.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), polars_expected)
def test_sheets_with_custom_headers():
    """Caller-provided column_names are used when header_row is None."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
    assert reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]

    by_name = reader.load_sheet("Sheet2", header_row=None, column_names=["foo", "bar", "baz"])
    by_idx = reader.load_sheet(1, header_row=None, column_names=["foo", "bar", "baz"])

    # Metadata
    assert by_name.name == by_idx.name == "Sheet2"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 3

    columns = {"foo": [1.0, 2.0], "bar": [3.0, 4.0], "baz": [5.0, 6.0]}

    pandas_expected = pd.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pandas_expected)

    polars_expected = pl.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), polars_expected)
def test_sheets_with_skipping_headers():
    """A partial column_names list names the first column; the rest get generated names."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-changing-header-location.xlsx"))
    assert reader.sheet_names == ["Sheet1", "Sheet2", "Sheet3"]

    by_name = reader.load_sheet("Sheet2", header_row=None, column_names=["Bugs"])
    by_idx = reader.load_sheet(1, header_row=None, column_names=["Bugs"])

    # Metadata
    assert by_name.name == by_idx.name == "Sheet2"
    assert by_name.height == by_idx.height == 2
    assert by_name.width == by_idx.width == 3

    columns = {
        "Bugs": [1.0, 2.0],
        "__UNNAMED__1": [3.0, 4.0],
        "__UNNAMED__2": [5.0, 6.0],
    }

    pandas_expected = pd.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pd_assert_frame_equal(loaded.to_pandas(), pandas_expected)

    polars_expected = pl.DataFrame(columns)
    for loaded in (by_name, by_idx):
        pl_assert_frame_equal(loaded.to_polars(), polars_expected)
def test_sheet_with_pagination():
    """skip_rows=1 combined with n_rows=1 returns only the second data row."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    assert reader.sheet_names == ["Sheet1"]

    page = reader.load_sheet(0, skip_rows=1, n_rows=1)
    assert page.name == "Sheet1"
    assert page.height == 1
    assert page.total_height == 3
    assert page.width == 4

    pandas_dates = pd.Series([pd.Timestamp("2022-03-02 05:43:04")]).astype("datetime64[ms]")
    pandas_expected = pd.DataFrame(
        {
            "__UNNAMED__0": [1.0],
            "bools": [False],
            "dates": pandas_dates,
            "floats": [42.69],
        }
    )
    pd_assert_frame_equal(page.to_pandas(), pandas_expected)

    parse_dates = pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    polars_expected = pl.DataFrame(
        {
            "__UNNAMED__0": [1.0],
            "bools": [False],
            "dates": ["2022-03-02 05:43:04"],
            "floats": [42.69],
        }
    ).with_columns(parse_dates)
    pl_assert_frame_equal(page.to_polars(), polars_expected)
def test_sheet_with_skip_rows():
    """skip_rows=1 drops the first data row and keeps the remaining two."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    assert reader.sheet_names == ["Sheet1"]

    remaining = reader.load_sheet(0, skip_rows=1)
    assert remaining.name == "Sheet1"
    assert remaining.height == 2
    assert remaining.width == 4

    pandas_dates = pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 2).astype("datetime64[ms]")
    pandas_expected = pd.DataFrame(
        {
            "__UNNAMED__0": [1.0, 2.0],
            "bools": [False, True],
            "dates": pandas_dates,
            "floats": [42.69, 1234567],
        }
    )
    pd_assert_frame_equal(remaining.to_pandas(), pandas_expected)

    parse_dates = pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    polars_expected = pl.DataFrame(
        {
            "__UNNAMED__0": [1.0, 2.0],
            "bools": [False, True],
            "dates": ["2022-03-02 05:43:04"] * 2,
            "floats": [42.69, 1234567],
        }
    ).with_columns(parse_dates)
    pl_assert_frame_equal(remaining.to_polars(), polars_expected)
def test_sheet_with_n_rows():
    """n_rows=1 limits the loaded sheet to its first data row."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    assert reader.sheet_names == ["Sheet1"]

    head = reader.load_sheet(0, n_rows=1)
    assert head.name == "Sheet1"
    assert head.height == 1
    assert head.width == 4

    pandas_dates = pd.Series([pd.Timestamp("2022-03-02 05:43:04")]).astype("datetime64[ms]")
    pandas_expected = pd.DataFrame(
        {
            "__UNNAMED__0": [0.0],
            "bools": [True],
            "dates": pandas_dates,
            "floats": [12.35],
        }
    )
    pd_assert_frame_equal(head.to_pandas(), pandas_expected)

    parse_dates = pl.col("dates").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    polars_expected = pl.DataFrame(
        {
            "__UNNAMED__0": [0.0],
            "bools": [True],
            "dates": ["2022-03-02 05:43:04"],
            "floats": [12.35],
        }
    ).with_columns(parse_dates)
    pl_assert_frame_equal(head.to_polars(), polars_expected)
def test_sheet_with_pagination_and_without_headers():
    """Pagination combined with header_row=None and explicit column names yields one named row."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    assert reader.sheet_names == ["Sheet1"]

    page = reader.load_sheet(
        0,
        n_rows=1,
        skip_rows=1,
        header_row=None,
        column_names=["This", "Is", "Amazing", "Stuff"],
    )
    assert page.name == "Sheet1"
    assert page.height == 1
    assert page.width == 4

    pandas_dates = pd.Series([pd.Timestamp("2022-03-02 05:43:04")]).astype("datetime64[ms]")
    pandas_expected = pd.DataFrame(
        {
            "This": [0.0],
            "Is": [True],
            "Amazing": pandas_dates,
            "Stuff": [12.35],
        }
    )
    pd_assert_frame_equal(page.to_pandas(), pandas_expected)

    parse_dates = pl.col("Amazing").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    polars_expected = pl.DataFrame(
        {
            "This": [0.0],
            "Is": [True],
            "Amazing": ["2022-03-02 05:43:04"],
            "Stuff": [12.35],
        }
    ).with_columns(parse_dates)
    pl_assert_frame_equal(page.to_polars(), polars_expected)
def test_sheet_with_pagination_out_of_bound():
    """An oversized skip_rows is rejected, while an oversized n_rows returns the rows available."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))
    assert reader.sheet_names == ["Sheet1"]

    # Skipping far more rows than the sheet holds must raise.
    with pytest.raises(
        fastexcel.InvalidParametersError, match="Too many rows skipped. Max height is 4"
    ):
        reader.load_sheet(
            0,
            skip_rows=1000000,
            header_row=None,
            column_names=["This", "Is", "Amazing", "Stuff"],
        )

    # Requesting far more rows than the sheet holds is accepted.
    page = reader.load_sheet(
        0,
        n_rows=1000000,
        skip_rows=1,
        header_row=None,
        column_names=["This", "Is", "Amazing", "Stuff"],
    )
    assert page.name == "Sheet1"
    assert page.height == 3
    assert page.width == 4

    pandas_dates = pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype("datetime64[ms]")
    pandas_expected = pd.DataFrame(
        {
            "This": [0.0, 1.0, 2.0],
            "Is": [True, False, True],
            "Amazing": pandas_dates,
            "Stuff": [12.35, 42.69, 1234567],
        }
    )
    pd_assert_frame_equal(page.to_pandas(), pandas_expected)

    parse_dates = pl.col("Amazing").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
    polars_expected = pl.DataFrame(
        {
            "This": [0.0, 1.0, 2.0],
            "Is": [True, False, True],
            "Amazing": ["2022-03-02 05:43:04"] * 3,
            "Stuff": [12.35, 42.69, 1234567],
        }
    ).with_columns(parse_dates)
    pl_assert_frame_equal(page.to_polars(), polars_expected)
def test_sheet_with_na():
    """Cells holding #N/A are currently surfaced as null values."""
    loaded = fastexcel.read_excel(path_for_fixture("sheet-with-na.xlsx")).load_sheet(0)

    assert loaded.name == "Sheet1"
    assert loaded.height == loaded.total_height == 2
    assert loaded.width == 2

    columns = {
        "Title": ["A", "B"],
        "Amount": [None, 100.0],
    }
    pandas_expected = pd.DataFrame(columns)
    pd_assert_frame_equal(loaded.to_pandas(), pandas_expected)
    polars_expected = pl.DataFrame(columns)
    pl_assert_frame_equal(loaded.to_polars(), polars_expected)
def test_sheet_with_ref():
    """Cells holding #REF! are currently surfaced as null values."""
    loaded = fastexcel.read_excel(path_for_fixture("sheet-with-na.xlsx")).load_sheet("Broken refs")

    assert loaded.name == "Broken refs"
    assert loaded.height == loaded.total_height == 2
    assert loaded.width == 1

    columns = {"numbers": [1.0, None]}
    pandas_expected = pd.DataFrame(columns)
    pd_assert_frame_equal(loaded.to_pandas(), pandas_expected)
    polars_expected = pl.DataFrame(columns)
    pl_assert_frame_equal(loaded.to_polars(), polars_expected)
@pytest.mark.parametrize("excel_file", ["sheet-null-strings.xlsx", "sheet-null-strings-empty.xlsx"])
def test_null_strings(excel_file: str, expected_data_sheet_null_strings: dict[str, list[Any]]):
    """Both null-string fixtures load to the data provided by the shared fixture."""
    loaded = fastexcel.read_excel(path_for_fixture(excel_file)).load_sheet(0)
    assert loaded.height == loaded.total_height == 10
    assert loaded.width == 6

    # Both temporal columns are aligned on millisecond precision before comparing.
    temporal_columns = ("DATES_AND_NULLS", "TIMESTAMPS_AND_NULLS")

    pandas_expected = pd.DataFrame(expected_data_sheet_null_strings)
    for column in temporal_columns:
        pandas_expected[column] = pandas_expected[column].dt.as_unit("ms")
    pd_assert_frame_equal(loaded.to_pandas(), pandas_expected)

    polars_expected = pl.DataFrame(expected_data_sheet_null_strings).with_columns(
        *(pl.col(column).dt.cast_time_unit("ms") for column in temporal_columns)
    )
    pl_assert_frame_equal(loaded.to_polars(), polars_expected)
def test_null_values_in_cells() -> None:
    """The first two Date cells of the invalid-cell-value fixture are read as null."""
    loaded = fastexcel.read_excel(path_for_fixture("fixture-invalid-cell-value.xlsx")).load_sheet(0)

    columns = {
        "Title": ["A", "B", "C", "D"],
        "Date": [None, None, datetime(2021, 1, 1), datetime(2021, 5, 5)],
    }

    polars_expected = pl.DataFrame(columns).with_columns(pl.col("Date").dt.cast_time_unit("ms"))
    pl_assert_frame_equal(loaded.to_polars(), polars_expected)

    pandas_expected = pd.DataFrame(columns)
    pandas_expected["Date"] = pandas_expected["Date"].dt.as_unit("ms")
    pd_assert_frame_equal(loaded.to_pandas(), pandas_expected)
def test_invalid_value_num() -> None:
    """The second cell of the invalid-cell-value-num fixture is read as null."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-invalid-cell-value-num.xlsx"))
    loaded = reader.load_sheet(0)

    columns = {"Column": [8.0, None]}
    pandas_expected = pd.DataFrame(columns)
    pd_assert_frame_equal(loaded.to_pandas(), pandas_expected)
    polars_expected = pl.DataFrame(columns)
    pl_assert_frame_equal(loaded.to_polars(), polars_expected)
def test_null_column_is_nullable() -> None:
    """The arrow field for the `nullonly` column is flagged as nullable."""
    reader = fastexcel.read_excel(path_for_fixture("null-column.xlsx"))
    arrow_schema = reader.load_sheet(0).to_arrow().schema
    nullonly_field = arrow_schema.field("nullonly")
    assert nullonly_field.nullable is True
def test_sheet_with_decimal_numbers() -> None:
    """Decimal values are read as floats, and as their text form when the dtype is string."""
    fixture = path_for_fixture("decimal-numbers.xlsx")

    as_floats = fastexcel.read_excel(fixture).load_sheet(0)
    expected_floats = pl.DataFrame({"Decimals": [28.14, 29.02]})
    pl_assert_frame_equal(as_floats.to_polars(), expected_floats)

    # Same sheet, with column 0 forced to string.
    as_strings = fastexcel.read_excel(fixture).load_sheet(0, dtypes={0: "string"})
    expected_strings = pl.DataFrame({"Decimals": ["28.14", "29.02"]})
    pl_assert_frame_equal(as_strings.to_polars(), expected_strings)
@pytest.mark.parametrize(
    "header_row, skip_rows, expected",
    [
        (0, None, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),  # default
        (
            None,
            0,
            {
                "__UNNAMED__0": [None, None, "a", "b", "c", "d", "e", "f"],
                "__UNNAMED__1": [None, None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
            },
        ),
        (
            None,
            None,
            {
                "__UNNAMED__0": ["a", "b", "c", "d", "e", "f"],
                "__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
            },
        ),
        (
            0,
            0,
            {
                "__UNNAMED__0": [None, "a", "b", "c", "d", "e", "f"],
                "__UNNAMED__1": [None, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
            },
        ),
        (
            0,
            1,
            {
                "__UNNAMED__0": ["a", "b", "c", "d", "e", "f"],
                "__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
            },
        ),
        (
            None,
            2,
            {
                "__UNNAMED__0": ["a", "b", "c", "d", "e", "f"],
                "__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
            },
        ),
        (
            None,
            3,
            {"__UNNAMED__0": ["b", "c", "d", "e", "f"], "__UNNAMED__1": [1.0, 2.0, 3.0, 4.0, 5.0]},
        ),
        (
            1,
            0,
            {
                "__UNNAMED__0": ["a", "b", "c", "d", "e", "f"],
                "__UNNAMED__1": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0],
            },
        ),
        (2, 0, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),
        (2, None, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),
        (2, 1, {"a": ["c", "d", "e", "f"], "0": [2.0, 3.0, 4.0, 5.0]}),
        (2, [1, 3], {"a": ["b", "d", "f"], "0": [1.0, 3.0, 5.0]}),
        (2, [0], {"a": ["c", "d", "e", "f"], "0": [2.0, 3.0, 4.0, 5.0]}),
        (
            None,
            [2, 4],
            {
                "__UNNAMED__0": [None, None, "b", "d", "e", "f"],
                "__UNNAMED__1": [None, None, 1.0, 3.0, 4.0, 5.0],
            },
        ),
        (2, [], {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),
        (2, [0, 1, 2, 3], {"a": ["f"], "0": [5.0]}),
        (2, lambda x: x % 2 == 0, {"a": ["c", "e"], "0": [2.0, 4.0]}),
        (2, lambda x: x in [0, 4], {"a": ["c", "d", "e"], "0": [2.0, 3.0, 4.0]}),
        (2, lambda x: False, {"a": ["b", "c", "d", "e", "f"], "0": [1.0, 2.0, 3.0, 4.0, 5.0]}),
        (2, lambda x: x != 2, {"a": ["d"], "0": [3.0]}),
    ],
)
def test_header_row_and_skip_rows(
    header_row: int | None,
    skip_rows: int | list[int] | Callable[[int], bool] | None,
    expected: dict[str, Any],
) -> None:
    """Check how `header_row` and `skip_rows` combine on the `no-header.xlsx` fixture.

    `skip_rows` is exercised in every form used by the cases above: `None`, an integer,
    a list of row indices (possibly empty) and a predicate receiving a row index.
    """
    loaded = fastexcel.read_excel(path_for_fixture("no-header.xlsx")).load_sheet(
        0, header_row=header_row, skip_rows=skip_rows
    )
    pl_assert_frame_equal(loaded.to_polars(), pl.DataFrame(expected))
def test_null_bytes_in_column_names() -> None:
    """Regression test for https://github.com/ToucanToco/fastexcel/issues/343"""
    workbook = fastexcel.read_excel(path_for_fixture("null-bytes-in-columns-names.xls"))
    frame = workbook.load_sheet(0).to_polars()
    # (rows, columns)
    assert frame.shape == (8_763, 11)
================================================
FILE: python/tests/test_pycapsule.py
================================================
"""Tests for the Arrow PyCapsule Interface implementation."""
import fastexcel
import pandas as pd
import polars as pl
from .utils import path_for_fixture
def test_sheet_arrow_c_schema():
    """Test that __arrow_c_schema__ returns a valid PyCapsule."""
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    sheet = excel_reader.load_sheet("January")
    schema_capsule = sheet.__arrow_c_schema__()
    # Check the returned object really is a PyCapsule. The previous
    # `hasattr(..., "__class__")` check was true for every Python object and verified nothing.
    assert type(schema_capsule).__name__ == "PyCapsule"
    assert "PyCapsule" in str(type(schema_capsule))
def test_sheet_arrow_c_array():
    """A sheet's __arrow_c_array__ yields a (schema, array) pair of PyCapsules."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    january = reader.load_sheet("January")

    schema_capsule, array_capsule = january.__arrow_c_array__()

    # Both members of the pair must be PyCapsules
    schema_type = str(type(schema_capsule))
    assert "PyCapsule" in schema_type
    array_type = str(type(array_capsule))
    assert "PyCapsule" in array_type
def test_table_arrow_c_schema():
    """A table's __arrow_c_schema__ yields a PyCapsule."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
    first_table_name = reader.table_names()[0]  # Should be 'users'
    loaded_table = reader.load_table(first_table_name)

    schema_capsule = loaded_table.__arrow_c_schema__()

    # The result must be a PyCapsule
    capsule_type = str(type(schema_capsule))
    assert "PyCapsule" in capsule_type
def test_table_arrow_c_array():
    """A table's __arrow_c_array__ yields a (schema, array) pair of PyCapsules."""
    reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
    first_table_name = reader.table_names()[0]  # Should be 'users'
    loaded_table = reader.load_table(first_table_name)

    schema_capsule, array_capsule = loaded_table.__arrow_c_array__()

    # Both members of the pair must be PyCapsules
    schema_type = str(type(schema_capsule))
    assert "PyCapsule" in schema_type
    array_type = str(type(array_capsule))
    assert "PyCapsule" in array_type
def test_pycapsule_interface_with_requested_schema():
    """__arrow_c_array__ accepts an explicit requested_schema argument."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    january = reader.load_sheet("January")

    # Pass None explicitly (the current implementation ignores this argument)
    schema_capsule, array_capsule = january.__arrow_c_array__(None)

    schema_type = str(type(schema_capsule))
    assert "PyCapsule" in schema_type
    array_type = str(type(array_capsule))
    assert "PyCapsule" in array_type
def test_integration_with_polars():
    """polars can build a DataFrame directly from a loaded sheet."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    january = reader.load_sheet("January")

    # Handing the sheet itself to polars exercises the PyCapsule interface end to end.
    frame = pl.DataFrame(january)

    assert len(frame) == 2
    assert frame.columns == ["Month", "Year"]
def test_to_polars_without_pyarrow():
    """to_polars() is expected to go through the PyCapsule interface, without pyarrow.

    NOTE(review): nothing here removes or blocks pyarrow, so the test only checks that
    to_polars() returns a usable DataFrame.
    """
    # Sheet
    sheet_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    sheet_frame = sheet_reader.load_sheet("January").to_polars()
    assert isinstance(sheet_frame, pl.DataFrame)
    assert len(sheet_frame) == 2
    assert sheet_frame.columns == ["Month", "Year"]

    # Table
    table_reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
    first_table_name = table_reader.table_names()[0]
    table_frame = table_reader.load_table(first_table_name).to_polars()
    assert isinstance(table_frame, pl.DataFrame)
def test_to_pandas_still_requires_pyarrow():
    """to_pandas() currently still relies on pyarrow.

    Supporting pandas through the PyCapsule interface would mean implementing
    __dataframe__ or __arrow_c_stream__, which is not done at the moment.
    """
    # Sheet (pyarrow is still needed on this path for now)
    sheet_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
    sheet_frame = sheet_reader.load_sheet("January").to_pandas()
    assert isinstance(sheet_frame, pd.DataFrame)
    assert len(sheet_frame) == 2
    assert list(sheet_frame.columns) == ["Month", "Year"]

    # Table
    table_reader = fastexcel.read_excel(path_for_fixture("sheet-with-tables.xlsx"))
    first_table_name = table_reader.table_names()[0]
    table_frame = table_reader.load_table(first_table_name).to_pandas()
    assert isinstance(table_frame, pd.DataFrame)
================================================
FILE: python/tests/test_sheet_visibility.py
================================================
import fastexcel
from .utils import path_for_fixture
def test_sheet_visibilities() -> None:
    """Sheets 0, 1 and 2 report "visible", "hidden" and "veryhidden" respectively."""
    workbook = fastexcel.read_excel(
        path_for_fixture("fixture-sheets-different-visibilities.xlsx")
    )
    expected_by_index = ("visible", "hidden", "veryhidden")
    for sheet_index, expected_visibility in enumerate(expected_by_index):
        assert workbook.load_sheet(sheet_index).visible == expected_visibility
================================================
FILE: python/tests/test_shifted_data.py
================================================
import fastexcel
from .utils import path_for_fixture
def test_sheet_with_offset():
    """Columns of an offset sheet report a relative index and an absolute index starting at 7."""
    workbook = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
    loaded = workbook.load_sheet("without-table")

    # (name, dtype, column_name_from) for each column, left to right
    column_specs = [
        ("Column at H10", "float", "looked_up"),
        ("Column at I10", "float", "looked_up"),
        ("__UNNAMED__2", "string", "generated"),
        ("Column at K10", "float", "looked_up"),
    ]
    expected_columns = [
        fastexcel.ColumnInfo(
            name=name,
            index=position,
            absolute_index=7 + position,
            dtype=dtype,
            dtype_from="guessed",
            column_name_from=name_from,
        )
        for position, (name, dtype, name_from) in enumerate(column_specs)
    ]
    assert loaded.available_columns() == expected_columns
def test_table_with_offset():
    """Columns of an offset table report a relative index and an absolute index starting at 3."""
    workbook = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-offset.xlsx"))
    loaded = workbook.load_table("TableAtD5")

    expected_columns = [
        fastexcel.ColumnInfo(
            name=name,
            index=position,
            absolute_index=3 + position,
            dtype="float",
            dtype_from="guessed",
            column_name_from="provided",
        )
        for position, name in enumerate(("Column at D5", "Column at E5"))
    ]
    assert loaded.available_columns() == expected_columns
================================================
FILE: python/tests/test_tables.py
================================================
from datetime import datetime
import fastexcel
import pandas as pd
import polars as pl
import pytest
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from .utils import path_for_fixture
@pytest.mark.parametrize("path", ("sheet-with-tables.xlsx",))
def test_table_names(path: str) -> None:
    """Without arguments, table_names() lists the workbook's single table."""
    workbook = fastexcel.read_excel(path_for_fixture(path))
    assert workbook.table_names() == ["users"]
@pytest.mark.parametrize("path", ("sheet-with-tables.xlsx",))
def test_table_names_with_sheet_name(path: str) -> None:
    """table_names(sheet) only lists the tables of the requested sheet."""
    workbook = fastexcel.read_excel(path_for_fixture(path))

    names_on_sheet1 = workbook.table_names("sheet1")
    assert names_on_sheet1 == ["users"]

    names_on_sheet2 = workbook.table_names("sheet2")
    assert names_on_sheet2 == []
@pytest.mark.parametrize("path", ("sheet-with-tables.xlsx",))
def test_load_table(path: str) -> None:
    """The `users` table exposes the expected metadata and data, lazily and eagerly."""
    workbook = fastexcel.read_excel(path_for_fixture(path))
    users = workbook.load_table("users")

    assert users.name == "users"
    assert users.sheet_name == "sheet1"
    assert users.specified_dtypes is None

    # (name, dtype) for each column; relative and absolute indexes coincide in this table
    column_specs = [
        ("User Id", "float"),
        ("FirstName", "string"),
        ("LastName", "string"),
        ("Date", "datetime"),
    ]
    expected_columns = [
        fastexcel.ColumnInfo(
            name=name,
            index=position,
            absolute_index=position,
            dtype=dtype,
            dtype_from="guessed",
            column_name_from="provided",
        )
        for position, (name, dtype) in enumerate(column_specs)
    ]
    assert users.available_columns() == expected_columns

    assert users.total_height == 3
    assert users.offset == 0
    assert users.height == 3
    assert users.width == 4

    polars_expected = pl.DataFrame(
        {
            "User Id": [1.0, 2.0, 5.0],
            "FirstName": ["Peter", "John", "Hans"],
            "LastName": ["Müller", "Meier", "Fricker"],
            "Date": [datetime(2020, 1, 1), datetime(2024, 5, 4), datetime(2025, 2, 1)],
        }
    ).with_columns(pl.col("Date").dt.cast_time_unit("ms"))
    pl_assert_frame_equal(users.to_polars(), polars_expected)

    pandas_dates = pd.Series(
        [datetime(2020, 1, 1), datetime(2024, 5, 4), datetime(2025, 2, 1)]
    ).astype("datetime64[ms]")
    pandas_expected = pd.DataFrame(
        {
            "User Id": [1.0, 2.0, 5.0],
            "FirstName": ["Peter", "John", "Hans"],
            "LastName": ["Müller", "Meier", "Fricker"],
            "Date": pandas_dates,
        }
    )
    pd_assert_frame_equal(users.to_pandas(), pandas_expected)

    # Eager loading: the result is fed to pl.from_arrow and also converted with to_pandas().
    eager_users = workbook.load_table("users", eager=True)
    eager_polars = pl.from_arrow(eager_users)
    assert isinstance(eager_polars, pl.DataFrame)
    pl_assert_frame_equal(eager_polars, polars_expected)
    pd_assert_frame_equal(eager_users.to_pandas(), pandas_expected)
================================================
FILE: python/tests/test_whitespace.py
================================================
import datetime
import fastexcel
import polars as pl
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from .utils import path_for_fixture
def test_skip_tail_whitespace_rows() -> None:
    """skip_whitespace_tail_rows trims the trailing whitespace-only rows of sheets and tables."""
    workbook = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-whitespace.xlsx"))

    # Values of "Column Three" for the rows kept when the whitespace tail is skipped
    kept_timestamps = [
        datetime.datetime(2025, 11, 19, 14, 34, 2),
        datetime.datetime(2025, 11, 20, 14, 56, 34),
        datetime.datetime(2025, 11, 21, 15, 19, 6),
        None,
        datetime.datetime(2025, 11, 22, 15, 41, 38),
        datetime.datetime(2025, 11, 23, 16, 4, 10),
    ]
    to_milliseconds = pl.col("Column Three").dt.cast_time_unit("ms")

    # Expected data when the whitespace tail rows are NOT skipped
    untrimmed_expected = pl.DataFrame(
        {
            "Column One": ["1", "2", "3", None, "5", None, None, None, None, " "],
            "Column Two": ["one", "two", None, "four", "five", None, None, "", None, None],
            "Column Three": kept_timestamps + [None] * 4,
        }
    ).with_columns(to_milliseconds)

    # Expected data when the whitespace tail rows are skipped
    trimmed_expected = pl.DataFrame(
        {
            "Column One": [1.0, 2.0, 3.0, None, 5.0, None],
            "Column Two": ["one", "two", None, "four", "five", None],
            "Column Three": kept_timestamps,
        }
    ).with_columns(to_milliseconds)

    # Default behaviour: sheet, then table
    untrimmed_sheet = workbook.load_sheet("Without Table")
    pl_assert_frame_equal(untrimmed_sheet.to_polars(), untrimmed_expected)
    untrimmed_table = workbook.load_table("Table_with_whitespace")
    pl_assert_frame_equal(untrimmed_table.to_polars(), untrimmed_expected)

    # Skipping the whitespace tail: sheet, then table
    trimmed_sheet = workbook.load_sheet("Without Table", skip_whitespace_tail_rows=True)
    pl_assert_frame_equal(trimmed_sheet.to_polars(), trimmed_expected)
    trimmed_table = workbook.load_table("Table_with_whitespace", skip_whitespace_tail_rows=True)
    pl_assert_frame_equal(trimmed_table.to_polars(), trimmed_expected)

    # The pandas conversions must agree as well
    pd_assert_frame_equal(trimmed_sheet.to_pandas(), trimmed_expected.to_pandas())
    pd_assert_frame_equal(trimmed_table.to_pandas(), trimmed_expected.to_pandas())
def test_skip_tail_rows_and_whitespace_as_null_behavior() -> None:
    """Exercise ``whitespace_as_null`` alone and combined with ``skip_whitespace_tail_rows``.

    Every expectation is checked against both the "Without Table" sheet and the
    "Table_with_whitespace" table, first through polars and then through pandas.
    """
    reader = fastexcel.read_excel(path_for_fixture("sheet-and-table-with-whitespace.xlsx"))
    column_three_to_ms = pl.col("Column Three").dt.cast_time_unit("ms")

    # The first six rows are common to both expected frames
    leading_numbers = [1.0, 2.0, 3.0, None, 5.0, None]
    leading_words = ["one", "two", None, "four", "five", None]
    leading_timestamps = [
        datetime.datetime(2025, 11, 19, 14, 34, 2),
        datetime.datetime(2025, 11, 20, 14, 56, 34),
        datetime.datetime(2025, 11, 21, 15, 19, 6),
        None,
        datetime.datetime(2025, 11, 22, 15, 41, 38),
        datetime.datetime(2025, 11, 23, 16, 4, 10),
    ]
    trailing_nulls = [None, None, None, None]

    # Expected data with whitespace_as_null only: all ten rows are taken into account, but the
    # space in the last row of "Column One" and the empty string in the 8th row of
    # "Column Two" should be considered null
    nulled_full_height = pl.DataFrame(
        {
            "Column One": leading_numbers + trailing_nulls,
            "Column Two": leading_words + trailing_nulls,
            "Column Three": leading_timestamps + trailing_nulls,
        }
    ).with_columns(column_three_to_ms)

    # Expected data when whitespace is converted to null and tail rows are skipped
    trimmed = pl.DataFrame(
        {
            "Column One": leading_numbers,
            "Column Two": leading_words,
            "Column Three": leading_timestamps,
        }
    ).with_columns(column_three_to_ms)

    # whitespace_as_null without skipping tail rows: sheet, then table
    nulled_sheet = reader.load_sheet("Without Table", whitespace_as_null=True)
    pl_assert_frame_equal(nulled_sheet.to_polars(), nulled_full_height)
    nulled_table = reader.load_table("Table_with_whitespace", whitespace_as_null=True)
    pl_assert_frame_equal(nulled_table.to_polars(), nulled_full_height)

    # whitespace_as_null together with skip_whitespace_tail_rows: sheet, then table
    trimmed_sheet = reader.load_sheet(
        "Without Table", whitespace_as_null=True, skip_whitespace_tail_rows=True
    )
    pl_assert_frame_equal(trimmed_sheet.to_polars(), trimmed)
    trimmed_table = reader.load_table(
        "Table_with_whitespace", whitespace_as_null=True, skip_whitespace_tail_rows=True
    )
    pl_assert_frame_equal(trimmed_table.to_polars(), trimmed)

    # The pandas conversions must agree with the same expectations
    pandas_checks = (
        (trimmed_sheet, trimmed),
        (nulled_sheet, nulled_full_height),
        (trimmed_table, trimmed),
        (nulled_table, nulled_full_height),
    )
    for loaded, expected in pandas_checks:
        pd_assert_frame_equal(loaded.to_pandas(), expected.to_pandas())
================================================
FILE: python/tests/utils.py
================================================
from __future__ import annotations
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
def path_for_fixture(fixture_file: str) -> str:
    """Return the path of ``fixture_file`` inside the ``tests/fixtures`` directory.

    The directory is located three levels above this module's file.
    """
    base_dir = Path(__file__).parent.parent.parent
    fixture_path = base_dir.joinpath("tests", "fixtures", fixture_file)
    return str(fixture_path)
def get_expected_pandas_dtype(fastexcel_dtype: str) -> Any:
    """Return the pandas dtype expected for a column of the given fastexcel dtype.

    String columns depend on the installed pandas version: ``object`` dtype before 3.0,
    ``StringDtype`` with ``nan`` as NA value (as produced when converting from Arrow) from
    3.0 onwards. Date columns are always ``object`` dtype.

    Raises:
        ValueError: If ``fastexcel_dtype`` is not a known fastexcel dtype.
    """
    major_minor = tuple(map(int, pd.__version__.split(".")[:2]))

    # Dtypes that do not depend on the pandas version
    version_independent = {
        "int": np.dtype("int64"),
        "float": np.dtype("float64"),
        "boolean": np.dtype("bool"),
        "datetime": np.dtype("datetime64[ms]"),
        "duration": np.dtype("timedelta64[ms]"),
        # Date columns are always object dtype
        "date": np.dtype("object"),
    }
    if fastexcel_dtype in version_independent:
        return version_independent[fastexcel_dtype]

    if fastexcel_dtype != "string":
        raise ValueError(f"Unknown fastexcel dtype: {fastexcel_dtype}")

    if major_minor < (3, 0):
        return np.dtype("object")
    # When converting from Arrow, pandas uses nan as na_value
    return pd.StringDtype(na_value=np.nan)
def assert_pandas_dtypes(df: pd.DataFrame, expected_dtypes: dict[str, str]) -> None:
    """Check that each listed column of a pandas DataFrame has the expected dtype.

    Args:
        df: The pandas DataFrame to inspect
        expected_dtypes: Column names mapped to the fastexcel dtype string each should have

    Raises:
        AssertionError: On the first column whose dtype differs from the expected one.
    """
    for column, dtype_name in expected_dtypes.items():
        wanted = get_expected_pandas_dtype(dtype_name)
        found = df[column].dtype
        assert found == wanted, f"Column '{column}': expected dtype {wanted}, got {found}"
================================================
FILE: scripts/update_versions.py
================================================
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.9"
# dependencies = []
# ///
"""Manage docs/versions.json and generate the root docs/index.html redirect."""
from __future__ import annotations
import argparse
import json
import re
from pathlib import Path
def parse_semver(version: str) -> tuple[int, ...]:
    """Return every run of digits found in ``version`` as a tuple of ints.

    For instance ``'v0.19.0'`` gives ``(0, 19, 0)``; a string without digits gives ``()``.
    """
    digit_runs = re.findall(r"\d+", version)
    return tuple(map(int, digit_runs))
def sort_versions(versions: list[dict]) -> list[dict]:
    """Return ``versions`` ordered for display.

    Order: entries flagged ``stable`` first, then tagged versions from newest to oldest,
    and the ``latest`` entry last.

    Tags are compared on their numeric components, with missing trailing components counting
    as zero, so that ``v0.19.1`` is listed before ``v0.19``. Tags whose numeric components are
    equal are ordered by path.
    """

    def is_tag(v: dict) -> bool:
        return not v.get("stable") and v["path"] != "latest"

    # Pad every tag to the same number of components: negated tuples of different lengths
    # would otherwise put a prefix such as (0, 19) ahead of the newer (0, 19, 1).
    width = max((len(parse_semver(v["path"])) for v in versions if is_tag(v)), default=0)

    def sort_key(v: dict) -> tuple[int, tuple[int, ...], str]:
        path = v["path"]
        if v.get("stable"):
            return (0, (), "")
        if path == "latest":
            return (2, (), "")
        parts = parse_semver(path)
        padded = parts + (0,) * (width - len(parts))
        # Negate the components so that an ascending sort yields descending versions
        return (1, tuple(-x for x in padded), path)

    return sorted(versions, key=sort_key)
def update_versions(docs_dir: Path, version: str, *, stable: bool) -> None:
    """Register ``version`` in ``versions.json`` and regenerate the root ``index.html``.

    Args:
        docs_dir: Directory in which ``versions.json`` and ``index.html`` are read/written.
        version: Either ``"latest"`` or ``v`` followed by dot-separated numbers.
        stable: Whether this version is the stable one. When true, any entry previously
            flagged as stable loses the flag and gets its path as label.

    Raises:
        ValueError: If ``version`` does not have the expected format.
    """
    if not re.fullmatch(r"latest|v\d+(\.\d+)*", version):
        raise ValueError(f"Invalid version '{version}': must be 'latest' or match 'v' (e.g. v0.19.0)")
    versions_file = docs_dir / "versions.json"
    # Start from the existing entries when the file is there, from an empty list otherwise
    if versions_file.exists():
        versions = json.loads(versions_file.read_text())
    else:
        versions = []
    # Build label
    if version == "latest":
        label = "latest (main)"
    elif stable:
        label = f"{version} (stable)"
    else:
        label = version
    # Remove old entry for this version, and clear stable flag from others if
    # this one is now stable
    new_versions = []
    for v in versions:
        if v["path"] == version:
            continue
        if stable and v.get("stable"):
            # Copy rather than mutate the loaded entry
            v = {**v, "stable": False, "label": v["path"]}
        new_versions.append(v)
    new_versions.append({"label": label, "path": version, "stable": stable})
    new_versions = sort_versions(new_versions)
    versions_file.write_text(json.dumps(new_versions, indent=2) + "\n")
    # Generate root redirect
    # The redirect targets the stable entry if there is one, else the version just registered
    stable_entry = next((v for v in new_versions if v.get("stable")), None)
    redirect_path = stable_entry["path"] if stable_entry else version
    index_html = docs_dir / "index.html"
    index_html.write_text(
        f"""\
Redirecting to {redirect_path} documentation...
"""
    )
def main() -> None:
    """Read the command-line options and apply them through ``update_versions``."""
    cli = argparse.ArgumentParser(description="Update docs versions.json")
    cli.add_argument(
        "--version",
        required=True,
        help="Version name (e.g. v0.19.0 or latest)",
    )
    cli.add_argument(
        "--stable",
        action="store_true",
        help="Mark this version as the stable default",
    )
    cli.add_argument(
        "--docs-dir",
        default="docs",
        help="Path to the docs directory",
    )
    options = cli.parse_args()
    docs_dir = Path(options.docs_dir)
    update_versions(docs_dir, options.version, stable=options.stable)
# Only run the command-line interface when this file is executed as a script
if __name__ == "__main__":
    main()
================================================
FILE: src/data/cell_extractors.rs
================================================
use calamine::{CellType, DataType};
use chrono::{NaiveDate, NaiveDateTime, TimeDelta};
use crate::types::dtype::excel_float_to_string;
/// Reads a cell as a boolean.
///
/// A value returned by `get_bool` is used as-is. Otherwise, a value returned by `get_int` or,
/// failing that, `get_float` is considered `true` when it is non-zero. `None` is returned when
/// none of these three accessors yields a value.
pub(super) fn extract_boolean(cell: &DT) -> Option {
    if let Some(b) = cell.get_bool() {
        Some(b)
    } else if let Some(i) = cell.get_int() {
        Some(i != 0)
    }
    // clippy formats else if let Some(blah) = ... { Some(x) } else { None } to the .map form
    else {
        cell.get_float().map(|f| f != 0.0)
    }
}
/// Reads a cell as an integer by delegating to the cell's `as_i64` conversion.
pub(super) fn extract_int(cell: &DT) -> Option {
    cell.as_i64()
}
/// Reads a cell as a float by delegating to the cell's `as_f64` conversion.
pub(super) fn extract_float(cell: &DT) -> Option {
    cell.as_f64()
}
/// Reads a cell as a string, choosing the conversion according to the kind of cell.
///
/// - string cells: their content is copied
/// - datetime cells: converted with `as_datetime`, then rendered with `to_string`
/// - ISO datetime cells: their ISO text is copied
/// - boolean cells: rendered with `to_string`
/// - float cells: rendered with `excel_float_to_string`
/// - any other cell: whatever the cell's `as_string` conversion returns
pub(super) fn extract_string(cell: &DT) -> Option {
    if cell.is_string() {
        cell.get_string().map(str::to_string)
    } else if cell.is_datetime() {
        cell.get_datetime()
            .and_then(|dt| dt.as_datetime())
            .map(|dt| dt.to_string())
    } else if cell.is_datetime_iso() {
        cell.get_datetime_iso().map(str::to_string)
    } else if cell.is_bool() {
        cell.get_bool().map(|v| v.to_string())
    } else if cell.is_float() {
        cell.get_float().map(excel_float_to_string)
    } else {
        cell.as_string()
    }
}
/// Reads a cell as a date by delegating to the cell's `as_date` conversion.
pub(super) fn extract_date(cell: &DT) -> Option {
    cell.as_date()
}
/// The date 1970-01-01, used as the origin when counting days in `extract_date_as_num_days`.
/// Only compiled when the `python` feature is enabled.
#[cfg(feature = "python")]
const EPOCH: NaiveDate = NaiveDate::from_ymd_opt(1970, 1, 1).expect("Failed to create EPOCH");
/// Reads a cell as a date and returns the signed number of days between that date and `EPOCH`.
///
/// Returns `None` when `extract_date` yields no date, or when the number of days does not fit
/// in an `i32`.
#[cfg(feature = "python")]
pub(super) fn extract_date_as_num_days(cell: &DT) -> Option {
    extract_date(cell)
        .and_then(|date| i32::try_from(date.signed_duration_since(EPOCH).num_days()).ok())
}
/// Reads a cell as a datetime by delegating to the cell's `as_datetime` conversion.
pub(super) fn extract_datetime(cell: &DT) -> Option {
    cell.as_datetime()
}
/// Reads a cell as a datetime and returns it as a timestamp in milliseconds.
///
/// The datetime obtained from `extract_datetime` is interpreted as UTC (`and_utc`) before
/// being converted with `timestamp_millis`. Returns `None` when no datetime can be read.
#[cfg(feature = "python")]
pub(super) fn extract_datetime_as_timestamp_ms(cell: &DT) -> Option {
    extract_datetime(cell).map(|dt| dt.and_utc().timestamp_millis())
}
/// Reads a cell as a duration by delegating to the cell's `as_duration` conversion.
pub(super) fn extract_duration(cell: &DT) -> Option {
    cell.as_duration()
}
/// Reads a cell as a duration and returns it as a number of milliseconds
/// (`num_milliseconds`). Returns `None` when `extract_duration` yields no duration.
#[cfg(feature = "python")]
pub(super) fn extract_duration_as_ms(cell: &DT) -> Option {
    extract_duration(cell).map(|d| d.num_milliseconds())
}
================================================
FILE: src/data/mod.rs
================================================
mod cell_extractors;
#[cfg(feature = "python")]
mod python;
mod rust;
use chrono::{Duration, NaiveDate, NaiveDateTime};
#[cfg(feature = "python")]
pub(crate) use python::*;
use calamine::{CellType, Data as CalData, DataRef as CalDataRef, DataType, Range};
use crate::{
data::rust::{
create_boolean_vec, create_date_vec, create_datetime_vec, create_duration_vec,
create_float_vec, create_int_vec, create_string_vec,
},
error::{FastExcelErrorKind, FastExcelResult},
types::{
dtype::{DType, DTypeCoercion, get_dtype_for_column},
excelsheet::{SkipRows, column_info::ColumnInfo},
},
};
/// The cell data of a sheet, wrapping one of two flavours of calamine `Range`.
///
/// The methods of this type dispatch to the wrapped range, whichever the variant.
#[derive(Debug)]
pub(crate) enum ExcelSheetData<'r> {
    /// Built from a range through the first `From` implementation below.
    Owned(Range),
    /// Built from a range through the second, lifetime-bound `From` implementation below.
    Ref(Range>),
}
impl ExcelSheetData<'_> {
    /// Returns the width of the underlying range.
    pub(crate) fn width(&self) -> usize {
        match self {
            ExcelSheetData::Owned(range) => range.width(),
            ExcelSheetData::Ref(range) => range.width(),
        }
    }
    /// Returns the height of the underlying range.
    pub(crate) fn height(&self) -> usize {
        match self {
            ExcelSheetData::Owned(range) => range.height(),
            ExcelSheetData::Ref(range) => range.height(),
        }
    }
    /// Returns the cell at `pos` converted with `as_string`.
    ///
    /// Yields `None` when the range returns no cell for `pos`, or when the conversion itself
    /// yields nothing.
    pub(super) fn get_as_string(&self, pos: (usize, usize)) -> Option {
        match self {
            ExcelSheetData::Owned(range) => range.get(pos).and_then(|data| data.as_string()),
            ExcelSheetData::Ref(range) => range.get(pos).and_then(|data| data.as_string()),
        }
    }
    /// Determines the dtype of column `col` by forwarding every argument, along with the
    /// underlying range, to `get_dtype_for_column`.
    pub(crate) fn dtype_for_column(
        &self,
        start_row: usize,
        end_row: usize,
        col: usize,
        dtype_coercion: &DTypeCoercion,
        whitespace_as_null: bool,
    ) -> FastExcelResult {
        match self {
            ExcelSheetData::Owned(data) => get_dtype_for_column(
                data,
                start_row,
                end_row,
                col,
                dtype_coercion,
                whitespace_as_null,
            ),
            ExcelSheetData::Ref(data) => get_dtype_for_column(
                data,
                start_row,
                end_row,
                col,
                dtype_coercion,
                whitespace_as_null,
            ),
        }
    }
    /// Returns the height of the data once trailing whitespace-only rows are ignored, as
    /// computed by the free function `height_without_tail_whitespace`.
    ///
    /// Falls back to the full height of the range when that function returns `None`.
    pub(crate) fn height_without_tail_whitespace(&self) -> usize {
        match self {
            ExcelSheetData::Owned(data) => {
                height_without_tail_whitespace(data).unwrap_or_else(|| data.height())
            }
            ExcelSheetData::Ref(data) => {
                height_without_tail_whitespace(data).unwrap_or_else(|| data.height())
            }
        }
    }
    /// Returns the start position reported by the underlying range, with both components
    /// converted to `usize`, or `None` when the range reports no start.
    pub(crate) fn start(&self) -> Option<(usize, usize)> {
        let start = match self {
            ExcelSheetData::Owned(range) => range.start(),
            ExcelSheetData::Ref(range) => range.start(),
        };
        start.map(|(r, c)| (r as usize, c as usize))
    }
}
/// Wraps a range in the `Owned` variant.
impl From> for ExcelSheetData<'_> {
    fn from(range: Range) -> Self {
        Self::Owned(range)
    }
}
/// Wraps a range in the `Ref` variant; the resulting value is tied to the lifetime `'a`.
impl<'a> From>> for ExcelSheetData<'a> {
    fn from(range: Range>) -> Self {
        Self::Ref(range)
    }
}
/// Tells whether a cell should be regarded as whitespace.
trait CellIsWhiteSpace {
    /// Returns `true` when the cell counts as whitespace.
    fn is_whitespace(&self) -> bool;
}
/// Blanket implementation for every type implementing `DataType`.
impl CellIsWhiteSpace for T
where
    T: DataType,
{
    /// A cell is whitespace when it is empty, or when it is a string cell whose content is
    /// empty once trimmed. Any other cell is not whitespace.
    fn is_whitespace(&self) -> bool {
        if self.is_empty() {
            true
        } else if self.is_string()
            && let Some(s) = self.get_string()
        {
            s.trim().is_empty()
        } else {
            false
        }
    }
}
/// Computes the height of `data` once trailing whitespace-only rows are ignored.
///
/// Every column is scanned from its last row upwards until a cell that is not whitespace (see
/// `CellIsWhiteSpace`) is met, and the largest resulting height across columns is returned.
/// Positions for which `get` returns nothing are treated as whitespace.
///
/// Returns `Some(0)` when the range has no rows, and `None` when it has rows but no columns.
/// Row 0 is never tested, so the result is at least 1 for a range with at least one row and
/// one column.
pub(crate) fn height_without_tail_whitespace(
    data: &Range,
) -> Option {
    let height = data.height();
    let width = data.width();
    if height < 1 {
        return Some(0);
    }
    if width < 1 {
        return None;
    }
    (0..width)
        .map(|col_idx| {
            let mut row_idx = height - 1;
            // Start at the bottom of the column and work upwards until we find a non-empty cell
            while row_idx > 0
                && data
                    .get((row_idx, col_idx))
                    .map(CellIsWhiteSpace::is_whitespace)
                    .unwrap_or(true)
            {
                row_idx -= 1;
            }
            // Convert the index of the last kept row into a height
            row_idx + 1
        })
        .max()
}
/// A container for a typed vector of values. Used to represent a column of data in an Excel sheet.
/// These should only be used when you need to work on the raw data. Otherwise, you should use a
/// `FastExcelColumn`.
#[derive(Debug, Clone, PartialEq)]
pub enum FastExcelSeries {
Null,
Bool(Vec