Repository: nils-braun/dask-sql
Branch: main
Commit: 775b56fb8f99
Files: 251
Total size: 1.3 MB
Directory structure:
gitextract_q18bbzy0/
├── .cargo/
│ └── config.toml
├── .coveragerc
├── .dockerignore
├── .github/
│ ├── CODEOWNERS
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.md
│ │ ├── documentation-request.md
│ │ ├── feature_request.md
│ │ └── submit-question.md
│ ├── dependabot.yml
│ └── workflows/
│ ├── conda.yml
│ ├── docker.yml
│ ├── release.yml
│ ├── rust.yml
│ ├── style.yml
│ ├── test-upstream.yml
│ └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Cargo.toml
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── conftest.py
├── continuous_integration/
│ ├── docker/
│ │ ├── cloud.dockerfile
│ │ ├── conda.txt
│ │ └── main.dockerfile
│ ├── environment-3.10.yaml
│ ├── environment-3.11.yaml
│ ├── environment-3.12.yaml
│ ├── environment-3.9.yaml
│ ├── gpuci/
│ │ ├── environment-3.10.yaml
│ │ ├── environment-3.11.yaml
│ │ └── environment-3.9.yaml
│ ├── recipe/
│ │ ├── build.sh
│ │ ├── conda_build_config.yaml
│ │ ├── meta.yaml
│ │ └── run_test.py
│ └── scripts/
│ ├── startup_script.py
│ └── update-dependencies.sh
├── dask_sql/
│ ├── __init__.py
│ ├── _compat.py
│ ├── cmd.py
│ ├── config.py
│ ├── context.py
│ ├── datacontainer.py
│ ├── input_utils/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── convert.py
│ │ ├── dask.py
│ │ ├── hive.py
│ │ ├── intake.py
│ │ ├── location.py
│ │ ├── pandaslike.py
│ │ └── sqlalchemy.py
│ ├── integrations/
│ │ ├── __init__.py
│ │ ├── fugue.py
│ │ └── ipython.py
│ ├── mappings.py
│ ├── physical/
│ │ ├── __init__.py
│ │ ├── rel/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── convert.py
│ │ │ ├── custom/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── alter.py
│ │ │ │ ├── analyze_table.py
│ │ │ │ ├── create_catalog_schema.py
│ │ │ │ ├── create_experiment.py
│ │ │ │ ├── create_memory_table.py
│ │ │ │ ├── create_model.py
│ │ │ │ ├── create_table.py
│ │ │ │ ├── describe_model.py
│ │ │ │ ├── distributeby.py
│ │ │ │ ├── drop_model.py
│ │ │ │ ├── drop_schema.py
│ │ │ │ ├── drop_table.py
│ │ │ │ ├── export_model.py
│ │ │ │ ├── metrics.py
│ │ │ │ ├── predict_model.py
│ │ │ │ ├── show_columns.py
│ │ │ │ ├── show_models.py
│ │ │ │ ├── show_schemas.py
│ │ │ │ ├── show_tables.py
│ │ │ │ ├── use_schema.py
│ │ │ │ └── wrappers.py
│ │ │ └── logical/
│ │ │ ├── __init__.py
│ │ │ ├── aggregate.py
│ │ │ ├── cross_join.py
│ │ │ ├── empty.py
│ │ │ ├── explain.py
│ │ │ ├── filter.py
│ │ │ ├── join.py
│ │ │ ├── limit.py
│ │ │ ├── project.py
│ │ │ ├── sample.py
│ │ │ ├── sort.py
│ │ │ ├── subquery_alias.py
│ │ │ ├── table_scan.py
│ │ │ ├── union.py
│ │ │ ├── values.py
│ │ │ └── window.py
│ │ ├── rex/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── convert.py
│ │ │ └── core/
│ │ │ ├── __init__.py
│ │ │ ├── alias.py
│ │ │ ├── call.py
│ │ │ ├── input_ref.py
│ │ │ ├── literal.py
│ │ │ └── subquery.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── filter.py
│ │ ├── groupby.py
│ │ ├── ml_classes.py
│ │ ├── sort.py
│ │ └── statistics.py
│ ├── server/
│ │ ├── __init__.py
│ │ ├── app.py
│ │ ├── presto_jdbc.py
│ │ └── responses.py
│ ├── sql-schema.yaml
│ ├── sql.yaml
│ └── utils.py
├── docs/
│ ├── Makefile
│ ├── environment.yml
│ ├── make.bat
│ ├── requirements-docs.txt
│ └── source/
│ ├── api.rst
│ ├── best_practices.rst
│ ├── cmd.rst
│ ├── conf.py
│ ├── configuration.rst
│ ├── custom.rst
│ ├── data_input.rst
│ ├── fugue.rst
│ ├── how_does_it_work.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── machine_learning.rst
│ ├── quickstart.rst
│ ├── server.rst
│ ├── sql/
│ │ ├── creation.rst
│ │ ├── describe.rst
│ │ ├── ml.rst
│ │ └── select.rst
│ └── sql.rst
├── notebooks/
│ ├── Custom Functions.ipynb
│ ├── Feature Overview.ipynb
│ ├── FugueSQL.ipynb
│ └── iris.csv
├── pyproject.toml
├── rustfmt.toml
├── setup.cfg
├── src/
│ ├── dialect.rs
│ ├── error.rs
│ ├── expression.rs
│ ├── lib.rs
│ ├── parser.rs
│ ├── sql/
│ │ ├── column.rs
│ │ ├── exceptions.rs
│ │ ├── function.rs
│ │ ├── logical/
│ │ │ ├── aggregate.rs
│ │ │ ├── alter_schema.rs
│ │ │ ├── alter_table.rs
│ │ │ ├── analyze_table.rs
│ │ │ ├── create_catalog_schema.rs
│ │ │ ├── create_experiment.rs
│ │ │ ├── create_memory_table.rs
│ │ │ ├── create_model.rs
│ │ │ ├── create_table.rs
│ │ │ ├── describe_model.rs
│ │ │ ├── drop_model.rs
│ │ │ ├── drop_schema.rs
│ │ │ ├── drop_table.rs
│ │ │ ├── empty_relation.rs
│ │ │ ├── explain.rs
│ │ │ ├── export_model.rs
│ │ │ ├── filter.rs
│ │ │ ├── join.rs
│ │ │ ├── limit.rs
│ │ │ ├── predict_model.rs
│ │ │ ├── projection.rs
│ │ │ ├── repartition_by.rs
│ │ │ ├── show_columns.rs
│ │ │ ├── show_models.rs
│ │ │ ├── show_schemas.rs
│ │ │ ├── show_tables.rs
│ │ │ ├── sort.rs
│ │ │ ├── subquery_alias.rs
│ │ │ ├── table_scan.rs
│ │ │ ├── use_schema.rs
│ │ │ └── window.rs
│ │ ├── logical.rs
│ │ ├── optimizer/
│ │ │ ├── decorrelate_where_exists.rs
│ │ │ ├── decorrelate_where_in.rs
│ │ │ ├── dynamic_partition_pruning.rs
│ │ │ ├── join_reorder.rs
│ │ │ └── utils.rs
│ │ ├── optimizer.rs
│ │ ├── parser_utils.rs
│ │ ├── preoptimizer.rs
│ │ ├── schema.rs
│ │ ├── statement.rs
│ │ ├── table.rs
│ │ ├── types/
│ │ │ ├── rel_data_type.rs
│ │ │ └── rel_data_type_field.rs
│ │ └── types.rs
│ └── sql.rs
└── tests/
├── __init__.py
├── integration/
│ ├── __init__.py
│ ├── fixtures.py
│ ├── test_analyze.py
│ ├── test_cmd.py
│ ├── test_compatibility.py
│ ├── test_complex.py
│ ├── test_create.py
│ ├── test_distributeby.py
│ ├── test_explain.py
│ ├── test_filter.py
│ ├── test_fugue.py
│ ├── test_function.py
│ ├── test_groupby.py
│ ├── test_hive.py
│ ├── test_intake.py
│ ├── test_jdbc.py
│ ├── test_join.py
│ ├── test_model.py
│ ├── test_over.py
│ ├── test_postgres.py
│ ├── test_rex.py
│ ├── test_sample.py
│ ├── test_schema.py
│ ├── test_select.py
│ ├── test_server.py
│ ├── test_show.py
│ ├── test_sort.py
│ ├── test_sqlite.py
│ └── test_union.py
├── unit/
│ ├── __init__.py
│ ├── test_call.py
│ ├── test_config.py
│ ├── test_context.py
│ ├── test_datacontainer.py
│ ├── test_mapping.py
│ ├── test_ml_utils.py
│ ├── test_queries.py
│ ├── test_statistics.py
│ └── test_utils.py
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .cargo/config.toml
================================================
[target.x86_64-apple-darwin]
rustflags = [
"-C", "link-arg=-undefined",
"-C", "link-arg=dynamic_lookup",
]
[target.aarch64-apple-darwin]
rustflags = [
"-C", "link-arg=-undefined",
"-C", "link-arg=dynamic_lookup",
]
================================================
FILE: .coveragerc
================================================
[run]
omit = tests/*
branch = True
[report]
# Regexes for lines to exclude from consideration
exclude_lines =
# Have to re-enable the standard pragma
pragma: no cover
# Don't complain about missing debug-only code:
def __repr__
# Don't complain if tests don't hit defensive assertion code:
raise AssertionError
raise NotImplementedError
# Don't complain if non-runnable code isn't run:
if __name__ == .__main__.:
================================================
FILE: .dockerignore
================================================
node_modules
.next
================================================
FILE: .github/CODEOWNERS
================================================
# global codeowners
* @ayushdg @charlesbluca @galipremsagar
# rust codeowners
.cargo/ @ayushdg @charlesbluca @galipremsagar @jdye64
src/ @ayushdg @charlesbluca @galipremsagar @jdye64
Cargo.toml @ayushdg @charlesbluca @galipremsagar @jdye64
Cargo.lock @ayushdg @charlesbluca @galipremsagar @jdye64
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a bug report to help us improve dask-sql
title: "[BUG]"
labels: "bug, needs triage"
assignees: ''
---
**What happened**:
**What you expected to happen**:
**Minimal Complete Verifiable Example**:
```python
# Put your MCVE code here
```
**Anything else we need to know?**:
**Environment**:
- dask-sql version:
- Python version:
- Operating System:
- Install method (conda, pip, source):
================================================
FILE: .github/ISSUE_TEMPLATE/documentation-request.md
================================================
---
name: Documentation request
about: Report incorrect or needed documentation
title: "[DOC]"
labels: "documentation"
assignees: ''
---
## Report incorrect documentation
**Location of incorrect documentation**
Provide links and line numbers if applicable.
**Describe the problems or issues found in the documentation**
A clear and concise description of what you found to be incorrect.
**Steps taken to verify documentation is incorrect**
List any steps you have taken:
**Suggested fix for documentation**
Detail proposed changes to fix the documentation if you have any.
---
## Report needed documentation
**Report needed documentation**
A clear and concise description of what documentation you believe is needed and why.
**Describe the documentation you'd like**
A clear and concise description of what you want to happen.
**Steps taken to search for needed documentation**
List any steps you have taken:
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for dask-sql
title: "[ENH]"
labels: "enhancement, needs triage"
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I wish I could use dask-sql to do [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context, code examples, or references to existing implementations about the feature request here.
================================================
FILE: .github/ISSUE_TEMPLATE/submit-question.md
================================================
---
name: Submit question
about: Ask a general question about dask-sql
title: "[QST]"
labels: "question"
assignees: ''
---
**What is your question?**
================================================
FILE: .github/dependabot.yml
================================================
version: 2
updates:
- package-ecosystem: "cargo"
directory: "/"
schedule:
interval: "daily"
ignore:
# arrow and datafusion are bumped manually
- dependency-name: "arrow"
update-types: ["version-update:semver-major"]
- dependency-name: "datafusion"
update-types: ["version-update:semver-major"]
- dependency-name: "datafusion-*"
update-types: ["version-update:semver-major"]
- package-ecosystem: "github-actions"
directory: "/"
schedule:
# Check for updates to GitHub Actions once a week
interval: "weekly"
ignore:
# prefer updating cibuildwheel manually as needed
- dependency-name: "pypa/cibuildwheel"
================================================
FILE: .github/workflows/conda.yml
================================================
name: Build conda nightly
on:
push:
branches:
- main
pull_request:
paths:
- Cargo.toml
- Cargo.lock
- pyproject.toml
- continuous_integration/recipe/**
- .github/workflows/conda.yml
schedule:
- cron: '0 0 * * 0'
# When this workflow is queued, automatically cancel any previous running
# or pending jobs from the same branch
concurrency:
group: conda-${{ github.head_ref }}
cancel-in-progress: true
# Required shell entrypoint to have properly activated conda environments
defaults:
run:
shell: bash -l {0}
jobs:
conda:
name: "Build conda nightlies (python: ${{ matrix.python }}, arch: ${{ matrix.arch }})"
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python: ["3.9", "3.10", "3.11", "3.12"]
arch: ["linux-64", "linux-aarch64"]
steps:
- name: Manage disk space
if: matrix.arch == 'linux-aarch64'
run: |
sudo mkdir -p /opt/empty_dir || true
for d in \
/opt/ghc \
/opt/hostedtoolcache \
/usr/lib/jvm \
/usr/local/.ghcup \
/usr/local/lib/android \
/usr/local/share/powershell \
/usr/share/dotnet \
/usr/share/swift \
; do
sudo rsync --stats -a --delete /opt/empty_dir/ $d || true
done
sudo apt-get purge -y -f firefox \
google-chrome-stable \
microsoft-edge-stable
sudo apt-get autoremove -y >& /dev/null
sudo apt-get autoclean -y >& /dev/null
sudo docker image prune --all --force
df -h
- name: Create swapfile
if: matrix.arch == 'linux-aarch64'
run: |
sudo fallocate -l 10GiB /swapfile || true
sudo chmod 600 /swapfile || true
sudo mkswap /swapfile || true
sudo swapon /swapfile || true
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python
uses: conda-incubator/setup-miniconda@v2.3.0
with:
miniforge-variant: Mambaforge
use-mamba: true
python-version: "3.9"
channel-priority: strict
- name: Install dependencies
run: |
mamba install -c conda-forge "boa<0.17" "conda-build<24.1" conda-verify
which python
pip list
mamba list
- name: Build conda packages
run: |
# suffix for nightly package versions
export VERSION_SUFFIX=a`date +%y%m%d`
conda mambabuild continuous_integration/recipe \
--python ${{ matrix.python }} \
--variants "{target_platform: [${{ matrix.arch }}]}" \
--error-overlinking \
--no-test \
--no-anaconda-upload \
--output-folder packages
- name: Test conda packages
if: matrix.arch == 'linux-64' # can only test native platform packages
run: |
conda mambabuild --test packages/${{ matrix.arch }}/*.tar.bz2
- name: Upload conda packages as artifacts
uses: actions/upload-artifact@v3
with:
name: "conda nightlies (python - ${{ matrix.python }}, arch - ${{ matrix.arch }})"
# upload the whole output folder (including the conda channel metadata) so the packages can be installed locally
path: packages/
- name: Upload conda packages to Anaconda
if: |
github.event_name == 'push'
&& github.repository == 'dask-contrib/dask-sql'
env:
ANACONDA_API_TOKEN: ${{ secrets.DASK_CONDA_TOKEN }}
run: |
# install anaconda for upload
mamba install -c conda-forge anaconda-client
anaconda upload --label dev packages/${{ matrix.arch }}/*.tar.bz2
================================================
FILE: .github/workflows/docker.yml
================================================
name: Build Docker image
on:
release:
types: [created]
push:
branches:
- main
pull_request:
paths:
- Cargo.toml
- Cargo.lock
- pyproject.toml
- continuous_integration/docker/**
- .github/workflows/docker.yml
# When this workflow is queued, automatically cancel any previous running
# or pending jobs from the same branch
concurrency:
group: docker-${{ github.ref }}
cancel-in-progress: true
jobs:
push_to_registry:
name: Push Docker image to Docker Hub
runs-on: ubuntu-latest
env:
DOCKER_PUSH: ${{ contains(fromJSON('["push", "release"]'), github.event_name) && github.repository == 'dask-contrib/dask-sql' }}
strategy:
fail-fast: false
matrix:
platform: ["linux/amd64", "linux/arm64", "linux/386"]
steps:
- uses: actions/checkout@v4
- name: Login to DockerHub
if: ${{ fromJSON(env.DOCKER_PUSH) }}
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: Docker meta for main image
id: docker_meta_main
uses: crazy-max/ghaction-docker-meta@v5
with:
images: nbraun/dask-sql
- name: Build and push main image
uses: docker/build-push-action@v5
with:
context: .
file: ./continuous_integration/docker/main.dockerfile
build-args: DOCKER_META_VERSION=${{ steps.docker_meta_main.outputs.version }}
platforms: ${{ matrix.platform }}
tags: ${{ steps.docker_meta_main.outputs.tags }}
labels: ${{ steps.docker_meta_main.outputs.labels }}
push: ${{ fromJSON(env.DOCKER_PUSH) }}
load: ${{ !fromJSON(env.DOCKER_PUSH) }}
- name: Check images
run: |
df -h
docker image ls
docker image inspect ${{ steps.docker_meta_main.outputs.tags }}
- name: Docker meta for cloud image
id: docker_meta_cloud
uses: crazy-max/ghaction-docker-meta@v5
with:
images: nbraun/dask-sql-cloud
- name: Build and push cloud image
uses: docker/build-push-action@v5
with:
context: .
file: ./continuous_integration/docker/cloud.dockerfile
build-args: DOCKER_META_VERSION=${{ steps.docker_meta_main.outputs.version }}
platforms: ${{ matrix.platform }}
tags: ${{ steps.docker_meta_cloud.outputs.tags }}
labels: ${{ steps.docker_meta_cloud.outputs.labels }}
push: ${{ fromJSON(env.DOCKER_PUSH) }}
load: ${{ !fromJSON(env.DOCKER_PUSH) }}
================================================
FILE: .github/workflows/release.yml
================================================
name: Upload Python package
on:
release:
types: [created]
pull_request:
paths:
- .github/workflows/release.yml
- dask_sql/__init__.py
# When this workflow is queued, automatically cancel any previous running
# or pending jobs from the same branch
concurrency:
group: release-${{ github.head_ref }}
cancel-in-progress: true
env:
upload: ${{ github.event_name == 'release' && github.repository == 'dask-contrib/dask-sql' }}
jobs:
linux:
name: Build and publish wheels for linux ${{ matrix.target }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
target: [x86_64, aarch64]
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Build wheels for x86_64
if: matrix.target == 'x86_64'
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.target }}
args: --release --out dist
sccache: 'true'
manylinux: '2_17'
- name: Build wheels for aarch64
if: matrix.target == 'aarch64'
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.target }}
args: --release --out dist --zig
sccache: 'true'
manylinux: '2_17'
- name: Check dist files
run: |
pip install twine
twine check dist/*
ls -lh dist/
- name: Upload binary wheels
uses: actions/upload-artifact@v3
with:
name: wheels for linux ${{ matrix.target }}
path: dist/*
- name: Publish package
if: env.upload == 'true'
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: twine upload dist/*
windows:
name: Build and publish wheels for windows
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: '3.10'
architecture: x64
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: x64
args: --release --out dist
sccache: 'true'
- name: Check dist files
run: |
pip install twine
twine check dist/*
ls dist/
- name: Upload binary wheels
uses: actions/upload-artifact@v3
with:
name: wheels for windows
path: dist/*
- name: Publish package
if: env.upload == 'true'
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: twine upload dist/*
macos:
name: Build and publish wheels for macos ${{ matrix.target }}
runs-on: macos-latest
strategy:
fail-fast: false
matrix:
target: [x86_64, aarch64]
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.target }}
args: --release --out dist
sccache: 'true'
- name: Check dist files
run: |
pip install twine
twine check dist/*
ls -lh dist/
- name: Upload binary wheels
uses: actions/upload-artifact@v3
with:
name: wheels for macos ${{ matrix.target }}
path: dist/*
- name: Publish package
if: env.upload == 'true'
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: twine upload dist/*
sdist:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Build sdist
uses: PyO3/maturin-action@v1
with:
command: sdist
args: --out dist
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Check dist files
run: |
pip install twine
twine check dist/*
ls -lh dist/
- name: Publish source distribution
if: env.upload == 'true'
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: twine upload dist/*
================================================
FILE: .github/workflows/rust.yml
================================================
name: Test Rust package
on:
# always trigger on pushes to main and on PRs
push:
branches:
- main
pull_request:
# manual trigger
# https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
workflow_dispatch:
env:
# Disable full debug symbol generation to speed up CI build and keep memory down
# "1" means line tables only, which is useful for panic tracebacks.
RUSTFLAGS: "-C debuginfo=1"
jobs:
detect-ci-trigger:
name: Check for upstream trigger phrase
runs-on: ubuntu-latest
if: github.repository == 'dask-contrib/dask-sql'
outputs:
triggered: ${{ steps.detect-trigger.outputs.trigger-found }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 2
- uses: xarray-contrib/ci-trigger@v1.2
id: detect-trigger
with:
keyword: "[test-df-upstream]"
# Check crate compiles
linux-build-lib:
name: cargo check
needs: [detect-ci-trigger]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.72
default: true
- name: Cache Cargo
uses: actions/cache@v3
with:
path: /home/runner/.cargo
key: cargo-cache
- name: Optionally update upstream dependencies
if: needs.detect-ci-trigger.outputs.triggered == 'true'
run: |
bash continuous_integration/scripts/update-dependencies.sh
- name: Check workspace in debug mode
run: |
cargo check
- name: Check workspace in release mode
run: |
cargo check --release
# test the crate
linux-test:
name: cargo test
needs: [detect-ci-trigger]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
submodules: true
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.72
default: true
- name: Cache Cargo
uses: actions/cache@v3
with:
path: /home/runner/.cargo
key: cargo-cache
- name: Optionally update upstream dependencies
if: needs.detect-ci-trigger.outputs.triggered == 'true'
run: |
bash continuous_integration/scripts/update-dependencies.sh
- name: Run tests
run: |
cargo test
================================================
FILE: .github/workflows/style.yml
================================================
---
name: Python style check
on: [pull_request]
# When this workflow is queued, automatically cancel any previous running
# or pending jobs from the same branch
concurrency:
group: style-${{ github.head_ref }}
cancel-in-progress: true
jobs:
pre-commit:
name: Run pre-commit hooks
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.72
components: clippy
default: true
- uses: actions-rs/toolchain@v1
with:
toolchain: nightly
components: rustfmt
- uses: pre-commit/action@v3.0.0
================================================
FILE: .github/workflows/test-upstream.yml
================================================
name: Nightly upstream testing
on:
schedule:
- cron: "0 0 * * *" # Daily “At 00:00” UTC
workflow_dispatch: # allows you to trigger the workflow run manually
# Required shell entrypoint to have properly activated conda environments
defaults:
run:
shell: bash -l {0}
jobs:
test-dev:
name: "Test upstream dev (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }}, query-planning: ${{ matrix.query-planning }})"
runs-on: ${{ matrix.os }}
env:
CONDA_FILE: continuous_integration/environment-${{ matrix.python }}.yaml
DASK_SQL_DISTRIBUTED_TESTS: ${{ matrix.distributed }}
DASK_DATAFRAME__QUERY_PLANNING: ${{ matrix.query-planning }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python: ["3.9", "3.10", "3.11", "3.12"]
distributed: [false]
query-planning: [true]
include:
# run tests on a distributed client
- os: "ubuntu-latest"
python: "3.9"
distributed: true
query-planning: true
- os: "ubuntu-latest"
python: "3.11"
distributed: true
query-planning: true
# run tests with query planning disabled
- os: "ubuntu-latest"
python: "3.9"
distributed: false
query-planning: false
- os: "ubuntu-latest"
python: "3.11"
distributed: false
query-planning: false
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # Fetch all history for all branches and tags.
- name: Set up Python
uses: conda-incubator/setup-miniconda@v2.3.0
with:
miniforge-variant: Mambaforge
use-mamba: true
python-version: ${{ matrix.python }}
channel-priority: strict
activate-environment: dask-sql
environment-file: ${{ env.CONDA_FILE }}
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.72
default: true
- name: Install x86_64-apple-darwin target
if: matrix.os == 'macos-latest'
run: rustup target add x86_64-apple-darwin
- name: Build the Rust DataFusion bindings
run: |
maturin develop
- name: Install hive testing dependencies
if: matrix.os == 'ubuntu-latest'
run: |
docker pull bde2020/hive:2.3.2-postgresql-metastore
docker pull bde2020/hive-metastore-postgresql:2.3.0
- name: Install upstream dev Dask
run: |
mamba install --no-channel-priority dask/label/dev::dask
- name: Install pytest-reportlog
run: |
# TODO: add pytest-reportlog to testing environments if we move over to JSONL output
mamba install pytest-reportlog
- name: Test with pytest
id: run_tests
run: |
pytest --report-log test-${{ matrix.os }}-py${{ matrix.python }}-results.jsonl --cov-report=xml -n auto tests --dist loadfile
- name: Upload pytest results for failure
if: |
always()
&& steps.run_tests.outcome != 'skipped'
uses: actions/upload-artifact@v3
with:
name: test-${{ matrix.os }}-py${{ matrix.python }}-results
path: test-${{ matrix.os }}-py${{ matrix.python }}-results.jsonl
import-dev:
name: "Test importing with bare requirements and upstream dev (query-planning: ${{ matrix.query-planning }})"
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
query-planning: [true, false]
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: conda-incubator/setup-miniconda@v2.3.0
with:
miniforge-variant: Mambaforge
use-mamba: true
python-version: "3.9"
channel-priority: strict
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.72
default: true
- name: Install dependencies and nothing else
run: |
pip install -e . -vv
which python
pip list
mamba list
- name: Install upstream dev Dask
run: |
python -m pip install git+https://github.com/dask/dask
python -m pip install git+https://github.com/dask/dask-expr
python -m pip install git+https://github.com/dask/distributed
- name: Try to import dask-sql
env:
DASK_DATAFRAME__QUERY_PLANNING: ${{ matrix.query-planning }}
run: |
python -c "import dask_sql; print('ok')"
report-failures:
name: Open issue for upstream dev failures
needs: [test-dev, import-dev]
if: |
always()
&& (
needs.test-dev.result == 'failure'
|| needs.import-dev.result == 'failure'
)
&& github.repository == 'dask-contrib/dask-sql'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/download-artifact@v3
- name: Prepare logs & issue label
run: |
# TODO: remove this if xarray-contrib/issue-from-pytest-log no longer needs a log-path
if [ -f test-ubuntu-latest-py3.10-results/test-ubuntu-latest-py3.10-results.jsonl ]; then
cp test-ubuntu-latest-py3.10-results/test-ubuntu-latest-py3.10-results.jsonl results.jsonl
else
touch results.jsonl
fi
- name: Open or update issue on failure
uses: xarray-contrib/issue-from-pytest-log@v1.2.6
with:
log-path: results.jsonl
issue-title: ⚠️ Upstream CI failed ⚠️
issue-label: upstream
================================================
FILE: .github/workflows/test.yml
================================================
name: Test Python package
on:
push:
branches:
- main
pull_request:
# When this workflow is queued, automatically cancel any previous running
# or pending jobs from the same branch
concurrency:
group: test-${{ github.head_ref }}
cancel-in-progress: true
# Required shell entrypoint to have properly activated conda environments
defaults:
run:
shell: bash -l {0}
jobs:
detect-ci-trigger:
name: Check for upstream trigger phrase
runs-on: ubuntu-latest
if: github.repository == 'dask-contrib/dask-sql'
outputs:
triggered: ${{ steps.detect-trigger.outputs.trigger-found }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 2
- uses: xarray-contrib/ci-trigger@v1.2
id: detect-trigger
with:
keyword: "[test-upstream]"
test:
name: "Build & Test (${{ matrix.os }}, python: ${{ matrix.python }}, distributed: ${{ matrix.distributed }}, query-planning: ${{ matrix.query-planning }})"
needs: [detect-ci-trigger]
runs-on: ${{ matrix.os }}
env:
CONDA_FILE: continuous_integration/environment-${{ matrix.python }}.yaml
DASK_SQL_DISTRIBUTED_TESTS: ${{ matrix.distributed }}
DASK_DATAFRAME__QUERY_PLANNING: ${{ matrix.query-planning }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python: ["3.9", "3.10", "3.11", "3.12"]
distributed: [false]
query-planning: [true]
include:
# run tests on a distributed client
- os: "ubuntu-latest"
python: "3.9"
distributed: true
query-planning: true
- os: "ubuntu-latest"
python: "3.11"
distributed: true
query-planning: true
# run tests with query planning disabled
- os: "ubuntu-latest"
python: "3.9"
distributed: false
query-planning: false
- os: "ubuntu-latest"
python: "3.11"
distributed: false
query-planning: false
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: conda-incubator/setup-miniconda@v2.3.0
with:
miniforge-variant: Mambaforge
use-mamba: true
python-version: ${{ matrix.python }}
channel-priority: strict
activate-environment: dask-sql
environment-file: ${{ env.CONDA_FILE }}
run-post: ${{ matrix.os != 'windows-latest' && 'true' || 'false' }}
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.72
default: true
- name: Install x86_64-apple-darwin target
if: matrix.os == 'macos-latest'
run: rustup target add x86_64-apple-darwin
- name: Build the Rust DataFusion bindings
run: |
maturin develop
- name: Install hive testing dependencies
if: matrix.os == 'ubuntu-latest'
run: |
docker pull bde2020/hive:2.3.2-postgresql-metastore
docker pull bde2020/hive-metastore-postgresql:2.3.0
- name: Optionally install upstream dev Dask
if: needs.detect-ci-trigger.outputs.triggered == 'true'
run: |
mamba install --no-channel-priority dask/label/dev::dask
- name: Test with pytest
run: |
pytest --junitxml=junit/test-results.xml --cov-report=xml -n auto tests --dist loadfile
- name: Upload pytest test results
if: always()
uses: actions/upload-artifact@v3
with:
name: pytest-results
path: junit/test-results.xml
- name: Upload coverage to Codecov
if: github.repository == 'dask-contrib/dask-sql'
uses: codecov/codecov-action@v3
import:
name: "Test importing with bare requirements (query-planning: ${{ matrix.query-planning }})"
needs: [detect-ci-trigger]
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
query-planning: [true, false]
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: conda-incubator/setup-miniconda@v2.3.0
with:
miniforge-variant: Mambaforge
use-mamba: true
python-version: "3.9"
channel-priority: strict
- uses: actions-rs/toolchain@v1
with:
toolchain: 1.72
default: true
- name: Install dependencies and nothing else
run: |
pip install -e . -vv
which python
pip list
mamba list
- name: Optionally install upstream dev Dask
if: needs.detect-ci-trigger.outputs.triggered == 'true'
run: |
python -m pip install git+https://github.com/dask/dask
python -m pip install git+https://github.com/dask/dask-expr
python -m pip install git+https://github.com/dask/distributed
- name: Try to import dask-sql
env:
DASK_DATAFRAME__QUERY_PLANNING: ${{ matrix.query-planning }}
run: |
python -c "import dask_sql; print('ok')"
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
*.so
# Unit test / coverage reports
htmlcov/
.coverage
.coverage.*
.cache
coverage.xml
*.cover
.pytest_cache/
.hypothesis/
.pytest-html
# Jupyter Notebook
.ipynb_checkpoints
# environments
conda-env
env
venv
# IDE
.idea
.vscode
*.swp
# project specific
dask-worker-space/
node_modules/
docs/source/_build/
tests/unit/queries
tests/unit/data
target/*
packages/*
# Ignore development specific local testing files
dev_tests
dev-tests
================================================
FILE: .pre-commit-config.yaml
================================================
# pre-commit hook configuration: Python formatting/linting, Rust checks,
# generic file hygiene, and a local nightly `cargo fmt` hook.
repos:
# Python code formatter
- repo: https://github.com/psf/black
rev: 22.10.0
hooks:
- id: black
language_version: python3
# Python linter
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
language_version: python3
# Import sorting, run with the black-compatible profile
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
args:
- "--profile"
- "black"
# Rust checks against the top-level Cargo.toml; clippy is passed `-D warnings`
- repo: https://github.com/doublify/pre-commit-rust
rev: v1.0
hooks:
- id: cargo-check
args: ['--manifest-path', './Cargo.toml', '--verbose', '--']
- id: clippy
args: ['--manifest-path', './Cargo.toml', '--verbose', '--', '-D', 'warnings']
# Generic file hygiene hooks; check-yaml skips the conda recipe directory
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.2.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
exclude: ^continuous_integration/recipe/
- id: check-added-large-files
# Local hook: runs `cargo +nightly fmt` from the system installation on Rust files
- repo: local
hooks:
- id: cargo-fmt
name: cargo fmt
description: Format files with cargo fmt.
entry: cargo +nightly fmt
language: system
types: [rust]
args: ['--manifest-path', './Cargo.toml', '--verbose', '--']
================================================
FILE: .readthedocs.yaml
================================================
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
build:
os: ubuntu-20.04
tools:
python: "mambaforge-4.10"
sphinx:
configuration: docs/source/conf.py
conda:
environment: docs/environment.yml
python:
install:
- method: pip
path: .
================================================
FILE: CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at nilslennartbraun@gmail.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to Dask-SQL
## Environment Setup
The environment used for development and CI consists of:
- a system installation of [`rustup`](https://rustup.rs/) with:
- the latest stable toolchain
- the latest nightly `rustfmt`
- a [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) environment containing all required Python packages
Once `rustup` is installed, ensure that the latest stable toolchain and nightly `rustfmt` are available by running
```
rustup toolchain install nightly -c rustfmt --profile minimal
rustup update
```
To initialize and activate the conda environment for a given Python version:
```
conda env create -f dask-sql/continuous_integration/environment-${PYTHON_VER}.yaml
conda activate dask-sql
```
## Rust Developers Guide
Dask-SQL utilizes [Apache Arrow Datafusion](https://github.com/apache/arrow-datafusion) for parsing, planning, and optimizing SQL queries. DataFusion is written in Rust and therefore requires some Rust experience to be productive. Luckily, there are tons of great Rust learning resources on the internet. We have listed some of our favorite ones [here](#rust-learning-resources)
### Apache Arrow DataFusion
The Dask-SQL Rust codebase makes heavy use of [Apache Arrow DataFusion](https://github.com/apache/arrow-datafusion). Contributors should familiarize themselves with the [codebase](https://github.com/apache/arrow-datafusion) and [documentation](https://docs.rs/datafusion/latest/datafusion/).
#### Purpose
DataFusion provides Dask-SQL with key functionality.
- Parsing SQL query strings into a `LogicalPlan` datastructure
- Future integration points with [substrait.io](https://substrait.io/)
- An optimization framework used as the baseline for creating custom highly efficient `LogicalPlan`s specific to Dask.
### Building
Building the Dask-SQL Rust codebase is a straightforward process. If you create and activate the Dask-SQL Conda environment, the Rust compiler and all necessary components will be installed for you during that process, so no further manual setup is required.
`maturin` is used by Dask-SQL for building and bundling the resulting Rust binaries. This helps make building and installing the Rust binaries feel much more like a native Python workflow.
More details about the building setup can be found in [pyproject.toml](pyproject.toml) and [Cargo.toml](Cargo.toml)
Note that while `maturin` is used by CI and should be used during your development cycle, if the need arises to do something more specific that is not yet supported by `maturin` you can opt to use `cargo` directly from the command line.
#### Building with Python
Building Dask-SQL is straightforward with Python. To build run ```pip install .```. This will build both the Rust and Python codebase and install it into your locally activated conda environment; note that if your Rust dependencies have been updated, this command must be rerun to rebuild the Rust codebase.
#### DataFusion Modules
DataFusion is broken down into a few modules. We consume those modules in our [Cargo.toml](Cargo.toml). The modules that we use currently are
- `datafusion-common` - Datastructures and core logic
- `datafusion-expr` - Expression based logic and operators
- `datafusion-sql` - SQL components such as parsing and planning
- `datafusion-optimizer` - Optimization logic and datastructures for modifying current plans into more efficient ones.
#### Retrieving Upstream Dependencies
During development you might find yourself needing some upstream DataFusion changes not present in the projects current version. Luckily this can easily be achieved by updating [Cargo.toml](Cargo.toml) and changing the `rev` to the SHA of the version you need. Note that the same SHA should be used for all DataFusion modules.
#### Local Documentation
Sometimes when building against the latest Github commits for DataFusion you may find that the features you are consuming do not have their documentation public yet. In this case it can be helpful to build the DataFusion documentation locally so that it can be referenced to assist with development. Here is a rough outline for building that documentation locally.
- clone https://github.com/apache/arrow-datafusion
- change into the `arrow-datafusion` directory
- run `cargo doc`
- navigate to `target/doc/datafusion/all.html` and open in your desired browser
### Datastructures
While working in the Rust codebase there are a few datastructures that you should make yourself familiar with. This section does not aim to exhaustively list all of the datastructures within the project, but rather just the key datastructures that you are likely to encounter while working on almost any feature/issue. The aim is to give you a better overview of the codebase without having to manually dig through all the source code.
- [`PyLogicalPlan`](src/sql/logical.rs) -> [DataFusion LogicalPlan](https://docs.rs/datafusion/latest/datafusion/logical_plan/enum.LogicalPlan.html)
- Often encountered in Python code with variable name `rel`
- Python serializable umbrella representation of the entire LogicalPlan that was generated by DataFusion
- Provides access to `DaskTable` instances and type information for each table
- Access to individual nodes in the logical plan tree. Ex: `TableScan`
- [`DaskSQLContext`](src/sql.rs)
- Analogous to Python `Context`
- Contains metadata about the tables, schemas, functions, operators, and configurations that are present within the current execution context
- When adding custom functions/UDFs this is the location that you would register them
- Entry point for parsing SQL strings to sql node trees. This is the location Python will begin its interactions with Rust
- [`PyExpr`](src/expression.rs) -> [DataFusion Expr](https://docs.rs/datafusion/latest/datafusion/prelude/enum.Expr.html)
- Arguably where most of your time will be spent
- Represents a single node in sql tree. Ex: `avg(age)` from `SELECT avg(age) FROM people`
- Is associated with a single `RexType`
- Can contain literal values or represent function calls, `avg()` for example
- The expressions "index" in the tree can be retrieved by calling `PyExpr.index()` on an instance. This is useful when mapping frontend column names in Dask code to backend Dataframe columns
- Certain `PyExpr`s contain operands. Ex: `2 + 2` would contain 3 operands. 1) A literal `PyExpr` instance with value 2 2) Another literal `PyExpr` instance with a value of 2. 3) A `+` `PyExpr` representing the addition of the 2 literals.
- [`DaskSqlOptimizer`](src/sql/optimizer.rs)
- Registering location for all Dask-SQL specific logical plan optimizations
- Optimizations, whether custom-written or taken from another source such as DataFusion, are registered here in the order in which they should be executed
- Represents functions that modify/convert an original `PyLogicalPlan` into another `PyLogicalPlan` that would be more efficient when running in the underlying Dask framework
- [`RelDataType`](src/sql/types/rel_data_type.rs)
- Not a fan of this name, was chosen to match existing Calcite logic
- Represents a "row" in a table
- Contains a list of "columns" that are present in that row
- [RelDataTypeField](src/sql/types/rel_data_type_field.rs)
- [RelDataTypeField](src/sql/types/rel_data_type_field.rs)
- Represents an individual column in a table
- Contains:
- `qualifier` - schema the field belongs to
- `name` - name of the column/field
- `data_type` - `DaskTypeMap` instance containing information about the SQL type and underlying Arrow DataType
- `index` - location of the field in the LogicalPlan
- [DaskTypeMap](src/sql/types.rs)
- Maps a conventional SQL type to an underlying Arrow DataType
### Rust Learning Resources
- ["The Book"](https://doc.rust-lang.org/book/)
- [Lets Get Rusty "LGR" YouTube series](https://www.youtube.com/c/LetsGetRusty)
## Documentation TODO
- [ ] SQL Parsing overview diagram
- [ ] Architecture diagram
- [x] Setup dev environment
- [x] Version of Rust and specs
- [x] Updating version of datafusion
- [x] Building
- [x] Rust learning resources
- [x] Rust Datastructures local to Dask-SQL
- [x] Build DataFusion documentation locally
- [ ] Python & Rust with PyO3
- [ ] Types mapping, Arrow datatypes
- [ ] RexTypes explanation, show simple query and show it broken down into its parts in a diagram
- [ ] Registering tables with DaskSqlContext, also functions
- [ ] Creating your own optimizer
- [ ] Simple diagram of PyExpr, showing something like 2+2 but broken down into a tree looking diagram
================================================
FILE: Cargo.toml
================================================
# Cargo manifest for the Rust extension module that exposes the DataFusion
# bindings used by Dask-SQL.
[package]
name = "dask-sql"
repository = "https://github.com/dask-contrib/dask-sql"
version = "2024.5.0"
description = "Bindings for DataFusion used by Dask-SQL"
readme = "README.md"
# Must agree with LICENSE.txt (shipped via `include` below), which contains the MIT license text.
license = "MIT"
edition = "2021"
rust-version = "1.72"
include = ["/src", "/dask_sql", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", "Cargo.lock"]

[dependencies]
async-trait = "0.1.78"
# Git dependencies are pinned with `rev` (or `branch` / `tag`). Cargo does not
# recognise a `ref` key: it is reported as an unused manifest key and the
# dependency silently follows the repository's default branch instead.
# NOTE: Cargo.lock may need to be regenerated after changing this pin.
datafusion-python = { git = "https://github.com/apache/arrow-datafusion-python.git", rev = "da6c183" }
env_logger = "0.11"
log = "^0.4"
pyo3 = { version = "0.19.2", features = ["extension-module", "abi3", "abi3-py39"] }
pyo3-log = "0.9.0"

[build-dependencies]
pyo3-build-config = "0.20.3"

[lib]
name = "dask_sql"
# cdylib produces the Python extension module; rlib keeps the crate usable from Rust.
crate-type = ["cdylib", "rlib"]

[profile.release]
lto = true
codegen-units = 1
================================================
FILE: LICENSE.txt
================================================
MIT LICENCE
Copyright (c) 2020 Nils Braun
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: MANIFEST.in
================================================
recursive-include dask_sql *.yaml
recursive-include dask_planner *
================================================
FILE: README.md
================================================
**Dask-SQL is currently not in active maintenance, see [#1344](https://github.com/dask-contrib/dask-sql/issues/1344) for more information**
[Conda](https://anaconda.org/conda-forge/dask-sql)
[PyPI](https://pypi.python.org/pypi/dask-sql/)
[Tests](https://github.com/dask-contrib/dask-sql/actions/workflows/test.yml?query=branch%3Amain)
[Documentation](https://dask-sql.readthedocs.io/en/latest/)
[Coverage](https://codecov.io/gh/dask-contrib/dask-sql)
[License](https://github.com/dask-contrib/dask-sql/blob/main/LICENSE.txt)
[Binder](https://mybinder.org/v2/gh/dask-contrib/dask-sql-binder/main?urlpath=lab)
`dask-sql` is a distributed SQL query engine in Python.
It allows you to query and transform your data using a mixture of
common SQL operations and Python code and also scale up the calculation easily
if you need it.
* **Combine the power of Python and SQL**: load your data with Python, transform it with SQL, enhance it with Python and query it with SQL - or the other way round.
With `dask-sql` you can mix the well known Python dataframe API of `pandas` and `Dask` with common SQL operations, to
process your data in exactly the way that is easiest for you.
* **Infinite Scaling**: using the power of the great `Dask` ecosystem, your computations can scale as you need it - from your laptop to your super cluster - without changing any line of SQL code. From k8s to cloud deployments, from batch systems to YARN - if `Dask` [supports it](https://docs.dask.org/en/latest/setup.html), so will `dask-sql`.
* **Your data - your queries**: Use Python user-defined functions (UDFs) in SQL without any performance drawback and extend your SQL queries with the large number of Python libraries, e.g. machine learning, different complicated input formats, complex statistics.
* **Easy to install and maintain**: `dask-sql` is just a pip/conda install away (or a docker run if you prefer).
* **Use SQL from wherever you like**: `dask-sql` integrates with your jupyter notebook, your normal Python module or can be used as a standalone SQL server from any BI tool. It even integrates natively with [Apache Hue](https://gethue.com/).
* **GPU Support**: `dask-sql` supports running SQL queries on CUDA-enabled GPUs by utilizing [RAPIDS](https://rapids.ai) libraries like [`cuDF`](https://github.com/rapidsai/cudf), enabling accelerated compute for SQL.
Read more in the [documentation](https://dask-sql.readthedocs.io/en/latest/).
---
## Example
For this example, we use some data loaded from disk and query them with a SQL command from our python code.
Any pandas or dask dataframe can be used as input and ``dask-sql`` understands a large amount of formats (csv, parquet, json,...) and locations (s3, hdfs, gcs,...).
```python
import dask.dataframe as dd
from dask_sql import Context
# Create a context to hold the registered tables
c = Context()
# Load the data and register it in the context
# This will give the table a name, that we can use in queries
df = dd.read_csv("...")
c.create_table("my_data", df)
# Now execute a SQL query. With return_futures=False the result is computed and returned as a pandas dataframe.
result = c.sql("""
SELECT
my_data.name,
SUM(my_data.x)
FROM
my_data
GROUP BY
my_data.name
""", return_futures=False)
# Show the result
print(result)
```
## Quickstart
Have a look into the [documentation](https://dask-sql.readthedocs.io/en/latest/) or start the example notebook on [binder](https://mybinder.org/v2/gh/dask-contrib/dask-sql-binder/main?urlpath=lab).
> `dask-sql` is currently under development and does not yet understand all SQL commands (but a large fraction).
We are actively looking for feedback, improvements and contributors!
## Installation
`dask-sql` can be installed via `conda` (preferred) or `pip` - or in a development environment.
### With `conda`
Create a new conda environment or use your already present environment:
conda create -n dask-sql
conda activate dask-sql
Install the package from the `conda-forge` channel:
conda install dask-sql -c conda-forge
### With `pip`
You can install the package with
pip install dask-sql
### For development
If you want to have the newest (unreleased) `dask-sql` version or if you plan to do development on `dask-sql`, you can also install the package from sources.
git clone https://github.com/dask-contrib/dask-sql.git
Create a new conda environment and install the development environment:
conda env create -f continuous_integration/environment-3.9.yaml
It is not recommended to use `pip` instead of `conda` for the environment setup.
After that, you can install the package in development mode
pip install -e ".[dev]"
The Rust DataFusion bindings are built as part of the `pip install`.
Note that if changes are made to the Rust source in `src/`, another build must be run to recompile the bindings.
This repository uses [pre-commit](https://pre-commit.com/) hooks. To install them, call
pre-commit install
## Testing
You can run the tests (after installation) with
pytest tests
GPU-specific tests require additional dependencies specified in the `continuous_integration/gpuci/environment-<python version>.yaml` files (for example `environment-3.9.yaml`).
These can be added to the development environment by running
```
conda env update -n dask-sql -f continuous_integration/gpuci/environment-3.9.yaml
```
And GPU-specific tests can be run with
```
pytest tests -m gpu --rungpu
```
## SQL Server
`dask-sql` comes with a small test implementation for a SQL server.
Instead of rebuilding a full ODBC driver, we re-use the [presto wire protocol](https://github.com/prestodb/presto/wiki/HTTP-Protocol).
It is - so far - only a start of the development and missing important concepts, such as
authentication.
You can test the sql presto server by running (after installation)
dask-sql-server
or by using the created docker image
docker run --rm -it -p 8080:8080 nbraun/dask-sql
in one terminal. This will spin up a server on port 8080 (by default)
that looks similar to a normal presto database to any presto client.
You can test this for example with the default [presto client](https://prestosql.io/docs/current/installation/cli.html):
presto --server localhost:8080
Now you can fire simple SQL queries (as no data is loaded by default):
=> SELECT 1 + 1;
EXPR$0
--------
2
(1 row)
You can find more information in the [documentation](https://dask-sql.readthedocs.io/en/latest/pages/server.html).
## CLI
You can also run the CLI `dask-sql` for testing out SQL commands quickly:
dask-sql --load-test-data --startup
(dask-sql) > SELECT * FROM timeseries LIMIT 10;
## How does it work?
At the core, `dask-sql` does two things:
- translate the SQL query using [DataFusion](https://arrow.apache.org/datafusion) into a relational algebra, which is represented as a logical query plan - similar to many other SQL engines (Hive, Flink, ...)
- convert this description of the query into dask API calls (and execute them) - returning a dask dataframe.
For the first step, Arrow DataFusion needs to know about the columns and types of the dask dataframes, therefore some Rust code to store this information for dask dataframes is defined in `dask_planner`.
After the translation to a relational algebra is done (using `DaskSQLContext.logical_relational_algebra`), the python methods defined in `dask_sql.physical` turn this into a physical dask execution plan by converting each piece of the relational algebra one-by-one.
================================================
FILE: conftest.py
================================================
import dask
import pytest
pytest_plugins = ["tests.integration.fixtures"]
def pytest_addoption(parser):
    """Register the custom command line options of the test suite.

    Two boolean switches (``--rungpu``, ``--runqueries``) and two
    value options (``--data_dir``, ``--queries_dir``) are added to
    ``parser``, in that order.
    """
    # Boolean switches: present on the command line -> True
    switches = (
        ("--rungpu", "run tests meant for GPU"),
        ("--runqueries", "run test queries"),
    )
    for flag, description in switches:
        parser.addoption(flag, action="store_true", help=description)

    # Options that carry a value (a file path)
    path_options = (
        ("--data_dir", "specify file path to the data"),
        ("--queries_dir", "specify file path to the queries"),
    )
    for flag, description in path_options:
        parser.addoption(flag, help=description)
def pytest_runtest_setup(item):
    """Apply dask configuration and skip rules for the given test item.

    Tests carrying the ``gpu`` keyword are skipped unless ``--rungpu`` was
    given; when they do run, cudf decimal support is enabled. Tests carrying
    the ``queries`` keyword are skipped unless ``--runqueries`` was given.
    """
    # TODO: get pyarrow strings and p2p shuffle working
    dask.config.set({"dataframe.convert-string": False})
    dask.config.set({"dataframe.shuffle.method": "tasks"})

    is_gpu_test = "gpu" in item.keywords
    if is_gpu_test and not item.config.getoption("--rungpu"):
        pytest.skip("need --rungpu option to run")
    if is_gpu_test:
        # manually enable cudf decimal support
        dask.config.set({"sql.mappings.decimal_support": "cudf"})

    is_query_test = "queries" in item.keywords
    if is_query_test and not item.config.getoption("--runqueries"):
        pytest.skip("need --runqueries option to run")
@pytest.fixture(scope="session")
def data_dir(request):
    """Session-scoped fixture returning the value of the ``--data_dir`` option."""
    data_path = request.config.getoption("--data_dir")
    return data_path
@pytest.fixture(scope="session")
def queries_dir(request):
    """Session-scoped fixture returning the value of the ``--queries_dir`` option."""
    queries_path = request.config.getoption("--queries_dir")
    return queries_path
================================================
FILE: continuous_integration/docker/cloud.dockerfile
================================================
# Image built on top of the dask-sql image, adding cloud-related packages.
# The base image tag is supplied through the DOCKER_META_VERSION build argument.
ARG DOCKER_META_VERSION
FROM nbraun/dask-sql:${DOCKER_META_VERSION}
# Install s3fs and dask-cloudprovider with mamba (already installed packages are
# frozen), install awscli with pip, then clean the conda caches.
RUN conda config --add channels conda-forge \
&& /opt/conda/bin/mamba install --freeze-installed -y \
s3fs \
dask-cloudprovider \
&& pip install awscli \
&& conda clean -ay
# Run prepare.sh under tini; -g makes tini forward signals to the child's process group.
ENTRYPOINT ["tini", "-g", "--", "/usr/bin/prepare.sh"]
================================================
FILE: continuous_integration/docker/conda.txt
================================================
python>=3.9
dask>=2024.4.1
pandas>=1.4.0
jpype1>=1.0.2
openjdk>=8
maven>=3.6.0
pytest>=6.0.2
pytest-cov>=2.10.1
pytest-xdist
mock>=4.0.3
sphinx>=3.2.1
tzlocal>=2.1
fastapi>=0.92.0
httpx>=0.24.1
uvicorn>=0.14
pyarrow>=14.0.1
prompt_toolkit>=3.0.8
pygments>=2.7.1
scikit-learn>=1.0.0
intake>=0.6.0
pre-commit>=2.11.1
black=22.10.0
isort=5.12.0
maturin>=1.3,<1.4
================================================
FILE: continuous_integration/docker/main.dockerfile
================================================
# Dockerfile for dask-sql running the SQL server
# For more information, see https://dask-sql.readthedocs.io/.
FROM daskdev/dask:latest
# Author label with the contact address restored (the "<...>" part had been lost,
# leaving only a trailing space inside the quotes).
LABEL author="Nils Braun <nilslennartbraun@gmail.com>"

# Install rustc & gcc for compilation of DataFusion planner
ADD https://sh.rustup.rs /rustup-init.sh
RUN sh /rustup-init.sh -y --default-toolchain=stable --profile=minimal \
    && apt-get update \
    && apt-get install gcc -y
# Make cargo/rustc installed by rustup available to the following build steps
ENV PATH="/root/.cargo/bin:${PATH}"

# Install conda dependencies for dask-sql
COPY continuous_integration/docker/conda.txt /opt/dask_sql/
RUN mamba install -y \
    # build requirements
    "maturin>=1.3,<1.4" \
    # core dependencies
    "dask>=2024.4.1" \
    "pandas>=1.4.0" \
    "fastapi>=0.92.0" \
    "httpx>=0.24.1" \
    "uvicorn>=0.14" \
    "tzlocal>=2.1" \
    "prompt_toolkit>=3.0.8" \
    "pygments>=2.7.1" \
    tabulate \
    # additional dependencies
    "pyarrow>=14.0.1" \
    "scikit-learn>=1.0.0" \
    "intake>=0.6.0" \
    && conda clean -ay

# install dask-sql
COPY Cargo.toml /opt/dask_sql/
COPY Cargo.lock /opt/dask_sql/
COPY pyproject.toml /opt/dask_sql/
COPY setup.cfg /opt/dask_sql/
COPY README.md /opt/dask_sql/
COPY .git /opt/dask_sql/.git
COPY src /opt/dask_sql/src
COPY dask_sql /opt/dask_sql/dask_sql
# Build the Rust bindings and install the package into the conda environment
RUN cd /opt/dask_sql/ \
    && CONDA_PREFIX="/opt/conda/" maturin develop

# Set the script to execute
COPY continuous_integration/scripts/startup_script.py /opt/dask_sql/startup_script.py
EXPOSE 8080
ENTRYPOINT [ "/usr/bin/prepare.sh", "/opt/conda/bin/python", "/opt/dask_sql/startup_script.py" ]
================================================
FILE: continuous_integration/environment-3.10.yaml
================================================
name: dask-sql
channels:
- conda-forge
dependencies:
- c-compiler
- dask>=2024.4.1
- dask-expr>=1.0.11
- docker-py>=7.1.0
- fastapi>=0.92.0
- fugue>=0.7.3
- httpx>=0.24.1
- intake>=0.6.0
- jsonschema
- lightgbm
- maturin>=1.3,<1.4
- mlflow>=2.10
- mock
- numpy>=1.22.4
- pandas>=2
- pre-commit
- prompt_toolkit>=3.0.8
- psycopg2
- pyarrow>=14.0.1
- pygments>=2.7.1
- pyhive
- pytest-cov
- pytest-rerunfailures
- pytest-xdist
- pytest
- python=3.10
- py-xgboost>=2.0.3
- scikit-learn>=1.0.0
- sphinx
- sqlalchemy
- tpot>=0.12.0
# FIXME: https://github.com/fugue-project/fugue/issues/526
- triad<0.9.2
- tzlocal>=2.1
- uvicorn>=0.14
- zlib
================================================
FILE: continuous_integration/environment-3.11.yaml
================================================
name: dask-sql
channels:
- conda-forge
dependencies:
- c-compiler
- dask>=2024.4.1
- dask-expr>=1.0.11
- docker-py>=7.1.0
- fastapi>=0.92.0
- fugue>=0.7.3
- httpx>=0.24.1
- intake>=0.6.0
- jsonschema
- lightgbm
- maturin>=1.3,<1.4
- mlflow>=2.10
- mock
- numpy>=1.22.4
- pandas>=2
- pre-commit
- prompt_toolkit>=3.0.8
- psycopg2
- pyarrow>=14.0.1
- pygments>=2.7.1
- pyhive
- pytest-cov
- pytest-rerunfailures
- pytest-xdist
- pytest
- python=3.11
- py-xgboost>=2.0.3
- scikit-learn>=1.0.0
- sphinx
- sqlalchemy
- tpot>=0.12.0
# FIXME: https://github.com/fugue-project/fugue/issues/526
- triad<0.9.2
- tzlocal>=2.1
- uvicorn>=0.14
- zlib
================================================
FILE: continuous_integration/environment-3.12.yaml
================================================
name: dask-sql
channels:
- conda-forge
dependencies:
- c-compiler
- dask>=2024.4.1
- dask-expr>=1.0.11
- docker-py>=7.1.0
- fastapi>=0.92.0
- fugue>=0.7.3
- httpx>=0.24.1
- intake>=0.6.0
- jsonschema
- lightgbm
- maturin>=1.3,<1.4
# TODO: add once mlflow 3.12 builds are available
# - mlflow>=2.10
- mock
- numpy>=1.22.4
- pandas>=2
- pre-commit
- prompt_toolkit>=3.0.8
- psycopg2
- pyarrow>=14.0.1
- pygments>=2.7.1
- pyhive
- pytest-cov
- pytest-rerunfailures
- pytest-xdist
- pytest
- python=3.12
- py-xgboost>=2.0.3
- scikit-learn>=1.0.0
- sphinx
- sqlalchemy
# TODO: add once tpot supports python 3.12
# - tpot>=0.12.0
# FIXME: https://github.com/fugue-project/fugue/issues/526
- triad<0.9.2
- tzlocal>=2.1
- uvicorn>=0.14
- zlib
================================================
FILE: continuous_integration/environment-3.9.yaml
================================================
name: dask-sql-py39
channels:
- conda-forge
dependencies:
- c-compiler
- dask=2024.4.1
- dask-expr=1.0.11
- docker-py>=7.1.0
- fastapi=0.92.0
- fugue=0.7.3
- httpx=0.24.1
- intake=0.6.0
- jsonschema
- lightgbm
- maturin=1.3
- mlflow=2.10
- mock
- numpy=1.22.4
- pandas=2
- pre-commit
- prompt_toolkit=3.0.8
- psycopg2
- pyarrow=14.0.1
- pygments=2.7.1
- pyhive
- pytest-cov
- pytest-rerunfailures
- pytest-xdist
- pytest
- python=3.9
- py-xgboost=2.0.3
- scikit-learn=1.0.0
- sphinx
- sqlalchemy
- tpot>=0.12.0
# FIXME: https://github.com/fugue-project/fugue/issues/526
- triad<0.9.2
- tzlocal=2.1
- uvicorn=0.14
- zlib
================================================
FILE: continuous_integration/gpuci/environment-3.10.yaml
================================================
name: dask-sql
channels:
- rapidsai
- rapidsai-nightly
- dask/label/dev
- conda-forge
- nvidia
- nodefaults
dependencies:
- c-compiler
- zlib
- dask>=2024.4.1
- dask-expr>=1.0.11
- fastapi>=0.92.0
- fugue>=0.7.3
- httpx>=0.24.1
- intake>=0.6.0
- jsonschema
- lightgbm
- maturin>=1.3,<1.4
- mock
- numpy>=1.22.4
- pandas>=2
- pre-commit
- prompt_toolkit>=3.0.8
- psycopg2
- pyarrow>=14.0.1
- pygments>=2.7.1
- pyhive
- pytest-cov
- pytest-rerunfailures
- pytest-xdist
- pytest
- python=3.10
- py-xgboost>=2.0.3
- scikit-learn>=1.0.0
- sphinx
- sqlalchemy
- tpot>=0.12.0
# FIXME: https://github.com/fugue-project/fugue/issues/526
- triad<0.9.2
- tzlocal>=2.1
- uvicorn>=0.14
# GPU-specific requirements
- cudatoolkit=11.8
- cudf=24.06
- cuml=24.06
- dask-cudf=24.06
- dask-cuda=24.06
- ucx-proc=*=gpu
- ucx-py=0.38
- xgboost=*=rapidsai_py*
- libxgboost=*=rapidsai_h*
================================================
FILE: continuous_integration/gpuci/environment-3.11.yaml
================================================
name: dask-sql
channels:
- rapidsai
- rapidsai-nightly
- dask/label/dev
- conda-forge
- nvidia
- nodefaults
dependencies:
- c-compiler
- zlib
- dask>=2024.4.1
- dask-expr>=1.0.11
- fastapi>=0.92.0
- fugue>=0.7.3
- httpx>=0.24.1
- intake>=0.6.0
- jsonschema
- lightgbm
- maturin>=1.3,<1.4
- mock
- numpy>=1.22.4
- pandas>=2
- pre-commit
- prompt_toolkit>=3.0.8
- psycopg2
- pyarrow>=14.0.1
- pygments>=2.7.1
- pyhive
- pytest-cov
- pytest-rerunfailures
- pytest-xdist
- pytest
- python=3.11
- py-xgboost>=2.0.3
- scikit-learn>=1.0.0
- sphinx
- sqlalchemy
- tpot>=0.12.0
# FIXME: https://github.com/fugue-project/fugue/issues/526
- triad<0.9.2
- tzlocal>=2.1
- uvicorn>=0.14
# GPU-specific requirements
- cudatoolkit=11.8
- cudf=24.06
- cuml=24.06
- dask-cudf=24.06
- dask-cuda=24.06
- ucx-proc=*=gpu
- ucx-py=0.38
- xgboost=*=rapidsai_py*
- libxgboost=*=rapidsai_h*
================================================
FILE: continuous_integration/gpuci/environment-3.9.yaml
================================================
name: dask-sql
channels:
- rapidsai
- rapidsai-nightly
- dask/label/dev
- conda-forge
- nvidia
- nodefaults
dependencies:
- c-compiler
- zlib
- dask>=2024.4.1
- dask-expr>=1.0.11
- fastapi>=0.92.0
- fugue>=0.7.3
- httpx>=0.24.1
- intake>=0.6.0
- jsonschema
- lightgbm
- maturin>=1.3,<1.4
- mock
- numpy>=1.22.4
- pandas>=2
- pre-commit
- prompt_toolkit>=3.0.8
- psycopg2
- pyarrow>=14.0.1
- pygments>=2.7.1
- pyhive
- pytest-cov
- pytest-rerunfailures
- pytest-xdist
- pytest
- python=3.9
- py-xgboost==2.0.3
- scikit-learn>=1.0.0
- sphinx
- sqlalchemy
- tpot>=0.12.0
# FIXME: https://github.com/fugue-project/fugue/issues/526
- triad<0.9.2
- tzlocal>=2.1
- uvicorn>=0.14
# GPU-specific requirements
- cudatoolkit=11.8
- cudf=24.06
- cuml=24.06
- dask-cudf=24.06
- dask-cuda=24.06
- ucx-proc=*=gpu
- ucx-py=0.38
- xgboost=*=rapidsai_py*
- libxgboost=*=rapidsai_h*
================================================
FILE: continuous_integration/recipe/build.sh
================================================
#!/bin/bash
# conda recipe build script: configures the Rust linker for the target
# platform, builds the wheel with maturin and installs it with pip.

set -ex

# See https://github.com/conda-forge/rust-feedstock/blob/master/recipe/build.sh for cc env explanation
if [ "$c_compiler" = gcc ] ; then
    case "$target_platform" in
        linux-64) rust_env_arch=X86_64_UNKNOWN_LINUX_GNU ;;
        linux-aarch64) rust_env_arch=AARCH64_UNKNOWN_LINUX_GNU ;;
        linux-ppc64le) rust_env_arch=POWERPC64LE_UNKNOWN_LINUX_GNU ;;
        *) echo "unknown target_platform $target_platform" ; exit 1 ;;
    esac

    export CARGO_TARGET_${rust_env_arch}_LINKER=$CC
fi

# Extra arguments for maturin, filled in per platform below
declare -a _xtra_maturin_args

mkdir -p $SRC_DIR/.cargo

if [ "$target_platform" = "osx-64" ] ; then
    # Append the linker settings to the cargo config. The here-document is
    # deliberately unquoted so that $CC is expanded when the file is written.
    cat <<EOF >> $SRC_DIR/.cargo/config
[target.x86_64-apple-darwin]
linker = "$CC"
rustflags = [
  "-C", "link-arg=-undefined",
  "-C", "link-arg=dynamic_lookup",
]

EOF

    _xtra_maturin_args+=(--target=x86_64-apple-darwin)

elif [ "$target_platform" = "osx-arm64" ] ; then
    # Same as above; $CC_FOR_BUILD and $CC are expanded when the file is written.
    cat <<EOF >> $SRC_DIR/.cargo/config
# Required for intermediate codegen stuff
[target.x86_64-apple-darwin]
linker = "$CC_FOR_BUILD"

# Required for final binary artifacts for target
[target.aarch64-apple-darwin]
linker = "$CC"
rustflags = [
  "-C", "link-arg=-undefined",
  "-C", "link-arg=dynamic_lookup",
]

EOF

    _xtra_maturin_args+=(--target=aarch64-apple-darwin)

    # This variable must be set to the directory containing the target's libpython DSO
    export PYO3_CROSS_LIB_DIR=$PREFIX/lib

    # xref: https://github.com/PyO3/pyo3/commit/7beb2720
    export PYO3_PYTHON_VERSION=${PY_VER}

    # xref: https://github.com/conda-forge/python-feedstock/issues/621
    sed -i.bak 's,aarch64,arm64,g' $BUILD_PREFIX/venv/lib/os-patch.py
    sed -i.bak 's,aarch64,arm64,g' $BUILD_PREFIX/venv/lib/platform-patch.py
fi

maturin build -vv -j "${CPU_COUNT}" --release --strip --manylinux off --interpreter="${PYTHON}" "${_xtra_maturin_args[@]}"

"${PYTHON}" -m pip install $SRC_DIR/target/wheels/dask_sql*.whl --no-deps -vv
================================================
FILE: continuous_integration/recipe/conda_build_config.yaml
================================================
c_compiler:
- gcc
c_compiler_version:
- '12'
rust_compiler:
- rust
rust_compiler_version:
- '1.72'
maturin:
- '1.3'
xz: # [linux64]
- '5' # [linux64]
================================================
FILE: continuous_integration/recipe/meta.yaml
================================================
{% set name = "dask-sql" %}
{% set major_minor_patch = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').split('.') %}
{% set new_patch = major_minor_patch[2] | int + 1 %}
{% set version = (major_minor_patch[:2] + [new_patch]) | join('.') + environ.get('VERSION_SUFFIX', '') %}
package:
name: {{ name|lower }}
version: {{ version }}
source:
git_url: ../..
build:
number: {{ GIT_DESCRIBE_NUMBER }}
entry_points:
- dask-sql-server = dask_sql.server.app:main
- dask-sql = dask_sql.cmd:main
string: py{{ python | replace(".", "") }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
requirements:
build:
- python # [build_platform != target_platform]
- cross-python_{{ target_platform }} # [build_platform != target_platform]
- maturin # [build_platform != target_platform]
- {{ compiler('c') }}
- {{ compiler('rust') }}
host:
- pip
- python
- maturin
- xz # [linux64]
run:
- python
- dask >=2024.4.1
- pandas >=1.4.0
- fastapi >=0.92.0
- httpx >=0.24.1
- uvicorn >=0.14
- tzlocal >=2.1
- prompt-toolkit >=3.0.8
- pygments >=2.7.1
- tabulate
test:
imports:
- dask_sql
commands:
- pip check
- dask-sql-server --help
- dask-sql --help
requires:
- pip
about:
home: https://github.com/dask-contrib/dask-sql/
summary: SQL query layer for Dask
license: MIT
license_file: LICENSE.txt
================================================
FILE: continuous_integration/recipe/run_test.py
================================================
import dask.dataframe as dd
import pandas as pd
from dask_sql import Context
# Fresh context used by the (currently disabled) query below
c = Context()

# NOTE(review): `data` is not referenced anywhere else in this script.
data = """
name,x
Alice,34
Bob,
"""

# 300 rows: the three names repeated 100 times, x counting from 0 to 299
df = pd.DataFrame({"name": ["Alice", "Bob", "Chris"] * 100, "x": list(range(300))})
# Dask dataframe over the same data, split into 10 partitions
ddf = dd.from_pandas(df, npartitions=10)

# This needs to be temporarily disabled since this query requires features that are not yet implemented
# c.create_table("my_data", ddf)
# got = c.sql(
# """
# SELECT
# my_data.name,
# SUM(my_data.x) AS "S"
# FROM
# my_data
# GROUP BY
# my_data.name
# """
# )
# expect = pd.DataFrame({"name": ["Alice", "Bob", "Chris"], "S": [14850, 14950, 15050]})
# dd.assert_eq(got, expect)
================================================
FILE: continuous_integration/scripts/startup_script.py
================================================
from dask_sql.server.app import main
# Only start the server when this file is executed as a script (not on import);
# `main` is the entry point imported from dask_sql.server.app above.
if __name__ == "__main__":
    main()
================================================
FILE: continuous_integration/scripts/update-dependencies.sh
================================================
#!/bin/bash
# Rewrites the datafusion dependencies in ./Cargo.toml and, unless disabled,
# runs `cargo update` afterwards.

# Defaults to "true" when the variable is unset or empty; any other value
# skips the `cargo update` step below.
UPDATE_ALL_CARGO_DEPS="${UPDATE_ALL_CARGO_DEPS:-true}"

# Update datafusion dependencies in the dask-planner to the latest revision from the default branch
# (every line starting with `datafusion-<lowercase name>` is replaced by a git
# dependency on the repository URL below, with no rev/tag/branch given)
sed -i -r 's/^datafusion-([a-z]+).*/datafusion-\1 = { git = "https:\/\/github.com\/apache\/arrow-datafusion-python\/" }/g' Cargo.toml

if [ "$UPDATE_ALL_CARGO_DEPS" = true ] ; then
    cargo update
fi
================================================
FILE: dask_sql/__init__.py
================================================
# FIXME: can we modify TLS model of Rust object to avoid aarch64 glibc bug?
# https://github.com/dask-contrib/dask-sql/issues/1169
from . import _datafusion_lib # isort:skip
import importlib.metadata
from dask.config import set
from . import config
from .cmd import cmd_loop
from .context import Context
from .datacontainer import Statistics
from .server.app import run_server
# TODO: get pyarrow strings and p2p shuffle working
# NOTE: `set` is `dask.config.set` (imported above), not the builtin.
set(dataframe__convert_string=False, dataframe__shuffle__method="tasks")

__version__ = importlib.metadata.version(__name__)

# `__all__` has to contain the public *names* as strings; listing the objects
# themselves makes `from dask_sql import *` fail with a TypeError.
__all__ = ["__version__", "cmd_loop", "Context", "run_server", "Statistics"]
================================================
FILE: dask_sql/_compat.py
================================================
import prompt_toolkit
from packaging.version import parse as parseVersion
# Parsed version of the installed prompt_toolkit, used for the feature flag below
_prompt_toolkit_version = parseVersion(prompt_toolkit.__version__)

# TODO: remove if prompt-toolkit min version gets bumped
# True when the installed prompt_toolkit is at least 3.0.29.
# NOTE(review): judging by the name, this marks the release from which the pipe
# input helper is used as a context manager — confirm against the callers.
PIPE_INPUT_CONTEXT_MANAGER = _prompt_toolkit_version >= parseVersion("3.0.29")
================================================
FILE: dask_sql/cmd.py
================================================
import logging
import os
import sys
import tempfile
import traceback
from argparse import ArgumentParser
from functools import partial
from typing import Union
import pandas as pd
from dask.datasets import timeseries
from dask.distributed import Client, as_completed
from prompt_toolkit.auto_suggest import AutoSuggestFromHistory
from prompt_toolkit.completion import WordCompleter
from prompt_toolkit.history import FileHistory
from prompt_toolkit.shortcuts import ProgressBar
from pygments.lexers.sql import SqlLexer
try:
# prompt_toolkit version >= 2
from prompt_toolkit.lexers import PygmentsLexer
except ImportError: # pragma: no cover
# prompt_toolkit version < 2
from prompt_toolkit.layout.lexers import PygmentsLexer
from dask_sql.context import Context
# Completer handed to the prompt session (see CompatiblePromptSession) that
# suggests the meta commands of the CLI.
# NOTE(review): `\dss`, `\dsc`, `help` and `?` are handled by `_meta_commands`
# but are not offered here — confirm whether that is intentional.
meta_command_completer = WordCompleter(
    ["\\l", "\\d?", "\\dt", "\\df", "\\de", "\\dm", "\\conninfo", "quit"]
)
class CompatiblePromptSession:
    """
    Thin compatibility layer around the prompt_toolkit module.

    prompt_toolkit introduced the PromptSession object in its jump from
    version 1 to version 2. Some environments (e.g. google collab) still
    ship the older version, so both are supported here.

    The only thing an instance offers is a `prompt` callable.
    """

    def __init__(self, lexer) -> None:  # pragma: no cover
        # Always use the same history file, located in the temp directory
        history_file = os.path.join(tempfile.gettempdir(), "dask-sql-history")
        session_options = dict(
            lexer=lexer,
            history=FileHistory(history_file),
            auto_suggest=AutoSuggestFromHistory(),
            completer=meta_command_completer,
        )

        try:
            # Version >= 2.0.1: a session object is available
            from prompt_toolkit import PromptSession

            self.prompt = PromptSession(**session_options).prompt
        except ImportError:
            # Version < 2.0: fall back to the plain prompt function
            from prompt_toolkit.shortcuts import prompt as legacy_prompt

            self.prompt = partial(legacy_prompt, **session_options)
def _display_markdown(content, **kwargs):
    """Print ``content`` as a markdown table in "fancy_grid" format.

    ``content`` and all keyword arguments are passed on to ``pd.DataFrame``.
    """
    table = pd.DataFrame(content, **kwargs).to_markdown(tablefmt="fancy_grid")
    print(table)
def _parse_meta_command(sql):
    """Split ``sql`` at its first space into ``(command, argument)``.

    The argument has surrounding whitespace removed and is the empty string
    when ``sql`` contains no space.
    """
    pieces = sql.split(" ", 1)
    remainder = pieces[1] if len(pieces) == 2 else ""
    return pieces[0], remainder.strip()
def _meta_commands(sql: str, context: Context, client: Client) -> Union[bool, Client]:
    """
    Parse a meta command and print its result.

    Args:
        sql: The text entered at the prompt.
        context: The context whose schemas, tables, functions, models and
            experiments are listed or switched.
        client: The dask client used for the cluster info and closed on "quit".

    Returns:
        A new :class:`Client` for the ``\\dsc`` command, ``True`` if any other
        meta command was detected (including an unknown one starting with a
        backslash) and ``False`` if the input is not a meta command.
        The "quit" command does not return; it exits via ``sys.exit()``.
    """
    cmd, arg = _parse_meta_command(sql)
    available_commands = [
        ["\\l", "List schemas"],
        ["\\d?, help, ?", "Show available commands"],
        ["\\conninfo", "Show Dask cluster info"],
        ["\\dt [schema]", "List tables"],
        ["\\df [schema]", "List functions"],
        ["\\dm [schema]", "List models"],
        ["\\de [schema]", "List experiments"],
        ["\\dss [schema]", "Switch schema"],
        ["\\dsc [dask scheduler address]", "Switch Dask cluster"],
        ["quit", "Quits dask-sql-cli"],
    ]
    # Commands listing one registry of a schema: attribute name and column title
    listings = {
        "\\dt": ("tables", "Tables"),
        "\\df": ("functions", "Functions"),
        "\\de": ("experiments", "Experiments"),
        "\\dm": ("models", "Models"),
    }

    if cmd == "\\dsc":
        # Switch Dask cluster: here the argument is a scheduler address
        client = Client(arg)
        return client  # pragma: no cover

    # For every other command the argument is a schema name; default to the
    # currently selected schema if none was given
    schema_name = arg or context.schema_name

    if cmd == "\\d?" or cmd == "help" or cmd == "?":
        _display_markdown(available_commands, columns=["Commands", "Description"])
    elif cmd == "\\l":
        _display_markdown(context.schema.keys(), columns=["Schemas"])
    elif cmd in listings:
        if schema_name in context.schema:
            attribute, column = listings[cmd]
            registry = getattr(context.schema[schema_name], attribute)
            _display_markdown(registry.keys(), columns=[column])
        else:
            # Report an unknown schema the same way as "\dss" does instead of
            # failing with a KeyError
            print(f"Schema {schema_name} not available")
    elif cmd == "\\conninfo":
        cluster_info = [
            ["Dask scheduler", client.scheduler.__dict__["addr"]],
            ["Dask dashboard", client.dashboard_link],
            ["Cluster status", client.status],
            ["Dask workers", len(client.cluster.workers)],
        ]
        _display_markdown(
            cluster_info, columns=["components", "value"]
        )  # pragma: no cover
    elif cmd == "\\dss":
        if schema_name in context.schema:
            context.schema_name = schema_name
        else:
            print(f"Schema {schema_name} not available")
    elif cmd == "quit":
        print("Quitting dask-sql ...")
        client.close()  # for safer side
        sys.exit()
    elif cmd.startswith("\\"):
        print(
            f"The meta command {cmd} not available, please use commands from below list"
        )
        _display_markdown(available_commands, columns=["Commands", "Description"])
    else:
        # nothing detected probably not a meta command
        return False
    return True
def cmd_loop(
    context: Context = None,
    client: Client = None,
    startup=False,
    log_level=None,
):  # pragma: no cover
    """
    Run a REPL for answering SQL queries using ``dask-sql``.
    Every SQL expression that ``dask-sql`` understands can be used here.

    The loop skips the current input on Ctrl-C and ends on EOF. Errors raised
    while handling a meta command or a query are printed as a traceback and
    the loop continues.

    Args:
        context (:obj:`dask_sql.Context`): If set, use this context instead of an empty one.
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        startup (:obj:`bool`): If set, run a trivial ``SELECT 1 + 1`` query
            before showing the first prompt.
        log_level: (:obj:`str`): The log level passed to ``logging.basicConfig``

    Example:
        It is possible to run a REPL by using the CLI script in ``dask-sql``
        or by calling this function directly in your user code:

        .. code-block:: python

            from dask_sql import cmd_loop

            # Create your pre-filled context
            c = Context()
            ...

            cmd_loop(context=c)

        Of course, it is also possible to call the usual ``CREATE TABLE``
        commands.
    """
    # Never truncate the printed result tables
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_colwidth", None)

    logging.basicConfig(level=log_level)

    client = client or Client()
    context = context or Context()

    if startup:
        context.sql("SELECT 1 + 1").compute()

    session = CompatiblePromptSession(lexer=PygmentsLexer(SqlLexer))

    while True:
        try:
            text = session.prompt("(dask-sql) > ")
        except KeyboardInterrupt:
            continue
        except EOFError:
            break

        text = text.rstrip(";").strip()

        if not text:
            continue

        try:
            # Meta commands are handled inside the try block as well, so that
            # a failing one (e.g. an unreachable scheduler address) does not
            # terminate the REPL. "quit" still works: SystemExit is not an
            # Exception subclass.
            meta_command_detected = _meta_commands(
                text, context=context, client=client
            )
            if isinstance(meta_command_detected, Client):
                client = meta_command_detected

            if meta_command_detected:
                continue

            df = context.sql(text, return_futures=True)
            if df is not None:  # some sql commands returns None
                df = df.persist()
                # Now turn it into a list of futures
                futures = client.futures_of(df)
                with ProgressBar() as pb:
                    for _ in pb(
                        as_completed(futures), total=len(futures), label="Executing"
                    ):
                        continue
                    df = df.compute()
                    print(df.to_markdown(tablefmt="fancy_grid"))

        except Exception:
            traceback.print_exc()
def main():  # pragma: no cover
    """
    Entry point of the command line interface.

    Parses the command line arguments, optionally connects to an existing
    dask scheduler and preloads a test table, then starts the REPL.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite was properly loaded",
    )

    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = Client(args.scheduler_address)

    # Context() calls logging.basicConfig itself (with INFO by default), and
    # basicConfig does nothing once the root logger has handlers. The requested
    # level therefore has to be handed over here; otherwise the later
    # basicConfig call in cmd_loop would not apply it.
    context = Context(logging_level=args.log_level or logging.INFO)
    if args.load_test_data:
        df = timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    cmd_loop(
        context=context, client=client, startup=args.startup, log_level=args.log_level
    )


if __name__ == "__main__":
    main()
================================================
FILE: dask_sql/config.py
================================================
import os
import dask
import yaml
# Path of the sql.yaml file located next to this module
fn = os.path.join(os.path.dirname(__file__), "sql.yaml")

# Load the YAML content and hand it to dask as configuration defaults
with open(fn) as f:
    defaults = yaml.safe_load(f)

dask.config.update_defaults(defaults)
# Pass the same file to dask's `ensure_file` (with comment=True)
dask.config.ensure_file(source=fn, comment=True)
================================================
FILE: dask_sql/context.py
================================================
import asyncio
import inspect
import logging
from collections import Counter
from typing import Any, Callable, Union
import dask.dataframe as dd
import pandas as pd
from dask import config as dask_config
from dask.base import optimize
from dask.utils_test import hlg_layer
from dask_sql._datafusion_lib import (
DaskSchema,
DaskSQLContext,
DaskSQLOptimizerConfig,
DaskTable,
DFOptimizationException,
DFParsingException,
LogicalPlan,
)
try:
from dask_sql.physical.utils.statistics import parquet_statistics
except ModuleNotFoundError:
parquet_statistics = None
try:
import dask_cuda # noqa: F401
except ImportError: # pragma: no cover
pass
from dask_sql import input_utils
from dask_sql.datacontainer import (
UDF,
DataContainer,
FunctionDescription,
SchemaContainer,
Statistics,
)
from dask_sql.input_utils import InputType, InputUtil
from dask_sql.integrations.ipython import ipython_integration
from dask_sql.mappings import python_to_sql_type
from dask_sql.physical.rel import RelConverter, custom, logical
from dask_sql.physical.rex import RexConverter, core
from dask_sql.utils import ParsingException
logger = logging.getLogger(__name__)
class Context:
"""
Main object to communicate with ``dask_sql``.
It holds a store of all registered data frames (= tables)
and can convert SQL queries to dask data frames.
The tables in these queries are referenced by the name,
which is given when registering a dask dataframe.
Example:
.. code-block:: python
from dask_sql import Context
c = Context()
# Register a table
c.create_table("my_table", df)
# Now execute an SQL query. The result is a dask dataframe
result = c.sql("SELECT a, b FROM my_table")
# Trigger the computation (or use the data frame for something else)
result.compute()
Usually, you will only ever have a single context in your program.
See also:
:func:`sql`
:func:`create_table`
"""
DEFAULT_CATALOG_NAME = "dask_sql"
DEFAULT_SCHEMA_NAME = "root"
def __init__(self, logging_level=logging.INFO):
    """
    Create a new context.

    Sets up the default catalog and schema, creates the Rust planner context
    from the current dask configuration and registers the default plugins
    (without replacing plugins that were registered before).

    Args:
        logging_level: The level passed to ``logging.basicConfig``.
    """
    # Set the logging level for this SQL context
    logging.basicConfig(level=logging_level)

    # Names of the root catalog and the root schema
    self.catalog_name = self.DEFAULT_CATALOG_NAME
    self.schema_name = self.DEFAULT_SCHEMA_NAME
    # All schema information
    self.schema = {self.schema_name: SchemaContainer(self.schema_name)}
    # A started SQL server (useful for jupyter notebooks)
    self.sql_server = None

    # Create the `DaskSQLOptimizerConfig` Rust context; the options are
    # positional, so the order of the keys matters
    optimizer_options = (
        "sql.dynamic_partition_pruning",
        "sql.fact_dimension_ratio",
        "sql.max_fact_tables",
        "sql.preserve_user_order",
        "sql.filter_selectivity",
    )
    optimizer_config = DaskSQLOptimizerConfig(
        *(dask_config.get(option) for option in optimizer_options)
    )

    # Create the `DaskSQLContext` Rust context
    self.context = DaskSQLContext(
        self.catalog_name, self.schema_name, optimizer_config
    )
    self.context.register_schema(self.schema_name, DaskSchema(self.schema_name))

    # Register any default plugins, if nothing was registered before.
    rel_plugins = (
        logical.DaskAggregatePlugin,
        logical.DaskCrossJoinPlugin,
        logical.DaskEmptyRelationPlugin,
        logical.DaskFilterPlugin,
        logical.DaskJoinPlugin,
        logical.DaskLimitPlugin,
        logical.DaskProjectPlugin,
        logical.DaskSortPlugin,
        logical.DaskTableScanPlugin,
        logical.DaskUnionPlugin,
        logical.DaskValuesPlugin,
        logical.DaskWindowPlugin,
        logical.SamplePlugin,
        logical.ExplainPlugin,
        logical.SubqueryAlias,
        custom.AnalyzeTablePlugin,
        custom.CreateExperimentPlugin,
        custom.CreateModelPlugin,
        custom.CreateCatalogSchemaPlugin,
        custom.CreateMemoryTablePlugin,
        custom.CreateTablePlugin,
        custom.DropModelPlugin,
        custom.DropSchemaPlugin,
        custom.DropTablePlugin,
        custom.ExportModelPlugin,
        custom.PredictModelPlugin,
        custom.ShowColumnsPlugin,
        custom.DescribeModelPlugin,
        custom.ShowModelsPlugin,
        custom.ShowSchemasPlugin,
        custom.ShowTablesPlugin,
        custom.UseSchemaPlugin,
        custom.AlterSchemaPlugin,
        custom.AlterTablePlugin,
        custom.DistributeByPlugin,
    )
    for plugin_class in rel_plugins:
        RelConverter.add_plugin_class(plugin_class, replace=False)

    rex_plugins = (
        core.RexAliasPlugin,
        core.RexCallPlugin,
        core.RexInputRefPlugin,
        core.RexLiteralPlugin,
        core.RexScalarSubqueryPlugin,
    )
    for plugin_class in rex_plugins:
        RexConverter.add_plugin_class(plugin_class, replace=False)

    input_plugins = (
        input_utils.DaskInputPlugin,
        input_utils.PandasLikeInputPlugin,
        input_utils.HiveInputPlugin,
        input_utils.IntakeCatalogInputPlugin,
        input_utils.SqlalchemyHiveInputPlugin,
        # needs to be the last entry, as it only checks for string
        input_utils.LocationInputPlugin,
    )
    for plugin_class in input_plugins:
        InputUtil.add_plugin_class(plugin_class, replace=False)
def create_table(
    self,
    table_name: str,
    input_table: InputType,
    format: str = None,
    persist: bool = False,
    schema_name: str = None,
    statistics: Statistics = None,
    gpu: bool = False,
    **kwargs,
):
    """
    Registering a (dask/pandas) table makes it usable in SQL queries.
    The name you give here can be used as table name in the SQL later.
    The table is registered under the lower-cased ``table_name``.

    Please note, that the table is stored as it is now.
    If you change the table later, you need to re-register.

    Instead of passing an already loaded table, it is also possible
    to pass a string to a storage location.
    The library will then try to load the data using one of
    `dask's read methods <https://docs.dask.org/en/latest/dataframe-create.html>`_.
    If the file format can not be deduced automatically, it is also
    possible to specify it via the ``format`` parameter.
    Typical file formats are csv or parquet.
    Any additional parameters will get passed on to the read method.
    Please note that some file formats require additional libraries.
    By default, the data will be lazily loaded. If you would like to
    load the data directly into memory you can do so by setting
    persist=True.

    See :ref:`data_input` for more information.

    Example:
        This code registers a data frame as table "data"
        and then uses it in a query.

        .. code-block:: python

            c.create_table("data", df)
            df_result = c.sql("SELECT a, b FROM data")

        This code reads a file from disk.
        Please note that we assume that the file(s) are reachable under this path
        from every node in the cluster

        .. code-block:: python

            c.create_table("data", "/home/user/data.csv")
            df_result = c.sql("SELECT a, b FROM data")

        This example reads from a hive table.

        .. code-block:: python

            from pyhive.hive import connect

            cursor = connect("localhost", 10000).cursor()
            c.create_table("data", cursor, hive_table_name="the_name_in_hive")
            df_result = c.sql("SELECT a, b FROM data")

    Args:
        table_name: (:obj:`str`): Under which name should the new table be addressable
        input_table (:class:`dask.dataframe.DataFrame` or :class:`pandas.DataFrame` or :obj:`str` or :class:`hive.Cursor`):
            The data frame/location/hive connection to register.
        format (:obj:`str`): Only used when passing a string into the ``input`` parameter.
            Specify the file format directly here if it can not be deduced from the extension.
            If set to "memory", load the data from a published dataset in the dask cluster.
        persist (:obj:`bool`): Only used when passing a string into the ``input`` parameter.
            Set to true to turn on loading the file data directly into memory.
        schema_name: (:obj:`str`): in which schema to create the table. By default, will use the currently selected schema.
        statistics: (:obj:`Statistics`): if given, use these statistics during the cost-based optimization.
        gpu: (:obj:`bool`): if set to true, use dask-cudf to run the data frame calculations on your GPU.
            Please note that the GPU support is currently not covering all of dask-sql's SQL language.
        **kwargs: Additional arguments for specific formats. See :ref:`data_input` for more information.

    """
    # Resolve the schema first, so that the log message shows the schema the
    # table really ends up in (and not "None")
    schema_name = schema_name or self.schema_name
    logger.debug(
        f"Creating table: '{table_name}' of format type '{format}' in schema '{schema_name}'"
    )

    dc = InputUtil.to_dc(
        input_table,
        table_name=table_name,
        format=format,
        persist=persist,
        gpu=gpu,
        **kwargs,
    )

    schema = self.schema[schema_name]
    table_key = table_name.lower()

    # Remember where the data comes from, if that is known
    if isinstance(input_table, str):
        dc.filepath = input_table
        schema.filepaths[table_key] = input_table
    elif hasattr(input_table, "dask") and dd.utils.is_dataframe_like(input_table):
        try:
            if dd._dask_expr_enabled():
                from dask_expr.io.parquet import ReadParquet

                # The first argument of the last parquet read operation found;
                # stays None if the frame contains no such operation
                dask_filepath = None
                for op in input_table.find_operations(ReadParquet):
                    dask_filepath = op._args[0]
            else:
                dask_filepath = hlg_layer(
                    input_table.dask, "read-parquet"
                ).creation_info["args"][0]
            dc.filepath = dask_filepath
            schema.filepaths[table_key] = dask_filepath
        except KeyError:
            logger.debug("Expected 'read-parquet' layer")

    # Without user-given statistics, try to derive the row count from the
    # parquet metadata (only when the helper is importable and dask-expr is off)
    if parquet_statistics and not dd._dask_expr_enabled() and not statistics:
        statistics = parquet_statistics(dc.df)
        if statistics:
            row_count = sum(d["num-rows"] for d in statistics)
            statistics = Statistics(row_count)
    if not statistics:
        # Row count unknown
        statistics = Statistics(float("nan"))
    dc.statistics = statistics

    schema.tables[table_key] = dc
    schema.statistics[table_key] = statistics
def drop_table(self, table_name: str, schema_name: str = None):
"""
Remove a table with the given name from the registered tables.
This will also delete the dataframe.
Args:
table_name: (:obj:`str`): Which table to remove.
"""
schema_name = schema_name or self.schema_name
del self.schema[schema_name].tables[table_name]
def drop_schema(self, schema_name: str):
"""
Remove a schema with the given name from the registered schemas.
This will also delete all tables, functions etc.
Args:
schema_name: (:obj:`str`): Which schema to remove.
"""
if schema_name == self.DEFAULT_SCHEMA_NAME:
raise RuntimeError(f"Default Schema `{schema_name}` cannot be deleted")
del self.schema[schema_name]
if self.schema_name == schema_name:
self.schema_name = self.DEFAULT_SCHEMA_NAME
def register_function(
self,
f: Callable,
name: str,
parameters: list[tuple[str, type]],
return_type: type,
replace: bool = False,
schema_name: str = None,
row_udf: bool = False,
):
"""
Register a custom function with the given name.
The function can be used (with this name)
in every SQL queries from now on - but only for scalar operations
(no aggregations).
This means, if you register a function "f", you can now call
.. code-block:: sql
SELECT f(x)
FROM df
Please keep in mind that you can only have one function with the same name,
regardless of whether it is an aggregation or a scalar function. By default,
attempting to register two functions with the same name will raise an error;
setting `replace=True` will give precedence to the most recently registered
function.
For the registration, you need to supply both the
list of parameter and parameter types as well as the
return type. Use `numpy dtypes `_ if possible.
More information: :ref:`custom`
Example:
This example registers a function "f", which
calculates the square of an integer and applies
it to the column ``x``.
.. code-block:: python
def f(x):
return x ** 2
c.register_function(f, "f", [("x", np.int64)], np.int64)
sql = "SELECT f(x) FROM df"
df_result = c.sql(sql)
Example of overwriting two functions with the same name:
This example registers a different function "f", which
calculates the floor division of an integer and applies
it to the column ``x``. It also shows how to overwrite
the previous function with the replace parameter.
.. code-block:: python
def f(x):
return x // 2
c.register_function(f, "f", [("x", np.int64)], np.int64, replace=True)
sql = "SELECT f(x) FROM df"
df_result = c.sql(sql)
Args:
f (:obj:`Callable`): The function to register
name (:obj:`str`): Under which name should the new function be addressable in SQL
parameters (:obj:`List[Tuple[str, type]]`): A list ot tuples of parameter name and parameter type.
Use `numpy dtypes `_ if possible. This
function is sensitive to the order of specified parameters when `row_udf=True`, and it is assumed
that column arguments are specified in order, followed by scalar arguments.
return_type (:obj:`type`): The return type of the function
replace (:obj:`bool`): If `True`, do not raise an error if a function with the same name is already
present; instead, replace the original function. Default is `False`.
See also:
:func:`register_aggregation`
"""
self._register_callable(
f,
name,
aggregation=False,
parameters=parameters,
return_type=return_type,
replace=replace,
schema_name=schema_name,
row_udf=row_udf,
)
def register_aggregation(
self,
f: dd.Aggregation,
name: str,
parameters: list[tuple[str, type]],
return_type: type,
replace: bool = False,
schema_name: str = None,
):
"""
Register a custom aggregation with the given name.
The aggregation can be used (with this name)
in every SQL queries from now on - but only for aggregation operations
(no scalar function calls).
This means, if you register a aggregation "fagg", you can now call
.. code-block:: sql
SELECT fagg(y)
FROM df
GROUP BY x
Please note that you can always only have one function with the same name;
no matter if it is an aggregation or scalar function.
For the registration, you need to supply both the
list of parameter and parameter types as well as the
return type. Use `numpy dtypes `_ if possible.
More information: :ref:`custom`
Example:
The following code registers a new aggregation "fagg", which
computes the sum of a column and uses it on the ``y`` column.
.. code-block:: python
fagg = dd.Aggregation("fagg", lambda x: x.sum(), lambda x: x.sum())
c.register_aggregation(fagg, "fagg", [("x", np.float64)], np.float64)
sql = "SELECT fagg(y) FROM df GROUP BY x"
df_result = c.sql(sql)
Args:
f (:class:`dask.dataframe.Aggregate`): The aggregate to register. See
`the dask documentation `_
for more information.
name (:obj:`str`): Under which name should the new aggregate be addressable in SQL
parameters (:obj:`List[Tuple[str, type]]`): A list ot tuples of parameter name and parameter type.
Use `numpy dtypes `_ if possible.
return_type (:obj:`type`): The return type of the function
replace (:obj:`bool`): Do not raise an error if the function is already present
See also:
:func:`register_function`
"""
self._register_callable(
f,
name,
aggregation=True,
parameters=parameters,
return_type=return_type,
replace=replace,
schema_name=schema_name,
)
def sql(
    self,
    sql: Any,
    return_futures: bool = True,
    dataframes: dict[str, Union[dd.DataFrame, pd.DataFrame]] = None,
    gpu: bool = False,
    config_options: dict[str, Any] = None,
) -> Union[dd.DataFrame, pd.DataFrame]:
    """
    Run the given SQL against the registered tables.

    The SQL approximately follows the postgreSQL standard - however, not all
    operations are implemented yet.
    In general, only select statements (no data manipulation) work.
    For more information, see :ref:`sql`.

    Example:
        Run a query on the registered tables and execute it with dask.

        .. code-block:: python

            result = c.sql("SELECT a, b FROM my_table")
            print(result.compute())

    Args:
        sql (:obj:`str`): The query string to execute. An already created
            ``LogicalPlan`` is accepted as well.
        return_futures (:obj:`bool`): Return the unexecuted dask dataframe or the data itself.
            Defaults to returning the dask dataframe.
        dataframes (:obj:`Dict[str, dask.dataframe.DataFrame]`): additional Dask or pandas dataframes
            to register before executing this query
        gpu (:obj:`bool`): Whether or not to load the additional Dask or pandas dataframes (if any) on GPU;
            requires cuDF / dask-cuDF if enabled. Defaults to False.
        config_options (:obj:`Dict[str,Any]`): Specific configuration options to pass during
            query execution

    Returns:
        :obj:`dask.dataframe.DataFrame`: the created data frame of this query.

    Raises:
        RuntimeError: if ``sql`` is neither a string nor a ``LogicalPlan``.
    """
    # The given options are only active while this query is processed
    with dask_config.set(config_options):
        extra_tables = {} if dataframes is None else dataframes
        for df_name, df in extra_tables.items():
            self.create_table(df_name, df, gpu=gpu)

        if not isinstance(sql, (str, LogicalPlan)):
            raise RuntimeError(
                f"Encountered unsupported `LogicalPlan` sql type: {type(sql)}"
            )

        # Strings need to be turned into a plan first, plans are used as they are
        plan = self._get_ral(sql)[0] if isinstance(sql, str) else sql

        return self._compute_table_from_rel(plan, return_futures)
def explain(
    self,
    sql: str,
    dataframes: dict[str, Union[dd.DataFrame, pd.DataFrame]] = None,
    gpu: bool = False,
) -> str:
    """
    Return the stringified relational algebra that this query will produce
    once triggered (with ``sql()``).
    Helpful to understand the inner workings of dask-sql, but typically not
    needed to query your data.

    If the query is of DDL type (e.g. CREATE TABLE or DESCRIBE SCHEMA),
    no relational algebra plan is created and therefore nothing returned.

    Unless ``sql.optimizer.verbose`` is set, dynamic partition pruning is
    switched off while the plan is created; the previous setting is restored
    afterwards, also if creating the plan fails.

    Args:
        sql (:obj:`str`): The query string to use
        dataframes (:obj:`Dict[str, dask.dataframe.DataFrame]`): additional Dask or pandas dataframes
            to register before executing this query
        gpu (:obj:`bool`): Whether or not to load the additional Dask or pandas dataframes (if any) on GPU;
            requires cuDF / dask-cuDF if enabled. Defaults to False.

    Returns:
        :obj:`str`: a description of the created relational algebra.

    """
    overrides = {}
    if not dask_config.get("sql.optimizer.verbose"):
        overrides["sql.dynamic_partition_pruning"] = False

    # Used as a context manager, the config change is reverted on exit - even
    # when an exception is raised in between
    with dask_config.set(overrides):
        if dataframes is not None:
            for df_name, df in dataframes.items():
                self.create_table(df_name, df, gpu=gpu)

        _, rel_string = self._get_ral(sql)

    return rel_string
def visualize(self, sql: str, filename="mydask.png") -> None:  # pragma: no cover
    """
    Visualize the computation of the given SQL into the png.

    Args:
        sql (:obj:`str`): The query whose computation is drawn
        filename: Target file handed to the ``visualize`` call of the result
    """
    # Build the lazy result only; nothing is computed here.
    lazy_result = self.sql(sql, return_futures=True)
    # ``optimize`` hands back a one-element tuple for the single input.
    optimized_results = optimize(lazy_result)
    (optimized,) = optimized_results
    optimized.visualize(filename)
def create_schema(self, schema_name: str):
    """
    Create a new schema in the database.

    A schema already stored under the same name is replaced by the
    new, empty one.

    Args:
        schema_name (:obj:`str`): The name of the schema to create
    """
    fresh_schema = SchemaContainer(schema_name)
    self.schema[schema_name] = fresh_schema
def alter_schema(self, old_schema_name, new_schema_name):
    """
    Rename a registered schema.

    The schema stored under ``old_schema_name`` is removed and stored again
    under ``new_schema_name``. A ``KeyError`` is raised if no schema with
    the old name exists.

    Args:
        old_schema_name: Current name of the schema
        new_schema_name: Name under which the schema is stored afterwards
    """
    moved_schema = self.schema.pop(old_schema_name)
    self.schema[new_schema_name] = moved_schema
def alter_table(self, old_table_name, new_table_name, schema_name=None):
    """
    Rename a table inside a schema.

    Args:
        old_table_name: Current name of the table
        new_table_name: Name under which the table is stored afterwards
        schema_name: Schema holding the table; the context's current
            schema is used when this is ``None``
    """
    target_schema = self.schema_name if schema_name is None else schema_name
    tables = self.schema[target_schema].tables
    # Pop first, then store: a missing table raises before anything changes.
    tables[new_table_name] = tables.pop(old_table_name)
def register_experiment(
    self,
    experiment_name: str,
    experiment_results: pd.DataFrame,
    schema_name: str = None,
):
    """
    Store the results of an experiment in a schema.

    The results are kept under the lower-cased experiment name.

    Args:
        experiment_name (:obj:`str`): The name of the experiment
        experiment_results: The results to store
        schema_name (:obj:`str`): Schema to store into; the context's
            current schema is used when not given
    """
    target_schema = schema_name or self.schema_name
    experiments = self.schema[target_schema].experiments
    experiments[experiment_name.lower()] = experiment_results
def register_model(
    self,
    model_name: str,
    model: Any,
    training_columns: list[str],
    schema_name: str = None,
):
    """
    Add a model to the model registry.

    A model can be anything which has a `.predict` function that transforms
    a Dask dataframe into predicted labels (as a Dask series).
    After model registration, the model can be used in calls to
    `SELECT ... FROM PREDICT` with the given name.
    Instead of creating your own model and registering it, you can also
    train a model directly in dask-sql. See the SQL command `CREATE MODEL`.

    Args:
        model_name (:obj:`str`): The name of the model; it is stored
            lower-cased
        model: The model to store
        training_columns: (list of str): The names of the columns which were
            used during the training.
        schema_name (:obj:`str`): Schema to store into; the context's
            current schema is used when not given
    """
    target_schema = schema_name or self.schema_name
    models = self.schema[target_schema].models
    # The model is always stored together with its training columns.
    models[model_name.lower()] = (model, training_columns)
def ipython_magic(
    self, auto_include=False, disable_highlighting=True
):  # pragma: no cover
    """
    Register a new ipython/jupyter magic function "sql"
    which sends its input as string to the :func:`sql` function.

    Once this has been called in a Jupyter notebook or an IPython shell,
    queries can be written as

    .. code-block:: python

        %sql SELECT * from data

    or

    .. code-block:: python

        %%sql
        SELECT * from data

    instead of

    .. code-block:: python

        c.sql("SELECT * from data")

    Args:
        auto_include (:obj:`bool`): If set to true, automatically
            create a table for every pandas or Dask dataframe in the calling
            context. That means, if you define a dataframe in your jupyter
            notebook you can use it with the same name in your sql call.
            Use this setting with care as any defined dataframe can
            easily override tables created via `CREATE TABLE`.

            .. code-block:: python

                df = ...

                # Later, without any calls to create_table

                %%sql
                SELECT * FROM df

        disable_highlighting (:obj:`bool`): If set to true, automatically
            disable syntax highlighting. If you are working in jupyter lab,
            disable_highlighting must be set to true to enable ipython_magic
            functionality. If you are working in a classic jupyter notebook,
            you may set disable_highlighting=False if desired.
    """
    magic_options = {
        "auto_include": auto_include,
        "disable_highlighting": disable_highlighting,
    }
    ipython_integration(self, **magic_options)
def run_server(self, **kwargs):  # pragma: no cover
    """
    Run a HTTP server for answering SQL queries using ``dask-sql``.

    See :ref:`server` for more information.

    Args:
        client (:obj:`dask.distributed.Client`): If set, use this dask client instead of a new one.
        host (:obj:`str`): The host interface to listen on (defaults to all interfaces)
        port (:obj:`int`): The port to listen on (defaults to 8080)
        log_level: (:obj:`str`): The log level of the server and dask-sql
    """
    # Imported lazily inside the method rather than at module import time.
    from dask_sql.server.app import run_server as start_sql_server

    # Make sure a previously started server is stopped first.
    self.stop_server()
    # NOTE(review): the result is stored on ``self.server`` whereas
    # ``stop_server`` inspects ``self.sql_server`` -- confirm this is intended.
    self.server = start_sql_server(**kwargs)
def stop_server(self):  # pragma: no cover
    """
    Stop a SQL server started by ``run_server``.

    If a server is stored on ``self.sql_server`` its ``shutdown()``
    coroutine is scheduled on the current event loop. Afterwards the
    attribute is reset to ``None``.
    """
    running_server = self.sql_server
    if running_server is not None:
        event_loop = asyncio.get_event_loop()
        assert event_loop
        # Shutdown is only scheduled here, not awaited.
        event_loop.create_task(running_server.shutdown())

    self.sql_server = None
def fqn(self, tbl: "DaskTable") -> tuple[str, str]:
    """
    Return the fully qualified name of an object, maybe including the schema name.

    Args:
        tbl (:obj:`DaskTable`): The Rust DaskTable instance of the view or table.

    Returns:
        :obj:`tuple` of :obj:`str`: The schema name and the table name.
        The context's current schema name is used when the table reports
        no schema (``None`` or an empty string).
    """
    schema_name = tbl.getSchema()
    table_name = tbl.getTableName()

    if schema_name is not None and schema_name != "":
        return schema_name, table_name
    return self.schema_name, table_name
def _prepare_schemas(self):
    """
    Create a list of schemas filled with the dataframes
    and functions we have currently in our schema list

    Returns:
        A list with one ``DaskSchema`` per registered schema, each holding
        its tables and function signatures.
    """
    logger.debug(
        f"There are {len(self.schema)} existing schema(s): {self.schema.keys()}"
    )
    prepared_schemas = []

    for schema_name, schema in self.schema.items():
        logger.debug(f"Preparing Schema: '{schema_name}'")
        rust_schema = DaskSchema(schema_name)

        if not schema.tables:
            logger.warning("No tables are registered.")

        for table_name, dc in schema.tables.items():
            # Row count defaults to 0 when no statistics were provided.
            if table_name in schema.statistics:
                row_count = float(schema.statistics[table_name].row_count)
            else:
                row_count = float(0)

            if table_name in schema.filepaths:
                filepath = schema.filepaths[table_name]
            else:
                filepath = None

            df = dc.df
            columns = df.columns
            if not dask_config.get("sql.identifier.case_sensitive"):
                # Case-insensitive mode: expose lower-cased column names and
                # keep the table's column container in sync with them.
                columns = [col.lower() for col in columns]
                dc.column_container = dc.column_container.rename_handle_duplicates(
                    df.columns, columns
                )

            column_type_mapping = [
                (col, python_to_sql_type(dtype))
                for col, dtype in zip(columns, df.dtypes)
            ]
            rust_schema.add_table(
                DaskTable(
                    schema_name, table_name, row_count, column_type_mapping, filepath
                )
            )

        if not schema.functions:
            logger.debug("No custom functions defined.")

        for description in schema.function_lists:
            name = description.name
            parameter_types = [
                param[1].getDataType() for param in description.parameters
            ]
            return_type = description.return_type.getDataType()

            if description.aggregation:
                logger.debug(f"Adding function '{name}' to schema as aggregation.")
                is_aggregation = True
            else:
                logger.debug(
                    f"Adding function '{name}' to schema as scalar function."
                )
                is_aggregation = False

            rust_schema.add_or_overload_function(
                name, parameter_types, return_type, is_aggregation
            )

        prepared_schemas.append(rust_schema)

    return prepared_schemas
def _get_ral(self, sql):
    """
    Helper function to turn the sql query into a relational algebra and
    its string representation.

    Args:
        sql: The SQL query to parse.

    Returns:
        A tuple ``(rel, rel_string)`` with the (optionally optimized)
        logical plan and the result of its ``explain_original()``.

    Raises:
        ParsingException: if parsing the SQL or building the logical plan
            fails with a ``DFParsingException``.
        RuntimeError: if the SQL contains more than one statement.
    """
    logger.debug(f"Entering _get_ral('{sql}')")

    optimizer_config = DaskSQLOptimizerConfig(
        dask_config.get("sql.dynamic_partition_pruning"),
        dask_config.get("sql.fact_dimension_ratio"),
        dask_config.get("sql.max_fact_tables"),
        dask_config.get("sql.preserve_user_order"),
        dask_config.get("sql.filter_selectivity"),
    )
    self.context.set_optimizer_config(optimizer_config)

    # get the schema of what we currently have registered
    schemas = self._prepare_schemas()
    for schema in schemas:
        self.context.register_schema(schema.name, schema)

    try:
        sqlTree = self.context.parse_sql(sql)
    except DFParsingException as pe:
        # Suppress the context like the second raise site below does, so
        # both parsing failures surface in the same way.
        raise ParsingException(sql, str(pe)) from None
    logger.debug(f"_get_ral -> sqlTree: {sqlTree}")

    # TODO: Need to understand if this list here is actually needed? For now just use the first entry.
    if len(sqlTree) > 1:
        raise RuntimeError(
            f"Multiple 'Statements' encountered for SQL {sql}. Please share this with the dev team!"
        )

    try:
        nonOptimizedRel = self.context.logical_relational_algebra(sqlTree[0])
    except DFParsingException as pe:
        raise ParsingException(sql, str(pe)) from None

    # Optimize the `LogicalPlan` or skip if configured
    if dask_config.get("sql.optimize"):
        try:
            rel = self.context.run_preoptimizer(nonOptimizedRel)
            rel = self.context.optimize_relational_algebra(rel)
        except DFOptimizationException as oe:
            # Use original plan and warn about inability to optimize plan
            rel = nonOptimizedRel
            logger.warning(str(oe))
    else:
        rel = nonOptimizedRel

    rel_string = rel.explain_original()
    logger.debug(f"_get_ral -> LogicalPlan: {rel}")
    logger.debug(f"Extracted relational algebra:\n {rel_string}")

    return rel, rel_string
def _compute_table_from_rel(self, rel: "LogicalPlan", return_futures: bool = True):
    """
    Convert a logical plan into a dataframe.

    Args:
        rel: The logical plan to convert.
        return_futures: When true the lazy dataframe is returned,
            otherwise ``compute()`` is called on it first.

    Returns:
        The converter result as-is for "Explain" plans, ``None`` when the
        converter produced nothing, otherwise the resulting dataframe with
        its columns named after the plan's output fields.
    """
    dc = RelConverter.convert(rel, context=self)

    if rel.get_current_node_type() == "Explain":
        return dc
    if dc is None:
        return

    # Optimization might remove some alias projects. Make sure to keep them here.
    fields = list(rel.getRowType().getFieldList())

    if fields:
        cc = dc.column_container
        fields = fields[: len(cc.columns)]

        # Use FQ name if not unique and simple name if it is unique. If a join contains the same column
        # names the output col is prepended with the fully qualified column name
        name_counts = Counter(field.getName() for field in fields)
        output_names = []
        for field in fields:
            if name_counts[field.getName()] > 1:
                output_names.append(field.getQualifiedName())
            else:
                output_names.append(field.getName())

        cc = cc.rename(dict(zip(cc.columns, output_names)))
        dc = DataContainer(dc.df, cc)

    df = dc.assign()
    return df if return_futures else df.compute()
def _get_tables_from_stack(self):
    """
    Helper function to return all dask/pandas dataframes from the calling stack

    Returns:
        A dict mapping local variable names to the dataframe-like objects
        found in the frames of the call stack. Names starting with an
        underscore are ignored.
    """
    found_tables = {}

    # Traverse the stacks from inside to outside
    for frame_info in inspect.stack():
        for var_name, variable in frame_info.frame.f_locals.items():
            if var_name.startswith("_") or not dd.utils.is_dataframe_like(
                variable
            ):
                continue

            # only set them if not defined in an inner context
            found_tables.setdefault(var_name, variable)

    return found_tables
def _register_callable(
    self,
    f: Any,
    name: str,
    aggregation: bool,
    parameters: list[tuple[str, type]],
    return_type: type,
    replace: bool = False,
    schema_name=None,
    row_udf: bool = False,
):
    """
    Helper function to do the function or aggregation registration

    Args:
        f: The callable to register. Non-aggregations are wrapped in a ``UDF``.
        name: SQL name of the function; it is stored lower-cased in the
            function dict and described under both its upper- and
            lower-cased name.
        aggregation: Whether ``f`` is an aggregation.
        parameters: ``(name, type)`` pairs of the function parameters.
        return_type: Python type of the return value.
        replace: Drop an existing function with the same name first.
        schema_name: Target schema; defaults to the context's current schema.
        row_udf: Passed on to ``UDF`` for non-aggregations.

    Raises:
        ValueError: if a different function is already registered under
            the same name and ``replace`` is false.
    """
    schema = self.schema[schema_name or self.schema_name]

    # validate and cache UDF metadata
    sql_parameters = [
        (param_name, python_to_sql_type(param_type))
        for param_name, param_type in parameters
    ]
    sql_return_type = python_to_sql_type(return_type)

    if not aggregation:
        f = UDF(f, row_udf, parameters, return_type)

    lower_name = name.lower()
    if lower_name in schema.functions:
        if replace:
            # Forget every description stored for this name.
            schema.function_lists = [
                description
                for description in schema.function_lists
                if description.name.lower() != lower_name
            ]
            del schema.functions[lower_name]
        elif schema.functions[lower_name] != f:
            raise ValueError(
                "Registering multiple functions with the same name is only permitted if replace=True"
            )

    # Describe the function under both spellings of its name.
    for described_name in (name.upper(), name.lower()):
        schema.function_lists.append(
            FunctionDescription(
                described_name, sql_parameters, sql_return_type, aggregation
            )
        )
    schema.functions[lower_name] = f
================================================
FILE: dask_sql/datacontainer.py
================================================
from collections import namedtuple
from typing import Any, Union
import dask.dataframe as dd
import pandas as pd
# Type of a backend (dask) column label.
ColumnType = Union[str, int]

# Signature description of a registered SQL function.
FunctionDescription = namedtuple(
    "FunctionDescription", ["name", "parameters", "return_type", "aggregation"]
)


class ColumnContainer:
    # Forward declaration so the real class below can use its own name
    # in method annotations.
    pass


class ColumnContainer:
    """
    Helper class to store a list of columns,
    which are not necessarily the ones of the dask dataframe.

    Instead, the container also stores a mapping from "frontend"
    columns (columns with the names and order expected by SQL)
    to "backend" columns (the real column names used by dask)
    to prevent unnecessary renames.
    """

    def __init__(
        self,
        frontend_columns: list[str],
        frontend_backend_mapping: Union[dict[str, ColumnType], None] = None,
    ):
        """
        Args:
            frontend_columns: The SQL-facing column names, in order.
                All of them need to be strings.
            frontend_backend_mapping: Mapping from frontend to backend
                column. When omitted every frontend column maps to itself.
                A given mapping is stored as-is (not copied).
        """
        assert all(
            isinstance(col, str) for col in frontend_columns
        ), "All frontend columns need to be of string type"
        self._frontend_columns = list(frontend_columns)
        if frontend_backend_mapping is None:
            self._frontend_backend_mapping = {
                col: col for col in self._frontend_columns
            }
        else:
            self._frontend_backend_mapping = frontend_backend_mapping

    def _copy(self) -> ColumnContainer:
        """
        Internal function to copy this container
        """
        return ColumnContainer(
            self._frontend_columns.copy(), self._frontend_backend_mapping.copy()
        )

    def limit_to(self, fields: list[str]) -> ColumnContainer:
        """
        Create a new ColumnContainer, which has frontend columns
        limited to only the ones given as parameter.
        Also uses the order of these as the new column order.

        An empty ``fields`` returns this container itself.
        """
        if not fields:
            return self  # pragma: no cover

        assert all(f in self._frontend_backend_mapping for f in fields)
        cc = self._copy()
        cc._frontend_columns = [str(x) for x in fields]
        return cc

    def rename(self, columns: dict[str, str]) -> ColumnContainer:
        """
        Return a new ColumnContainer where the frontend columns
        are renamed according to the given mapping.

        Columns not present in the mapping are not touched,
        the order is preserved.
        """
        cc = self._copy()
        for column_from, column_to in columns.items():
            # Always look the backend up in the unmodified mapping so that
            # renames within one call cannot influence each other.
            backend_column = self._frontend_backend_mapping[str(column_from)]
            cc._frontend_backend_mapping[str(column_to)] = backend_column

        cc._frontend_columns = [
            str(columns[col]) if col in columns else col
            for col in self._frontend_columns
        ]

        return cc

    def rename_handle_duplicates(
        self, from_columns: list[str], to_columns: list[str]
    ) -> ColumnContainer:
        """
        Same as `rename` but additionally handles presence of
        duplicates in `from_columns`
        """
        cc = self._copy()
        cc._frontend_backend_mapping.update(
            {
                str(column_to): self._frontend_backend_mapping[str(column_from)]
                for column_from, column_to in zip(from_columns, to_columns)
            }
        )

        columns = dict(zip(from_columns, to_columns))
        cc._frontend_columns = [
            str(columns.get(col, col)) for col in self._frontend_columns
        ]

        return cc

    def mapping(self) -> list[tuple[str, ColumnType]]:
        """
        The mapping from frontend columns to backend columns.
        """
        return list(self._frontend_backend_mapping.items())

    @property
    def columns(self) -> list[str]:
        """
        The stored frontend columns in the correct order
        """
        return self._frontend_columns.copy()

    def add(
        self, frontend_column: str, backend_column: Union[str, None] = None
    ) -> ColumnContainer:
        """
        Return a new ColumnContainer with the
        given column added.
        The column is added at the last position in the column list.

        When no (or a falsy) backend column is given the frontend name is
        used as backend name. A frontend column that already exists keeps
        its position and only gets its mapping updated.
        """
        cc = self._copy()

        frontend_column = str(frontend_column)

        cc._frontend_backend_mapping[frontend_column] = str(
            backend_column or frontend_column
        )
        if frontend_column not in cc._frontend_columns:
            cc._frontend_columns.append(frontend_column)

        return cc

    def get_backend_by_frontend_index(self, index: int) -> str:
        """
        Get back the dask column, which is referenced by the
        frontend (SQL) column with the given index.
        """
        frontend_column = self._frontend_columns[index]
        backend_column = self._frontend_backend_mapping[frontend_column]
        return backend_column

    def get_backend_by_frontend_name(self, column: str) -> str:
        """
        Get back the dask column, which is referenced by the
        frontend (SQL) column with the given name.

        An unknown name is returned unchanged.
        """
        try:
            return self._frontend_backend_mapping[column]
        except KeyError:
            return column

    def make_unique(self, prefix="col"):
        """
        Make sure we have unique column names by calling each column

            <prefix>_<number>

        where <number> is the column index.
        """
        cc = self._copy()
        unique_columns = []
        # Rename by position instead of going through a name -> name dict:
        # with duplicated frontend names a dict keeps only one target per
        # name, so the duplicates would end up sharing one new name.
        for index, column in enumerate(self._frontend_columns):
            unique_name = f"{prefix}_{index}"
            cc._frontend_backend_mapping[unique_name] = (
                self._frontend_backend_mapping[column]
            )
            unique_columns.append(unique_name)

        cc._frontend_columns = unique_columns
        return cc
class Statistics:
    """
    Statistics are used during the cost-based optimization.

    Currently, only the row count is supported, more
    properties might follow. It needs to be provided by the user.
    """

    def __init__(self, row_count: int) -> None:
        self.row_count = row_count

    def __eq__(self, other):
        # Only another ``Statistics`` with the same row count is equal;
        # every other object compares unequal.
        if not isinstance(other, Statistics):
            return False
        return self.row_count == other.row_count
class DataContainer:
    """
    In SQL, every column operation or reference is done via
    the column index. Some dask operations, such as grouping,
    joining or concatenating preserve the columns in a different
    order than SQL would expect.

    However, we do not want to change the column data itself
    all the time (because this would lead to computational overhead),
    but still would like to keep the columns accessible by name and index.
    For this, we add an additional `ColumnContainer` to each dataframe,
    which does all the column mapping between "frontend"
    (what SQL expects, also in the correct order)
    and "backend" (what dask has).
    """

    def __init__(
        self,
        df: dd.DataFrame,
        column_container: ColumnContainer,
        statistics: Statistics = None,
        filepath: str = None,
    ):
        self.df = df
        self.column_container = column_container
        self.statistics = statistics
        self.filepath = filepath

    def assign(self) -> dd.DataFrame:
        """
        Combine the column mapping with the actual data and return
        a dataframe which has the columns specified in the
        stored ColumnContainer.
        """
        source = self.df
        frontend_to_backend = self.column_container._frontend_backend_mapping

        # Select the backend columns in frontend order ...
        backend_columns = [
            frontend_to_backend[frontend_column]
            for frontend_column in self.column_container.columns
        ]
        result = source[backend_columns]
        # ... and label them with the frontend names.
        result.columns = self.column_container.columns

        return result
class UDF:
    def __init__(self, func, row_udf: bool, params, return_type=None):
        """
        Helper class that handles different types of UDFs and manages
        how they should be mapped to dask operations. Two versions of
        UDFs are supported - when `row_udf=False`, the UDF is treated
        as expecting series-like objects as arguments and will simply
        run those through the function. When `row_udf=True` a row udf
        is expected and should be written to expect a dictlike object
        containing scalars
        """
        self.row_udf = row_udf
        self.func = func

        # Parameter names are the first entry of every parameter description.
        self.names = [param[0] for param in params]

        self.meta = (None, return_type)

    def __call__(self, *args, **kwargs):
        # Series-style UDF: hand all arguments straight to the function.
        if not self.row_udf:
            return self.func(*args, **kwargs)

        # Row UDF: dask series become columns of a frame, everything else
        # is passed along as an extra scalar argument.
        column_args = [
            operand for operand in args if isinstance(operand, dd.Series)
        ]
        scalar_args = [
            operand for operand in args if not isinstance(operand, dd.Series)
        ]

        frame = column_args[0].to_frame(self.names[0])
        for column_name, series in zip(self.names[1:], column_args[1:]):
            frame[column_name] = series

        applied = frame.apply(
            self.func, axis=1, args=tuple(scalar_args), meta=self.meta
        )
        return applied.astype(self.meta[1])

    def __eq__(self, other):
        if not isinstance(other, UDF):
            return NotImplemented
        return self.func == other.func and self.row_udf == other.row_udf

    def __hash__(self):
        # Hash the same pair of attributes that ``__eq__`` compares.
        return hash((self.func, self.row_udf))
class SchemaContainer:
    """Everything registered under one schema name, grouped by kind."""

    def __init__(self, name: str):
        self.__name__ = name
        # Registered tables by name.
        self.tables: dict[str, DataContainer] = {}
        # Statistics by name.
        self.statistics: dict[str, Statistics] = {}
        # Experiment results by name.
        self.experiments: dict[str, pd.DataFrame] = {}
        # Registered models by name, each stored with a list of strings.
        self.models: dict[str, tuple[Any, list[str]]] = {}
        # Registered functions by name.
        self.functions: dict[str, UDF] = {}
        # Signature descriptions of the registered functions.
        self.function_lists: list[FunctionDescription] = []
        # File paths by name.
        self.filepaths: dict[str, str] = {}
================================================
FILE: dask_sql/input_utils/__init__.py
================================================
from .convert import InputType, InputUtil
from .dask import DaskInputPlugin
from .hive import HiveInputPlugin
from .intake import IntakeCatalogInputPlugin
from .location import LocationInputPlugin
from .pandaslike import PandasLikeInputPlugin
from .sqlalchemy import SqlalchemyHiveInputPlugin
# ``__all__`` has to list the public names as strings; listing the objects
# themselves makes ``from dask_sql.input_utils import *`` fail.
__all__ = [
    "InputUtil",
    "InputType",
    "DaskInputPlugin",
    "HiveInputPlugin",
    "IntakeCatalogInputPlugin",
    "LocationInputPlugin",
    "PandasLikeInputPlugin",
    "SqlalchemyHiveInputPlugin",
]
================================================
FILE: dask_sql/input_utils/base.py
================================================
from typing import Any
class BaseInputPlugin:
    """
    Interface of an input plugin.

    Both methods raise ``NotImplementedError`` here and are meant to be
    overridden by the concrete plugins.
    """

    def is_correct_input(
        self, input_item: Any, table_name: str, format: str = None, **kwargs
    ):
        """Tell whether this plugin can handle the given input."""
        raise NotImplementedError

    def to_dc(self, input_item: Any, table_name: str, format: str = None, **kwargs):
        """Load the given input."""
        raise NotImplementedError
================================================
FILE: dask_sql/input_utils/convert.py
================================================
import logging
from typing import TYPE_CHECKING, Union
import dask.dataframe as dd
import pandas as pd
from dask_sql.datacontainer import ColumnContainer, DataContainer
from dask_sql.input_utils.base import BaseInputPlugin
from dask_sql.utils import Pluggable
if TYPE_CHECKING:
import cudf
import hive
import sqlalchemy
# Use the logging registry instead of instantiating ``Logger`` directly:
# a directly created logger is not part of the logger hierarchy, so it
# ignores the handlers and levels configured by the application.
logger = logging.getLogger(__name__)
# Everything that is accepted as input when creating a table.
# The optional third-party types are given as forward references because
# they are only imported for type checking.
InputType = Union[
    dd.DataFrame,
    pd.DataFrame,
    str,
    "sqlalchemy.engine.base.Connection",
    "hive.Cursor",
    "cudf.core.dataframe.DataFrame",
]
class InputUtil(Pluggable):
    """
    Plugin list and helper class for transforming the inputs to
    create table into a dask dataframe
    """

    @classmethod
    def add_plugin_class(cls, plugin_class: BaseInputPlugin, replace=True):
        """Convenience function to add a class directly to the plugins"""
        logger.debug(f"Registering Input plugin for {plugin_class}")
        # The plugin is instantiated here and stored under the string
        # representation of its class.
        plugin_instance = plugin_class()
        cls.add_plugin(str(plugin_class), plugin_instance, replace=replace)

    @classmethod
    def to_dc(
        cls,
        input_item: InputType,
        table_name: str,
        format: str = None,
        persist: bool = True,
        gpu: bool = False,
        **kwargs,
    ) -> DataContainer:
        """
        Turn possible input descriptions or formats (e.g. dask dataframes, pandas dataframes,
        locations as string, hive tables) into the loaded data containers,
        maybe persist them to cluster memory before.

        A list as input is loaded item by item and concatenated.
        """

        def load(item):
            # Same table name, format and options for every single item.
            return cls._get_dask_dataframe(
                item,
                table_name=table_name,
                format=format,
                gpu=gpu,
                **kwargs,
            )

        if isinstance(input_item, list):
            loaded_items = [load(item) for item in input_item]
            table = dd.concat(loaded_items)
        else:
            table = load(input_item)

        if persist:
            table = table.persist()

        column_container = ColumnContainer(table.columns)
        return DataContainer(table.copy(), column_container)

    @classmethod
    def _get_dask_dataframe(
        cls,
        input_item: InputType,
        table_name: str,
        format: str = None,
        gpu: bool = False,
        **kwargs,
    ):
        """
        Load the input with the first registered plugin that accepts it.

        Raises:
            ValueError: if no plugin accepts the input.
        """
        for plugin in cls.get_plugins():
            accepted = plugin.is_correct_input(
                input_item, table_name=table_name, format=format, **kwargs
            )
            if not accepted:
                continue

            return plugin.to_dc(
                input_item, table_name=table_name, format=format, gpu=gpu, **kwargs
            )

        raise ValueError(f"Do not understand the input type {type(input_item)}")
================================================
FILE: dask_sql/input_utils/dask.py
================================================
from typing import Any
import dask.dataframe as dd
from dask_sql.input_utils.base import BaseInputPlugin
class DaskInputPlugin(BaseInputPlugin):
    """Input Plugin for Dask DataFrames, just keeping them"""

    def is_correct_input(
        self, input_item: Any, table_name: str, format: str = None, **kwargs
    ):
        """Accept dask dataframes and everything declared with format "dask"."""
        if isinstance(input_item, dd.DataFrame):
            return True
        return format == "dask"

    def to_dc(
        self,
        input_item: Any,
        table_name: str,
        format: str = None,
        gpu: bool = False,
        **kwargs
    ):
        """
        Return the dataframe unchanged, or moved to the "cudf" backend
        when ``gpu`` is set (``kwargs`` are handed to ``to_backend``).

        Raises:
            ModuleNotFoundError: if ``gpu`` is set but ``dask_cudf``
                cannot be imported.
        """
        if not gpu:
            return input_item

        try:  # pragma: no cover
            import dask_cudf  # noqa: F401
        except ImportError:
            raise ModuleNotFoundError(
                "Setting `gpu=True` for table creation requires dask_cudf"
            )

        return input_item.to_backend("cudf", **kwargs)  # pragma: no cover
================================================
FILE: dask_sql/input_utils/hive.py
================================================
import ast
import logging
import os
from functools import partial
from typing import Any, Union
import dask.dataframe as dd
from dask_sql._datafusion_lib import SqlTypeName
try:
from pyhive import hive
except ImportError: # pragma: no cover
hive = None
try:
import sqlalchemy
except ImportError: # pragma: no cover
sqlalchemy = None
from dask_sql.input_utils.base import BaseInputPlugin
from dask_sql.mappings import cast_column_type, sql_to_python_type
# Use the logging registry instead of instantiating ``Logger`` directly:
# a directly created logger is not part of the logger hierarchy, so it
# ignores the handlers and levels configured by the application.
logger = logging.getLogger(__name__)
class HiveInputPlugin(BaseInputPlugin):
    """Input Plugin from Hive"""

    def is_correct_input(
        self, input_item: Any, table_name: str, format: str = None, **kwargs
    ):
        """
        Accept sqlalchemy connections, pyhive cursors and everything
        declared with format "hive".
        """
        is_hive_cursor = hive and isinstance(input_item, hive.Cursor)

        return self.is_sqlalchemy_hive(input_item) or is_hive_cursor or format == "hive"

    def is_sqlalchemy_hive(self, input_item: Any):
        """Tell whether the input is a sqlalchemy connection (falsy if sqlalchemy is missing)."""
        return sqlalchemy and isinstance(input_item, sqlalchemy.engine.base.Connection)

    def to_dc(
        self,
        input_item: Any,
        table_name: str,
        format: str = None,
        gpu: bool = False,
        **kwargs,
    ):
        """
        Read a hive table into a dask dataframe.

        The table is described via ``DESCRIBE FORMATTED``; the storage
        format found there selects the dask read function and the files
        are read from the reported location(s). For partitioned tables every
        partition is read on its own, the partition columns are added
        and all parts are concatenated.

        Args:
            input_item: The pyhive cursor or sqlalchemy connection.
            table_name: Name of the table; overridden by the
                ``hive_table_name`` keyword argument when given.
            format: Ignored on input; replaced by the detected format.
            gpu: Not supported, raises when set.
            **kwargs: ``hive_schema_name`` (default "default") selects the
                schema; everything else goes to the read function.
        """
        if gpu:  # pragma: no cover
            raise Exception("Hive does not support gpu")

        table_name = kwargs.pop("hive_table_name", table_name)
        schema = kwargs.pop("hive_schema_name", "default")

        parsed = self._parse_hive_table_description(input_item, schema, table_name)
        (
            column_information,
            table_information,
            storage_information,
            partition_information,
        ) = parsed

        logger.debug("Extracted hive information: ")
        logger.debug(f"column information: {column_information}")
        logger.debug(f"table information: {table_information}")
        logger.debug(f"storage information: {storage_information}")
        logger.debug(f"partition information: {partition_information}")

        # Convert column information
        column_information = {
            col: sql_to_python_type(SqlTypeName.fromString(col_type.upper()))
            for col, col_type in column_information.items()
        }

        # Extract format information
        if "InputFormat" in storage_information:
            format = storage_information["InputFormat"].split(".")[-1]
        # databricks format is different, see https://github.com/dask-contrib/dask-sql/issues/83
        elif "InputFormat" in table_information:  # pragma: no cover
            format = table_information["InputFormat"].split(".")[-1]
        else:  # pragma: no cover
            raise RuntimeError(
                "Do not understand the output of 'DESCRIBE FORMATTED <table>'"
            )

        if (
            format == "TextInputFormat" or format == "SequenceFileInputFormat"
        ):  # pragma: no cover
            storage_description = storage_information.get("Storage Desc Params", {})
            read_function = partial(
                dd.read_csv,
                sep=storage_description.get("field.delim", ","),
                header=None,
            )
        elif format == "ParquetInputFormat" or format == "MapredParquetInputFormat":
            read_function = dd.read_parquet
        elif format == "OrcInputFormat":  # pragma: no cover
            read_function = dd.read_orc
        elif format == "JsonInputFormat":  # pragma: no cover
            read_function = dd.read_json
        else:  # pragma: no cover
            raise AttributeError(f"Do not understand hive's table format {format}")

        def _normalize(loc):
            if loc.startswith("dbfs:/") and not loc.startswith(
                "dbfs://"
            ):  # pragma: no cover
                # dask (or better: fsspec) needs to have the URL in a specific form
                # starting with two // after the protocol
                loc = f"dbfs://{loc[len('dbfs:'):]}"
            # file:// is not a known protocol
            # Remove the literal prefix only: ``str.lstrip`` would strip any
            # leading run of the characters "f", "i", "l", "e" and ":" and
            # thereby damage locations such as "ftp://..." or "lib/...".
            if loc.startswith("file:"):
                loc = loc[len("file:"):]
            # Only allow files which do not start with . or _
            # Especially, not allow the _SUCCESS files
            return os.path.join(loc, "[A-Za-z0-9-]*")

        def wrapped_read_function(location, column_information, **kwargs):
            location = _normalize(location)
            logger.debug(f"Reading in hive data from {location}")
            if format == "ParquetInputFormat" or format == "MapredParquetInputFormat":
                # Hack needed for parquet files.
                # If the folder structure is like .../col=3/...
                # parquet wants to read in the partition information.
                # However, we add the partition information by ourself
                # which will lead to problems afterwards
                # Therefore tell parquet to only read in the columns
                # we actually care right now
                kwargs.setdefault("columns", list(column_information.keys()))
            else:  # pragma: no cover
                # prevent python to optimize it away and make coverage not respect the
                # pragma
                dummy = 0  # noqa: F841
            df = read_function(location, **kwargs)

            logger.debug(f"Applying column information: {column_information}")
            df = df.rename(columns=dict(zip(df.columns, column_information.keys())))

            for col, expected_type in column_information.items():
                df = cast_column_type(df, col, expected_type)

            return df

        if partition_information:
            partition_list = self._parse_hive_partition_description(
                input_item, schema, table_name
            )
            logger.debug(f"Reading in partitions from {partition_list}")

            tables = []
            for partition in partition_list:
                parsed = self._parse_hive_table_description(
                    input_item, schema, table_name, partition=partition
                )
                (
                    partition_column_information,
                    partition_table_information,
                    _,
                    _,
                ) = parsed

                location = partition_table_information["Location"]
                table = wrapped_read_function(
                    location, partition_column_information, **kwargs
                )

                # Now add the additional partition columns
                partition_values = ast.literal_eval(
                    partition_table_information["Partition Value"]
                )

                # The partition values arrive as one (comma separated) string.
                # Always split it: indexing an unsplit string would pick a
                # single character of a lone partition value instead of
                # the value itself.
                if isinstance(partition_values, str):
                    partition_values = [x.strip() for x in partition_values.split(",")]

                logger.debug(
                    f"Applying additional partition information as columns: {partition_information}"
                )

                for partition_id, (partition_key, partition_type) in enumerate(
                    partition_information.items()
                ):
                    table[partition_key] = partition_values[partition_id]
                    table = cast_column_type(table, partition_key, partition_type)

                tables.append(table)

            return dd.concat(tables)

        location = table_information["Location"]
        df = wrapped_read_function(location, column_information, **kwargs)
        return df

    def _parse_hive_table_description(
        self,
        cursor: Union["sqlalchemy.engine.base.Connection", "hive.Cursor"],
        schema: str,
        table_name: str,
        partition: str = None,
    ):
        """
        Extract all information from the output
        of the DESCRIBE FORMATTED call, which is unfortunately
        in a format not easily readable by machines.

        Returns:
            A tuple of four dicts: column, table, storage and partition
            information, filled according to the section of the output a
            row was found in.
        """
        cursor.execute(
            sqlalchemy.text(f"USE {schema}")
            if self.is_sqlalchemy_hive(cursor)
            else f"USE {schema}"
        )
        if partition:
            # Hive wants quoted, comma separated list of partition keys
            partition = partition.replace("=", '="')
            partition = partition.replace("/", '",') + '"'
            result = self._fetch_all_results(
                cursor, f"DESCRIBE FORMATTED {table_name} PARTITION ({partition})"
            )
        else:
            result = self._fetch_all_results(cursor, f"DESCRIBE FORMATTED {table_name}")

        logger.debug(f"Got information from hive: {result}")

        table_information = {}
        column_information = {}  # using the fact that dicts are insertion ordered
        storage_information = {}
        partition_information = {}
        # The output starts with the column section; header rows switch
        # the section the following rows are sorted into.
        mode = "column"
        last_field = None

        for key, value, value2 in result:
            key = key.strip().rstrip(":") if key else ""
            value = value.strip() if value else ""
            value2 = value2.strip() if value2 else ""

            # That is just a comment line, we can skip it
            if key == "# col_name":
                continue

            if (
                key == "# Detailed Table Information"
                or key == "# Detailed Partition Information"
            ):
                mode = "table"
            elif key == "# Storage Information":
                mode = "storage"
            elif key == "# Partition Information":
                mode = "partition"
            elif key.startswith("#"):
                mode = None  # pragma: no cover
            elif key:
                # A key without value opens a nested dict which is filled
                # by the following key-less rows.
                if not value:
                    value = dict()
                if mode == "column":
                    column_information[key] = value
                    last_field = column_information[key]
                elif mode == "storage":
                    storage_information[key] = value
                    last_field = storage_information[key]
                elif mode == "table":
                    # Hive partition values come in a bracketed list
                    # quoted partition values work regardless of partition column type
                    if key == "Partition Value":
                        value = '"' + value.strip("[]") + '"'
                    table_information[key] = value
                    last_field = table_information[key]
                elif mode == "partition":
                    partition_information[key] = value
                    last_field = partition_information[key]
                else:  # pragma: no cover
                    # prevent python to optimize it away and make coverage not respect the
                    # pragma
                    dummy = 0  # noqa: F841
            elif value and last_field is not None:
                last_field[value] = value2

        return (
            column_information,
            table_information,
            storage_information,
            partition_information,
        )

    def _parse_hive_partition_description(
        self,
        cursor: Union["sqlalchemy.engine.base.Connection", "hive.Cursor"],
        schema: str,
        table_name: str,
    ):
        """
        Extract all partition information for a given table

        Returns:
            The first column of every row returned by ``SHOW PARTITIONS``.
        """
        cursor.execute(
            sqlalchemy.text(f"USE {schema}")
            if self.is_sqlalchemy_hive(cursor)
            else f"USE {schema}"
        )
        result = self._fetch_all_results(cursor, f"SHOW PARTITIONS {table_name}")

        return [row[0] for row in result]

    def _fetch_all_results(
        self,
        cursor: Union["sqlalchemy.engine.base.Connection", "hive.Cursor"],
        sql: str,
    ):
        """
        The pyhive.Cursor and the sqlalchemy connection behave slightly different.
        The former has the fetchall method on the cursor,
        whereas the latter on the executed query.
        """
        result = cursor.execute(
            sqlalchemy.text(sql) if self.is_sqlalchemy_hive(cursor) else sql
        )

        try:
            return result.fetchall()
        except AttributeError:  # pragma: no cover
            return cursor.fetchall()
================================================
FILE: dask_sql/input_utils/intake.py
================================================
from typing import Any
try:
import intake
except ImportError: # pragma: no cover
intake = None
from dask_sql.input_utils.base import BaseInputPlugin
class IntakeCatalogInputPlugin(BaseInputPlugin):
    """Input plugin that reads a table out of an intake catalog in dask format"""

    def is_correct_input(
        self, input_item: Any, table_name: str, format: str = None, **kwargs
    ):
        """
        Accept intake catalog objects, or anything registered with
        ``format="intake"``. Without intake installed nothing is accepted.
        """
        if not intake:
            # intake could not be imported: hand back the falsy placeholder
            return intake
        return isinstance(input_item, intake.catalog.Catalog) or format == "intake"

    def to_dc(
        self,
        input_item: Any,
        table_name: str,
        format: str = None,
        gpu: bool = False,
        **kwargs,
    ):
        """
        Look up the catalog entry and convert it with ``to_dask``.

        ``intake_table_name`` (entry to read, defaults to ``table_name``) and
        ``catalog_kwargs`` (options for ``intake.open_catalog``) are consumed
        from ``kwargs``; everything left over is forwarded to ``to_dask``.
        """
        if gpu:  # pragma: no cover
            raise NotImplementedError("Intake does not support gpu")

        entry_name = kwargs.pop("intake_table_name", table_name)
        open_options = kwargs.pop("catalog_kwargs", {})

        catalog = input_item
        if isinstance(catalog, str):
            # a string is handed to intake.open_catalog to obtain the catalog
            catalog = intake.open_catalog(catalog, **open_options)

        return catalog[entry_name].to_dask(**kwargs)
================================================
FILE: dask_sql/input_utils/location.py
================================================
import os
from typing import Any
import dask.dataframe as dd
from distributed.client import default_client
from dask_sql.input_utils.base import BaseInputPlugin
from dask_sql.input_utils.convert import InputUtil
class LocationInputPlugin(BaseInputPlugin):
    """Input plugin for anything addressed by a string location (local file, remote path, ...)"""

    def is_correct_input(
        self, input_item: Any, table_name: str, format: str = None, **kwargs
    ):
        """Every string input is accepted."""
        return isinstance(input_item, str)

    def to_dc(
        self,
        input_item: Any,
        table_name: str,
        format: str = None,
        gpu: bool = False,
        **kwargs,
    ):
        """
        Load the data behind ``input_item``.

        With ``format="memory"`` the string names a dataset published on the
        default distributed client; the dataset is fetched and handed to the
        first registered plugin that accepts it. Otherwise the matching
        ``read_<format>`` function of dask.dataframe (or dask_cudf when
        ``gpu`` is set) is called, deriving the format from the file
        extension when none was given.
        """
        if format == "memory":
            published = default_client().get_dataset(input_item, **kwargs)
            for candidate in InputUtil.get_plugins():
                if candidate.is_correct_input(
                    published, table_name, format, **kwargs
                ):
                    return candidate.to_dc(
                        published, table_name, format, gpu, **kwargs
                    )
            # no plugin accepted the dataset: fall through to the reader lookup

        if not format:
            format = os.path.splitext(input_item)[1].lstrip(".")

        try:
            if gpu:  # pragma: no cover
                try:
                    import dask_cudf
                except ImportError:
                    raise ModuleNotFoundError(
                        "Setting `gpu=True` for table creation requires dask-cudf"
                    )
                backend = dask_cudf
            else:
                backend = dd
            reader = getattr(backend, f"read_{format}")
        except AttributeError:
            raise AttributeError(f"Can not read files of format {format}")

        return reader(input_item, **kwargs)
================================================
FILE: dask_sql/input_utils/pandaslike.py
================================================
import dask.dataframe as dd
import pandas as pd
from dask_sql.input_utils.base import BaseInputPlugin
class PandasLikeInputPlugin(BaseInputPlugin):
    """Input plugin for pandas-like dataframes, which are turned into dask dataframes"""

    def is_correct_input(
        self, input_item, table_name: str, format: str = None, **kwargs
    ):
        """
        Accept dataframe-like objects that are not already dask dataframes,
        as well as anything registered with ``format="dask"``.
        """
        frame_like = dd.utils.is_dataframe_like(input_item)
        if frame_like and not isinstance(input_item, dd.DataFrame):
            return True
        return format == "dask"

    def to_dc(
        self,
        input_item,
        table_name: str,
        format: str = None,
        gpu: bool = False,
        **kwargs,
    ):
        """
        Wrap the dataframe with ``dd.from_pandas``.

        ``npartitions`` is taken from ``kwargs`` (default 1); the remaining
        keyword arguments are forwarded. With ``gpu=True`` a pandas dataframe
        is first converted with ``cudf.from_pandas``.
        """
        partition_count = kwargs.pop("npartitions", 1)

        if gpu:  # pragma: no cover
            try:
                import cudf
            except ImportError:
                raise ModuleNotFoundError(
                    "Setting `gpu=True` for table creation requires cudf"
                )
            if isinstance(input_item, pd.DataFrame):
                input_item = cudf.from_pandas(input_item)

        return dd.from_pandas(input_item, npartitions=partition_count, **kwargs)
================================================
FILE: dask_sql/input_utils/sqlalchemy.py
================================================
from typing import Any
from dask_sql.input_utils.hive import HiveInputPlugin
class SqlalchemyHiveInputPlugin(HiveInputPlugin):
    """Input plugin for Hive tables addressed through a sqlalchemy connection string"""

    def is_correct_input(
        self, input_item: Any, table_name: str, format: str = None, **kwargs
    ):
        """Accept strings starting with ``hive://`` or ``databricks+pyhive://``."""
        return isinstance(input_item, str) and input_item.startswith(
            ("hive://", "databricks+pyhive://")
        )

    def to_dc(
        self,
        input_item: Any,
        table_name: str,
        format: str = None,
        gpu: bool = False,
        **kwargs
    ):  # pragma: no cover
        """
        Open a sqlalchemy connection for ``input_item`` and delegate to the
        parent plugin's ``to_dc``.

        ``connect_args`` (if present in ``kwargs``) is passed on to
        ``sqlalchemy.create_engine``. Specifying a ``format`` together with a
        connection string raises an ``AttributeError``.
        """
        if gpu:
            raise NotImplementedError("Hive does not support gpu")

        import sqlalchemy

        engine_options = {}
        if "connect_args" in kwargs:
            engine_options["connect_args"] = kwargs.pop("connect_args")

        if format is not None:
            raise AttributeError(
                "Format specified and sqlalchemy connection string set!"
            )

        connection = sqlalchemy.create_engine(input_item, **engine_options).connect()
        return super().to_dc(connection, table_name=table_name, **kwargs)
================================================
FILE: dask_sql/integrations/__init__.py
================================================
================================================
FILE: dask_sql/integrations/fugue.py
================================================
try:
import fugue
import fugue_dask
from dask.distributed import Client
from fugue import WorkflowDataFrame, register_execution_engine
from fugue_sql import FugueSQLWorkflow
from triad import run_at_def
from triad.utils.convert import get_caller_global_local_vars
except ImportError: # pragma: no cover
raise ImportError(
"Can not load the fugue module. If you want to use this integration, you need to install it."
)
from typing import Any, Optional
import dask.dataframe as dd
from dask_sql.context import Context
@run_at_def
def _register_engines() -> None:
    """Register (overwrite) the default Dask execution engine of Fugue.

    Two registrations are made, both replacing an existing entry: one for
    the name ``"dask"`` and one for ``dask.distributed.Client`` objects.
    This function is invoked as an entrypoint, users don't need to call it
    explicitly.
    """

    def _engine_for_name(conf, **kwargs):
        return DaskSQLExecutionEngine(conf=conf)

    def _engine_for_client(engine, conf, **kwargs):
        return DaskSQLExecutionEngine(dask_client=engine, conf=conf)

    register_execution_engine("dask", _engine_for_name, on_dup="overwrite")
    register_execution_engine(Client, _engine_for_client, on_dup="overwrite")
class DaskSQLEngine(fugue.execution.execution_engine.SQLEngine):
    """
    Fugue SQL engine that runs statements through dask-sql
    rather than through fugue's native SQL implementation.

    Please note, that so far the native SQL engine in fugue
    understands a larger set of SQL commands, but in turns is
    (on average) slower in computation and scaling.
    """

    def __init__(self, *args, **kwargs):
        """Create a new instance (all arguments go to the fugue base class)."""
        super().__init__(*args, **kwargs)

    @property
    def is_distributed(self) -> bool:
        return True

    def select(
        self, dfs: fugue.dataframe.DataFrames, statement: str
    ) -> fugue.dataframe.DataFrame:
        """
        Run ``statement`` in a fresh dask-sql context in which every
        dataframe of ``dfs`` is registered as a table under its key.
        """
        sql_context = Context()
        for table_name, frame in dfs.items():
            native_frame = self.execution_engine.to_df(frame).native
            sql_context.create_table(table_name, native_frame)

        query_result = sql_context.sql(statement)
        return fugue_dask.dataframe.DaskDataFrame(query_result)
class DaskSQLExecutionEngine(fugue_dask.DaskExecutionEngine):
    """
    Fugue execution engine whose default SQL engine is
    the dask-sql backed :class:`DaskSQLEngine`.

    Please note, that so far the native SQL engine in fugue
    understands a larger set of SQL commands, but in turns is
    (on average) slower in computation and scaling.
    """

    def __init__(self, *args, **kwargs):
        """Create a new instance and attach a DaskSQLEngine bound to it."""
        super().__init__(*args, **kwargs)
        self._default_sql_engine = DaskSQLEngine(self)

    @property
    def default_sql_engine(self) -> fugue.execution.execution_engine.SQLEngine:
        """The DaskSQLEngine created in ``__init__``."""
        return self._default_sql_engine
def fsql_dask(
    sql: str,
    ctx: Optional[Context] = None,
    register: bool = False,
    fugue_conf: Any = None,
) -> dict[str, dd.DataFrame]:
    """FugueSQL utility function that can consume Context directly. FugueSQL is a language
    extending standard SQL. It makes SQL eligible to describe end to end workflows. It also
    enables you to invoke python extensions in the SQL like language.

    For more, please read the FugueSQL Tutorial.

    Args:
        sql (:obj:`str`): Fugue SQL statement
        ctx (:class:`dask_sql.Context`): The context to operate on, defaults to None
        register (:obj:`bool`): Whether to register named steps back to the context
            (if provided), defaults to False
        fugue_conf (:obj:`Any`): a dictionary like object containing Fugue specific configs

    Returns:
        A dictionary with the native dataframe of every named workflow
        dataframe produced by the statement, keyed by its name.

    Example:
        .. code-block:: python

            # define a custom prepartition function for FugueSQL
            def median(df: pd.DataFrame) -> pd.DataFrame:
                df["y"] = df["y"].median()
                return df.head(1)

            # create a context with some tables
            c = Context()
            ...

            # run a FugueSQL query using the context as input
            query = '''
                j = SELECT df1.*, df2.x
                    FROM df1 INNER JOIN df2 ON df1.key = df2.key
                    PERSIST
                TAKE 5 ROWS PREPARTITION BY x PRESORT key
                PRINT
                TRANSFORM j PREPARTITION BY x USING median
                PRINT
            '''
            result = fsql_dask(query, c, register=True)
            assert "j" in result
            assert "j" in c.tables
    """
    # Kept as the very first call in this function's own frame, since the
    # helper is asked for the *caller's* globals and locals.
    _global, _local = get_caller_global_local_vars()

    dag = FugueSQLWorkflow()

    # Tables of the context's current schema become workflow inputs
    if ctx is None:
        dfs = {}
    else:
        current_tables = ctx.schema[ctx.schema_name].tables
        dfs = {k: dag.df(v.df) for k, v in current_tables.items()}

    result = dag._sql(sql, _global, _local, **dfs)
    dag.run(DaskSQLExecutionEngine(conf=fugue_conf))

    # Only workflow dataframes carry a result to hand back
    result_dfs = {}
    for step_name, step in result.items():
        if isinstance(step, WorkflowDataFrame):
            result_dfs[step_name] = step.result.native

    if register and ctx is not None:
        for step_name, native_frame in result_dfs.items():
            ctx.create_table(step_name, native_frame)

    return result_dfs
================================================
FILE: dask_sql/integrations/ipython.py
================================================
import time
from typing import TYPE_CHECKING
from dask_sql.mappings import _SQL_TO_PYTHON_FRAMES
from dask_sql.physical.rex.core import RexCallPlugin
if TYPE_CHECKING:
import dask_sql
# Hand-maintained list of SQL keywords.
# That is definitely not pretty, but there seems to be no better way...
KEYWORDS = """
    and as asc between by
    columns count create
    delete desc describe distinct
    exists from group having
    if in inner insert into is
    join left like model not
    on or order outer
    right schemas select set show
    table union where
""".split()
def ipython_integration(
    context: "dask_sql.Context",
    auto_include: bool,
    disable_highlighting: bool,
) -> None:  # pragma: no cover
    """Integrate the context with jupyter notebooks. Have a look into :ref:`Context.ipython_magic`.

    Always registers the ``sql`` magic; the syntax highlighting is
    registered as well unless ``disable_highlighting`` is set.
    """
    _register_ipython_magic(context, auto_include=auto_include)

    if disable_highlighting:
        return
    _register_syntax_highlighting()
def _register_ipython_magic(
    c: "dask_sql.Context", auto_include: bool
) -> None:  # pragma: no cover
    """
    Register a ``sql`` line/cell magic that runs its content on context ``c``.

    The statement is formatted with ``str.format`` against the local
    namespace handed in by IPython, executed with ``return_futures=False``
    and the result is returned. After a ``CREATE [OR REPLACE] TABLE/VIEW``
    statement the tail of the created table is returned instead. With
    ``auto_include`` set, dataframes found by ``c._get_tables_from_stack()``
    are passed along to the query.
    """
    from IPython.core.magic import (
        needs_local_scope,
        no_var_expand,
        register_line_cell_magic,
    )

    # no_var_expand switches off IPython's own ``$var``/``{var}`` expansion,
    # so that the only substitution is the ``str.format`` call below.
    # (Setting an attribute called ``MAGIC_NO_VAR_EXPAND_ATTR`` on the
    # function, as was done before, is not read by IPython.)
    @no_var_expand
    @needs_local_scope
    def sql(line, cell=None, local_ns=None):
        # ``cell`` needs a default: for a line magic IPython passes only the
        # line (plus ``local_ns`` as keyword argument).
        if cell is None:
            # the magic function was called inline
            cell = line

        sql_statement = cell.format(**(local_ns or {}))

        dataframes = {}
        if auto_include:
            dataframes = c._get_tables_from_stack()

        t0 = time.time()
        res = c.sql(sql_statement, return_futures=False, dataframes=dataframes)
        if (
            "CREATE OR REPLACE TABLE" in sql_statement
            or "CREATE OR REPLACE VIEW" in sql_statement
        ):
            # first token after the CREATE clause is taken as the table name
            table = sql_statement.split("CREATE OR REPLACE")[1]
            table = table.replace("TABLE", "").replace("VIEW", "").split()[0].strip()
            res = c.sql(f"SELECT * FROM {table}").tail()
        elif "CREATE TABLE" in sql_statement or "CREATE VIEW" in sql_statement:
            table = sql_statement.split("CREATE")[1]
            table = table.replace("TABLE", "").replace("VIEW", "").split()[0].strip()
            res = c.sql(f"SELECT * FROM {table}").tail()
        print(f"Execution time: {time.time() - t0:.2f}s")
        return res

    # Register a new magic function
    register_line_cell_magic(sql)
def _register_syntax_highlighting():  # pragma: no cover
    """
    Emit JavaScript that defines a "text/x-dasksql" CodeMirror MIME type and
    enables it for cells matching ``%%sql``.

    The MIME definition is built from KEYWORDS plus the keys of
    ``RexCallPlugin.OPERATION_MAPPING`` (keywords) and the keys of
    ``_SQL_TO_PYTHON_FRAMES`` (builtin types), serialized as JSON and sent to
    the frontend via ``display.display_javascript``.
    """
    import json
    from IPython.core import display
    # JS snippet to use the created mime type highlighting
    _JS_ENABLE_DASK_SQL = r"""
require(['notebook/js/codecell'], function(codecell) {
codecell.CodeCell.options_default.highlight_modes['magic_text/x-dasksql'] = {'reg':[/%%sql/]} ;
Jupyter.notebook.events.on('kernel_ready.Kernel', function(){
Jupyter.notebook.get_cells().map(function(cell){
if (cell.cell_type == 'code'){ cell.auto_highlight(); } }) ;
});
});
"""
    # names of the SQL types and of the supported operations
    types = map(str, _SQL_TO_PYTHON_FRAMES.keys())
    functions = list(RexCallPlugin.OPERATION_MAPPING.keys())
    # Create a new mimetype
    mime_type = {
        "name": "sql",
        "keywords": _create_set(KEYWORDS + functions),
        "builtin": _create_set(types),
        "atoms": _create_set(["false", "true", "null"]),
        # "operatorChars": /^[*\/+\-%<>!=~&|^]/,
        "dateSQL": _create_set(["time"]),
        # More information
        # https://opensource.apple.com/source/WebInspectorUI/WebInspectorUI-7600.8.3/UserInterface/External/CodeMirror/sql.js.auto.html
        "support": _create_set(["ODBCdotTable", "doubleQuote", "zerolessFloat"]),
    }
    # Code original from fugue-sql, adjusted for dask-sql and using some more customizations
    js = (
        r"""
require(["codemirror/lib/codemirror"]);
// We define a new mime type for syntax highlighting
CodeMirror.defineMIME("text/x-dasksql", """
        + json.dumps(mime_type)
        + r"""
);
CodeMirror.modeInfo.push({
name: "Dask SQL",
mime: "text/x-dasksql",
mode: "sql"
});
"""
    )
    # both snippets are sent to the frontend as one raw JavaScript payload
    display.display_javascript(js + _JS_ENABLE_DASK_SQL, raw=True)
def _create_set(keys: list[str]) -> dict[str, bool]:  # pragma: no cover
    """Map every key to ``True``, which is the "set" format codemirror expects"""
    return dict.fromkeys(keys, True)
================================================
FILE: dask_sql/mappings.py
================================================
import logging
from datetime import datetime
from typing import Any
import dask.array as da
import dask.config as dask_config
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask_sql._datafusion_lib import DaskTypeMap, SqlTypeName
logger = logging.getLogger(__name__)
# Default mapping between python types and SQL types
# Keys are builtin python types, NumPy scalar types and pandas extension
# dtype instances. Unsigned integers map to the same SQL type as the signed
# integer of equal width.
_PYTHON_TO_SQL = {
    # floating point
    np.float64: SqlTypeName.DOUBLE,
    pd.Float64Dtype(): SqlTypeName.DOUBLE,
    float: SqlTypeName.FLOAT,
    np.float32: SqlTypeName.FLOAT,
    pd.Float32Dtype(): SqlTypeName.FLOAT,
    # signed integers
    np.int64: SqlTypeName.BIGINT,
    pd.Int64Dtype(): SqlTypeName.BIGINT,
    int: SqlTypeName.INTEGER,
    np.int32: SqlTypeName.INTEGER,
    pd.Int32Dtype(): SqlTypeName.INTEGER,
    np.int16: SqlTypeName.SMALLINT,
    pd.Int16Dtype(): SqlTypeName.SMALLINT,
    np.int8: SqlTypeName.TINYINT,
    pd.Int8Dtype(): SqlTypeName.TINYINT,
    # unsigned integers
    np.uint64: SqlTypeName.BIGINT,
    pd.UInt64Dtype(): SqlTypeName.BIGINT,
    np.uint32: SqlTypeName.INTEGER,
    pd.UInt32Dtype(): SqlTypeName.INTEGER,
    np.uint16: SqlTypeName.SMALLINT,
    pd.UInt16Dtype(): SqlTypeName.SMALLINT,
    np.uint8: SqlTypeName.TINYINT,
    pd.UInt8Dtype(): SqlTypeName.TINYINT,
    # booleans
    np.bool_: SqlTypeName.BOOLEAN,
    pd.BooleanDtype(): SqlTypeName.BOOLEAN,
    # strings / generic objects
    str: SqlTypeName.VARCHAR,
    np.object_: SqlTypeName.VARCHAR,
    pd.StringDtype(): SqlTypeName.VARCHAR,
    # datetimes
    np.datetime64: SqlTypeName.TIMESTAMP,
}
# Default mapping between SQL types and python types
# for values
# Keys are the string form of the SQL type name; values are callables that
# turn a value into the python scalar type.
# NOTE(review): presumably looked up via str(sql_type) - confirm at the call site.
_SQL_TO_PYTHON_SCALARS = {
    "SqlTypeName.DOUBLE": np.float64,
    "SqlTypeName.FLOAT": np.float32,
    "SqlTypeName.DECIMAL": np.float32,
    "SqlTypeName.BIGINT": np.int64,
    "SqlTypeName.INTEGER": np.int32,
    "SqlTypeName.SMALLINT": np.int16,
    "SqlTypeName.TINYINT": np.int8,
    "SqlTypeName.BOOLEAN": np.bool_,
    "SqlTypeName.VARCHAR": str,
    "SqlTypeName.CHAR": str,
    "SqlTypeName.NULL": type(None),
    "SqlTypeName.SYMBOL": lambda x: x,  # SYMBOL is a special type used for e.g. flags etc. We just keep it
}
# Default mapping between SQL types and python types
# for data frames
_SQL_TO_PYTHON_FRAMES = {
"SqlTypeName.DOUBLE": np.float64,
"SqlTypeName.FLOAT": np.float32,
"SqlTypeName.DECIMAL": np.float64, # We use np.float64 always, even though we might be able to use a smaller type
"SqlTypeName.BIGINT": pd.Int64Dtype(),
"SqlTypeName.INTEGER": pd.Int32Dtype(),
"SqlTypeName.SMALLINT": pd.Int16Dtype(),
"SqlTypeName.TINYINT": pd.Int8Dtype(),
"SqlTypeName.BOOLEAN": pd.BooleanDtype(),
"SqlTypeName.VARCHAR": pd.StringDtype(),
"SqlTypeName.CHAR": pd.StringDtype(),
"SqlTypeName.DATE": np.dtype(
" "DaskTypeMap":
"""Mapping between python and SQL types."""
if python_type in (int, float):
python_type = np.dtype(python_type)
elif python_type is str:
python_type = np.dtype("object")
if isinstance(python_type, np.dtype):
python_type = python_type.type
if isinstance(python_type, pd.DatetimeTZDtype):
return DaskTypeMap(
SqlTypeName.TIMESTAMP_WITH_LOCAL_TIME_ZONE,
unit=str(python_type.unit),
tz=str(python_type.tz),
)
if is_decimal(python_type):
return DaskTypeMap(
SqlTypeName.DECIMAL,
precision=python_type.precision,
scale=python_type.scale,
)
try:
return DaskTypeMap(_PYTHON_TO_SQL[python_type])
except KeyError: # pragma: no cover
raise NotImplementedError(
f"The python type {python_type} is not implemented (yet)"
)
def parse_datetime(obj):
    """
    Parse a date or datetime string by trying a fixed list of formats.

    The formats are tried in order (ISO-like ``Y-m-d``, then ``d-m-Y``,
    then ``m/d/Y``; each with and without a time part) and the first
    successful parse is returned. A ``ValueError`` is raised if none fits.
    """
    candidate_formats = (
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d",
        "%d-%m-%Y %H:%M:%S",
        "%d-%m-%Y",
        "%m/%d/%Y %H:%M:%S",
        "%m/%d/%Y",
    )
    for pattern in candidate_formats:
        try:
            return datetime.strptime(obj, pattern)
        except ValueError:
            # this format does not match, move on to the next one
            continue

    raise ValueError("Unable to parse datetime: " + obj)
def sql_to_python_value(sql_type: "SqlTypeName", literal_value: Any) -> Any:
"""Mapping between SQL and python values (of correct type)."""
# In most of the cases, we turn the value first into a string.
# That might not be the most efficient thing to do,
# but works for all types (so far)
# Additionally, a literal type is not used
# so often anyways.
logger.debug(
f"sql_to_python_value -> sql_type: {sql_type} literal_value: {literal_value}"
)
if sql_type == SqlTypeName.CHAR or sql_type == SqlTypeName.VARCHAR:
# Some varchars contain an additional encoding
# in the format _ENCODING'string'
literal_value = str(literal_value)
if literal_value.startswith("_"):
encoding, literal_value = literal_value.split("'", 1)
literal_value = literal_value.rstrip("'")
literal_value = literal_value.encode(encoding=encoding)
return literal_value.decode(encoding=encoding)
return literal_value
elif (
sql_type == SqlTypeName.DECIMAL
and dask_config.get("sql.mappings.decimal_support") == "cudf"
):
from decimal import Decimal
python_type = Decimal
elif sql_type == SqlTypeName.INTERVAL_DAY:
return np.timedelta64(literal_value[0], "D") + np.timedelta64(
literal_value[1], "ms"
)
elif sql_type == SqlTypeName.INTERVAL:
# check for finer granular interval types, e.g., INTERVAL MONTH, INTERVAL YEAR
try:
interval_type = str(sql_type).split()[1].lower()
if interval_type in {"year", "quarter", "month"}:
# if sql_type is INTERVAL YEAR, Calcite will convert to months
delta = pd.tseries.offsets.DateOffset(months=float(str(literal_value)))
return delta
except IndexError: # pragma: no cover
# no finer granular interval type specified
pass
except TypeError: # pragma: no cover
# interval type is not recognized, fall back to default case
pass
# Calcite will always convert INTERVAL types except YEAR, QUARTER, MONTH to milliseconds
# Issue: if sql_type is INTERVAL MICROSECOND, and value <= 1000, literal_value will be rounded to 0
return np.timedelta64(literal_value, "ms")
elif sql_type == SqlTypeName.INTERVAL_MONTH_DAY_NANOSECOND:
# DataFusion assumes 30 days per month. Therefore we multiply number of months by 30 and add to days
return np.timedelta64(
(literal_value[0] * 30) + literal_value[1], "D"
) + np.timedelta64(literal_value[2], "ns")
elif sql_type == SqlTypeName.BOOLEAN:
return bool(literal_value)
elif (
sql_type == SqlTypeName.TIMESTAMP
or sql_type == SqlTypeName.TIME
or sql_type == SqlTypeName.DATE
):
if isinstance(literal_value, str):
literal_value = parse_datetime(literal_value)
literal_value = np.datetime64(literal_value)
elif str(literal_value) == "None":
# NULL time
return pd.NaT # pragma: no cover
if sql_type == SqlTypeName.DATE:
return literal_value.astype(" type:
"""Turn an SQL type into a dataframe dtype"""
try:
if (
sql_type == SqlTypeName.DECIMAL
and dask_config.get("sql.mappings.decimal_support") == "cudf"
):
try:
import cudf
except ImportError:
raise ModuleNotFoundError(
"Setting `sql.mappings.decimal_support=cudf` requires cudf"
)
return cudf.Decimal128Dtype(*args)
return _SQL_TO_PYTHON_FRAMES[str(sql_type)]
except KeyError: # pragma: no cover
raise NotImplementedError(
f"The SQL type {str(sql_type)} is not implemented (yet)"
)
def similar_type(lhs: type, rhs: type) -> bool:
    """
    Tell whether two types belong to the same type family.

    Families are unsigned ints, signed ints, floats, objects, strings,
    timezone-aware datetimes, timezone-naive datetimes, timedeltas, bools
    and decimals. Size or precision is not taken into account, with one
    exception: two decimal types are only similar if precision and scale
    are equal.

    TODO: nullability is not checked so far.
    """
    types_api = pd.api.types

    def _is_tz_aware_datetime(t):
        return types_api.is_datetime64_ns_dtype(t) and isinstance(
            t, pd.DatetimeTZDtype
        )

    def _is_tz_naive_datetime(t):
        return types_api.is_datetime64_ns_dtype(t) and not isinstance(
            t, pd.DatetimeTZDtype
        )

    def _is_plain_string(t):
        # is_string_dtype considers decimal columns to be string columns
        return types_api.is_string_dtype(t) and not is_decimal(t)

    # The order matters: the first family containing both types decides.
    family_checks = (
        types_api.is_unsigned_integer_dtype,
        types_api.is_signed_integer_dtype,
        types_api.is_float_dtype,
        types_api.is_object_dtype,
        _is_plain_string,
        _is_tz_aware_datetime,
        _is_tz_naive_datetime,
        types_api.is_timedelta64_ns_dtype,
        types_api.is_bool_dtype,
        is_decimal,
    )

    for in_family in family_checks:
        if not (in_family(lhs) and in_family(rhs)):
            continue
        if in_family is is_decimal:
            # check that decimal columns have equal precision/scale
            return lhs.precision == rhs.precision and lhs.scale == rhs.scale
        return True

    return False
def cast_column_type(
    df: dd.DataFrame, column_name: str, expected_type: type
) -> dd.DataFrame:
    """
    Bring one column of the dataframe to the expected type,
    but only if the current type is far "enough" away from it.

    This means, a float will never be converted into a double
    or a tinyint into another int - but a string to an integer etc.
    The dataframe is returned in either case.
    """
    column = df[column_name]
    logger.debug(
        f"Column {column_name} has type {column.dtype}, expecting {expected_type}..."
    )

    converted = cast_column_to_type(column, expected_type)
    if converted is None:
        # the helper signals "no cast needed" with None
        return df

    df[column_name] = converted
    return df
def cast_column_to_type(col: dd.Series, expected_type: str):
    """
    Cast the given column to the expected type.

    Returns ``None`` when the current and the expected type are similar
    (see ``similar_type``) and no cast is performed, otherwise the casted
    column.

    NOTE(review): despite the ``str`` annotation, ``cast_column_type``
    forwards a dtype/type object here - confirm before tightening the hint.

    Special cases, handled before the generic ``astype``:

    * float -> integer: the values are truncated first
    * timedelta -> integer: cast to ``np.int64`` regardless of the
      requested integer width
    * timezone-aware -> timezone-naive datetime: the timezone is dropped
      with ``tz_localize(None)``
    """
    pdt = pd.api.types

    is_dt_ns = pdt.is_datetime64_ns_dtype
    is_dt_tz = lambda t: is_dt_ns(t) and isinstance(t, pd.DatetimeTZDtype)
    is_dt_ntz = lambda t: is_dt_ns(t) and not isinstance(t, pd.DatetimeTZDtype)

    current_type = col.dtype

    if similar_type(current_type, expected_type):
        logger.debug("...not converting.")
        return None

    if pdt.is_integer_dtype(expected_type):
        if pdt.is_float_dtype(current_type):
            logger.debug("...truncating...")
            # Currently "trunc" can not be applied to NA (the pandas missing value type),
            # because NA is a different type. It works with np.nan though.
            # For our use case, that does not matter, as the conversion to integer later
            # will convert both NA and np.nan to NA.
            # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
            col = da.trunc(col.fillna(value=np.nan))
        elif pdt.is_timedelta64_dtype(current_type):
            logger.debug(f"Explicitly casting from {current_type} to np.int64")
            return col.astype(np.int64)

    if is_dt_tz(current_type) and is_dt_ntz(expected_type):
        # casting from timezone-aware to timezone-naive datatypes with astype is deprecated in pandas 2
        return col.dt.tz_localize(None)

    logger.debug(f"Need to cast from {current_type} to {expected_type}")
    return col.astype(expected_type)
def is_decimal(dtype):
    """
    Tell whether the dtype is a decimal type, judged by whether its
    string representation contains "decimal" (case-insensitive)
    """
    dtype_name = str(dtype).lower()
    return "decimal" in dtype_name
================================================
FILE: dask_sql/physical/__init__.py
================================================
================================================
FILE: dask_sql/physical/rel/__init__.py
================================================
from .convert import RelConverter
================================================
FILE: dask_sql/physical/rel/base.py
================================================
import logging
from typing import TYPE_CHECKING, Optional
import dask.dataframe as dd
from dask_sql.datacontainer import ColumnContainer, DataContainer
from dask_sql.mappings import cast_column_type, sql_to_python_type
if TYPE_CHECKING:
import dask_sql
from dask_sql._datafusion_lib import LogicalPlan, RelDataType
logger = logging.getLogger(__name__)
class BaseRelPlugin:
    """
    Common base of all plugins which turn a RelNode
    into a python expression (dask dataframe).

    Derived classed needs to override the class_name attribute
    and the convert method.
    """

    class_name = None

    def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> dd.DataFrame:
        """Base method to implement"""
        raise NotImplementedError

    @staticmethod
    def fix_column_to_row_type(
        cc: ColumnContainer, row_type: "RelDataType", join_type: Optional[str] = None
    ) -> ColumnContainer:
        """
        Rename the columns of the container to the field names
        of the row type and limit the container to these names.

        We assume that the column order is already correct
        and will just "blindly" rename the columns. For "leftsemi" and
        "leftanti" joins only as many field names are used as the
        container has columns.
        """
        target_names = list(map(str, row_type.getFieldNames()))
        if join_type in ("leftsemi", "leftanti"):
            target_names = target_names[: len(cc.columns)]

        logger.debug(f"Renaming {cc.columns} to {target_names}")
        renamed = cc.rename_handle_duplicates(
            from_columns=cc.columns, to_columns=target_names
        )

        # TODO: We can also check for the types here and do any conversions if needed
        return renamed.limit_to(target_names)

    @staticmethod
    def check_columns_from_row_type(df: dd.DataFrame, row_type: "RelDataType"):
        """
        Counterpart of `self.fix_column_to_row_type`: the column
        names are only compared with the row type (via assert),
        nothing is renamed.
        """
        expected_names = list(map(str, row_type.getFieldNames()))

        assert list(df.columns) == expected_names

        # TODO: similar to self.fix_column_to_row_type, we should check for the types

    @staticmethod
    def assert_inputs(
        rel: "LogicalPlan",
        n: int = 1,
        context: "dask_sql.Context" = None,
    ) -> list[dd.DataFrame]:
        """
        LogicalPlan nodes build on top of others.
        Those are called the "input" of the LogicalPlan.

        Assert that the plan has exactly ``n`` inputs and return
        them, each one already converted with the RelConverter.
        """
        input_rels = rel.get_inputs()

        assert len(input_rels) == n

        # Late import to remove cycling dependency
        from dask_sql.physical.rel.convert import RelConverter

        converted_inputs = []
        for input_rel in input_rels:
            converted_inputs.append(RelConverter.convert(input_rel, context))
        return converted_inputs

    @staticmethod
    def fix_dtype_to_row_type(
        dc: DataContainer, row_type: "RelDataType", join_type: Optional[str] = None
    ):
        """
        Cast the columns of the data container (or: the df within it)
        to the types demanded by the row type.

        To prevent unneeded conversions, do only convert if really needed,
        e.g. if the two types are "similar" enough, do not convert.
        Similarity involves the same general type (int, float, string etc)
        but not necessary the size (int64 and int32 are compatible)
        or the nullability.

        TODO: we should check the nullability of the SQL type
        """
        frame = dc.df
        columns = dc.column_container

        fields = row_type.getFieldList()
        if join_type in ("leftsemi", "leftanti"):
            fields = fields[: len(columns.columns)]

        # qualified field name -> SQL field type
        expected_sql_types = {}
        for field in fields:
            qualified_name = str(field.getQualifiedName())
            expected_sql_types[qualified_name] = field.getType()

        for frontend_name, field_type in expected_sql_types.items():
            sql_type = field_type.getSqlType()
            if str(sql_type) == "SqlTypeName.DECIMAL":
                # decimals additionally carry precision and scale
                type_args = field_type.getDataType().getPrecisionScale()
            else:
                type_args = tuple()
            target_dtype = sql_to_python_type(sql_type, *type_args)

            backend_name = columns.get_backend_by_frontend_name(frontend_name)
            frame = cast_column_type(frame, backend_name, target_dtype)

        return DataContainer(frame, dc.column_container)
================================================
FILE: dask_sql/physical/rel/convert.py
================================================
import logging
from typing import TYPE_CHECKING
import dask.dataframe as dd
from dask_sql.physical.rel.base import BaseRelPlugin
from dask_sql.utils import LoggableDataFrame, Pluggable
if TYPE_CHECKING:
import dask_sql
from dask_sql._datafusion_lib import LogicalPlan
logger = logging.getLogger(__name__)
class RelConverter(Pluggable):
    """
    Registry and dispatcher for the rel -> python expression conversion.

    This class stores plugins which can convert from RelNodes to
    python expression (typically dask dataframes).
    The stored plugins are assumed to have a class attribute "class_name"
    to control, which java classes they can convert
    and they are expected to have a convert (instance) method
    in the form

        def convert(self, rel, context)

    to do the actual conversion.
    """

    @classmethod
    def add_plugin_class(cls, plugin_class: BaseRelPlugin, replace=True):
        """Instantiate the plugin class and store it under its class_name"""
        plugin_key = plugin_class.class_name
        logger.debug(f"Registering REL plugin for {plugin_key}")
        cls.add_plugin(plugin_key, plugin_class(), replace=replace)

    @classmethod
    def convert(cls, rel: "LogicalPlan", context: "dask_sql.Context") -> dd.DataFrame:
        """
        Convert SQL AST tree node(s)
        into a python expression (a dask dataframe)
        using the stored plugins and the dictionary of
        registered dask tables from the context.

        The SQL AST tree is traversed. The context of the traversal is saved
        in the Rust logic. We need to take that current node and determine
        what "type" of Relational operator it represents to build the execution chain.

        Raises a NotImplementedError if no plugin is stored for the node type.
        """
        node_type = rel.get_current_node_type()

        try:
            plugin = cls.get_plugin(node_type)
        except KeyError:  # pragma: no cover
            raise NotImplementedError(
                f"No relational conversion for node type {node_type} available (yet)."
            )

        logger.debug(f"Processing REL {rel} using {plugin.__class__.__name__}...")
        converted = plugin.convert(rel, context=context)
        logger.debug(f"Processed REL {rel} into {LoggableDataFrame(converted)}")
        return converted
================================================
FILE: dask_sql/physical/rel/custom/__init__.py
================================================
from .alter import AlterSchemaPlugin, AlterTablePlugin
from .analyze_table import AnalyzeTablePlugin
from .create_catalog_schema import CreateCatalogSchemaPlugin
from .create_experiment import CreateExperimentPlugin
from .create_memory_table import CreateMemoryTablePlugin
from .create_model import CreateModelPlugin
from .create_table import CreateTablePlugin
from .describe_model import DescribeModelPlugin
from .distributeby import DistributeByPlugin
from .drop_model import DropModelPlugin
from .drop_schema import DropSchemaPlugin
from .drop_table import DropTablePlugin
from .export_model import ExportModelPlugin
from .predict_model import PredictModelPlugin
from .show_columns import ShowColumnsPlugin
from .show_models import ShowModelsPlugin
from .show_schemas import ShowSchemasPlugin
from .show_tables import ShowTablesPlugin
from .use_schema import UseSchemaPlugin
# Public names of this package. The entries have to be strings (not the
# class objects themselves), otherwise
# `from dask_sql.physical.rel.custom import *` fails with a TypeError.
__all__ = [
    "AnalyzeTablePlugin",
    "CreateExperimentPlugin",
    "CreateModelPlugin",
    "CreateCatalogSchemaPlugin",
    "CreateMemoryTablePlugin",
    "CreateTablePlugin",
    "DropModelPlugin",
    "DropSchemaPlugin",
    "DropTablePlugin",
    "ExportModelPlugin",
    "PredictModelPlugin",
    "ShowColumnsPlugin",
    "DescribeModelPlugin",
    "ShowModelsPlugin",
    "ShowSchemasPlugin",
    "ShowTablesPlugin",
    "UseSchemaPlugin",
    "AlterSchemaPlugin",
    "AlterTablePlugin",
    "DistributeByPlugin",
]
================================================
FILE: dask_sql/physical/rel/custom/alter.py
================================================
import logging
from typing import TYPE_CHECKING
from dask_sql.physical.rel.base import BaseRelPlugin
logger = logging.getLogger(__name__)
if TYPE_CHECKING:
import dask_sql
from dask_sql._datafusion_lib import LogicalPlan
class AlterSchemaPlugin(BaseRelPlugin):
    """
    Rename a schema. The SQL is:

        ALTER SCHEMA old_schema_name RENAME TO new_schema_name

    Using this SQL is equivalent to just doing

        context.alter_schema(old_schema_name, new_schema_name)

    but can also be used without writing a single line of code.
    A KeyError is raised if the old schema is not known to the context.
    Nothing is returned.
    """

    class_name = "AlterSchema"

    def convert(self, rel: "LogicalPlan", context: "dask_sql.Context"):
        statement = rel.alter_schema()
        old_schema_name = statement.getOldSchemaName()
        new_schema_name = statement.getNewSchemaName()

        logger.info(
            f"changing schema name from `{old_schema_name}` to `{new_schema_name}`"
        )

        schema_missing = old_schema_name not in context.schema
        if schema_missing:
            raise KeyError(
                f"Schema {old_schema_name} was not found, available schemas are - {context.schema.keys()}"
            )

        context.alter_schema(
            old_schema_name=old_schema_name, new_schema_name=new_schema_name
        )
class AlterTablePlugin(BaseRelPlugin):
    """
    Rename a table. The SQL is:

        ALTER TABLE [IF EXISTS] old_table_name RENAME TO new_table_name

    Using this SQL is equivalent to just doing

        context.alter_table(old_table_name, new_table_name)

    but can also be used without writing a single line of code.
    If the table does not exist, a KeyError is raised - unless IF EXISTS
    was given, in which case nothing happens.
    Nothing is returned.
    """

    class_name = "AlterTable"

    def convert(self, rel: "LogicalPlan", context: "dask_sql.Context"):
        statement = rel.alter_table()
        old_table_name = statement.getOldTableName()
        new_table_name = statement.getNewTableName()
        # fall back to the context's current schema if none was given
        schema_name = statement.getSchemaName() or context.schema_name

        logger.info(
            f"changing table name from `{old_table_name}` to `{new_table_name}`"
        )

        if old_table_name not in context.schema[schema_name].tables:
            if statement.getIfExists():
                # IF EXISTS: a missing table is silently ignored
                return
            raise KeyError(
                f"Table {old_table_name} was not found, available tables in {schema_name} are "
                f"- {context.schema[schema_name].tables.keys()}"
            )

        context.alter_table(
            old_table_name=old_table_name,
            new_table_name=new_table_name,
            schema_name=schema_name,
        )
================================================
FILE: dask_sql/physical/rel/custom/analyze_table.py
================================================
from typing import TYPE_CHECKING
import dask.dataframe as dd
import pandas as pd
from dask_sql.datacontainer import ColumnContainer, DataContainer
from dask_sql.mappings import python_to_sql_type
from dask_sql.physical.rel.base import BaseRelPlugin
if TYPE_CHECKING:
import dask_sql
from dask_sql._datafusion_lib import LogicalPlan
class AnalyzeTablePlugin(BaseRelPlugin):
    """
    Compute summary statistics (the output of ``describe()``, plus the
    SQL data type and the column name) for all or some columns of a table.

    The SQL is:

        ANALYZE TABLE <table>
        COMPUTE STATISTICS FOR [ALL COLUMNS | COLUMNS a, b, ...]

    The result is itself a table, which is built on the fly.

    Please note: although the syntax closely resembles e.g.
    [the spark version](https://spark.apache.org/docs/3.0.0/sql-ref-syntax-aux-analyze-table.html),
    this call does not help with query optimization (as the spark call would do),
    because that is currently not implemented in dask-sql.
    """

    class_name = "AnalyzeTable"

    def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer:
        node = rel.analyze_table()

        schema_name = node.getSchemaName() or context.schema_name
        table_name = node.getTableName()
        source = context.schema[schema_name].tables[table_name]

        # No explicit column list means "all columns of the table"
        columns = node.getColumns() or source.column_container.columns

        to_backend = source.column_container.get_backend_by_frontend_name
        df = source.df

        # Resolve every requested (frontend) column to its backend name once
        backend_names = [to_backend(col) for col in columns]

        described = df[backend_names].describe()
        data_types = pd.DataFrame(
            {
                backend: str(python_to_sql_type(df[backend].dtype)).lower()
                for backend in backend_names
            },
            index=["data_type"],
        )
        col_names = pd.DataFrame(
            dict(zip(backend_names, columns)), index=["col_name"]
        )

        # Stack the numeric summary, the type row and the name row
        statistics = dd.concat([described, data_types, col_names])

        cc = ColumnContainer(statistics.columns)
        return DataContainer(statistics, cc)
================================================
FILE: dask_sql/physical/rel/custom/create_catalog_schema.py
================================================
import logging
from typing import TYPE_CHECKING
from dask_sql.physical.rel.base import BaseRelPlugin
if TYPE_CHECKING:
import dask_sql
from dask_sql._datafusion_lib import LogicalPlan
logger = logging.getLogger(__name__)
class CreateCatalogSchemaPlugin(BaseRelPlugin):
    """
    Register a new schema with the given name at the context.

    The SQL call looks like

        CREATE SCHEMA <schema-name>

    Running this statement has the same effect as calling

        context.create_schema(<schema-name>)

    from Python, so no code needs to be written for it.
    If a schema of that name already exists, the statement is a no-op
    when its "if not exists" flag is set; otherwise a RuntimeError is
    raised unless its "replace" flag is set.
    Nothing is returned.
    """

    class_name = "CreateCatalogSchema"

    def convert(self, rel: "LogicalPlan", context: "dask_sql.Context"):
        node = rel.create_catalog_schema()
        schema_name = node.getSchemaName()

        already_present = schema_name in context.schema
        if already_present and node.getIfNotExists():
            # Nothing to do, the existing schema is kept
            return
        if already_present and not node.getReplace():
            raise RuntimeError(
                f"A Schema with the name {schema_name} is already present."
            )

        context.create_schema(schema_name)
================================================
FILE: dask_sql/physical/rel/custom/create_experiment.py
================================================
import logging
from typing import TYPE_CHECKING
import dask.dataframe as dd
import pandas as pd
from dask_sql.datacontainer import ColumnContainer, DataContainer
from dask_sql.physical.rel.base import BaseRelPlugin
from dask_sql.physical.utils.ml_classes import get_cpu_classes, get_gpu_classes
from dask_sql.utils import convert_sql_kwargs, import_class, is_cudf_type
if TYPE_CHECKING:
import dask_sql
from dask_sql.rust import LogicalPlan
logger = logging.getLogger(__name__)
# Lookup tables consulted by CreateExperimentPlugin.convert through
# ``.get(name, name)``: a class name found in the table is replaced by the
# mapped value before it is imported, any other name is used unchanged.
# Which table is used depends on whether the training data is cuDF-backed.
# NOTE(review): presumably these map short class names to fully qualified
# import paths - confirm in dask_sql.physical.utils.ml_classes.
cpu_classes = get_cpu_classes()
gpu_classes = get_gpu_classes()
class CreateExperimentPlugin(BaseRelPlugin):
    """
    Creates an Experiment for hyperparameter tuning or automl like behaviour,
    i.e evaluates models with different hyperparameters and registers the best performing
    model in the context with the name same as experiment name,
    which can be used for prediction

    sql syntax:
        CREATE EXPERIMENT <name> WITH ( key = value )
            AS <some select query>

    OPTIONS:
    * model_class: Class name or full path to the class of the model to train.
      Any sklearn, cuML, XGBoost, or LightGBM classes can be inferred
      without the full path. In this case, models trained on cuDF dataframes
      are automatically mapped to cuML classes, and sklearn models otherwise.
      We map to cuML-Dask based models when possible and single-GPU cuML models otherwise.
      Any model class with sklearn interface is valid, but might or
      might not work well with Dask dataframes.
      You might need to install necessary packages to use
      the models.
    * experiment_class : Class name or full path of the Hyperparameter tuner.
      Any sklearn or cuML classes can be inferred
      without the full path. In this case, models trained on cuDF dataframes
      are automatically mapped to cuML classes, and sklearn models otherwise.
      Required whenever model_class is given.
    * tune_parameters:
      Key-value of pairs of Hyperparameters to tune, i.e Search Space for
      particular model to tune
    * automl_class : Full path of the class which is sklearn compatible and
      able to distribute work to dask clusters, currently tested with
      tpot automl framework.
      Refer : [Tpot example](https://examples.dask.org/machine-learning/tpot.html)
    * target_column: Which column from the data to use as target.
      Currently this parameter is required field, because tuning and automl
      behaviour is implemented only for supervised algorithms.
    * automl_kwargs:
      Key-value pairs of arguments to be passed to automl class .
      Refer : [Using Tpot parameters](https://epistasislab.github.io/tpot/using/)
    * experiment_kwargs:
      Use this parameter for passing any keyword arguments to experiment class
    * tune_fit_kwargs:
      Use this parameter for passing any keyword arguments to experiment.fit() method

    example:
        for Hyperparameter tuning : (Train and evaluate same model with different parameters)

            CREATE EXPERIMENT my_exp WITH(
                model_class = 'sklearn.ensemble.GradientBoostingClassifier',
                experiment_class = 'sklearn.model_selection.GridSearchCV',
                tune_parameters = (n_estimators = ARRAY [16, 32, 2],
                                   learning_rate = ARRAY [0.1,0.01,0.001],
                                   max_depth = ARRAY [3,4,5,10]
                                   ),
                target_column = 'target'
            ) AS (
                SELECT x, y, x*y > 0 AS target
                FROM timeseries
                LIMIT 100
            )

        for automl : (Train different different model with different parameter)

            CREATE EXPERIMENT my_exp WITH (
                automl_class = 'tpot.TPOTClassifier',
                automl_kwargs = (population_size = 2 ,
                                 generations=2,
                                 cv=2,
                                 n_jobs=-1,
                                 use_dask=True,
                                 max_eval_time_mins=1),
                target_column = 'target'
            ) AS (
                SELECT x, y, x*y > 0 AS target
                FROM timeseries
                LIMIT 100
            )
    """

    class_name = "CreateExperiment"

    @staticmethod
    def _import_or_raise(class_path, description):
        """
        Import the class given by ``class_path`` and return it.

        An ImportError is turned into a ValueError that names the kind of
        class (``description``) that could not be loaded; the original
        ImportError is kept as the ``__cause__`` of the new exception.
        """
        try:
            return import_class(class_path)
        except ImportError as err:
            raise ValueError(
                f"Can not import {description} {class_path}. Make sure you spelled it correctly and have installed all packages."
            ) from err

    def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer:
        """
        Run the experiment described by ``rel`` and register its outcome.

        The best model found is registered at the context under the
        experiment name, the result table is registered as experiment
        and also returned wrapped in a DataContainer.
        Nothing is returned if the experiment already exists and
        IF NOT EXISTS was given.

        Raises a RuntimeError if the experiment already exists (and neither
        IF NOT EXISTS nor OR REPLACE was given) and a ValueError for
        missing, empty or not importable parameters.
        """
        create_experiment = rel.create_experiment()
        select = create_experiment.getSelectQuery()
        schema_name = create_experiment.getSchemaName() or context.schema_name
        experiment_name = create_experiment.getExperimentName()
        kwargs = convert_sql_kwargs(create_experiment.getSQLWithOptions())

        if experiment_name in context.schema[schema_name].experiments:
            if create_experiment.getIfNotExists():
                return
            elif not create_experiment.getOrReplace():
                raise RuntimeError(
                    f"A experiment with the name {experiment_name} is already present."
                )

        logger.debug(
            f"Creating Experiment {experiment_name} from query {select} with options {kwargs}"
        )

        model_class = None
        automl_class = None
        experiment_class = None
        if "model_class" in kwargs:
            model_class = kwargs.pop("model_class")
            # when model class was provided, must provide experiment_class also for tuning
            if "experiment_class" not in kwargs:
                raise ValueError(
                    f"Parameters must include a 'experiment_class' parameter for tuning {model_class}."
                )
            experiment_class = kwargs.pop("experiment_class")
        elif "automl_class" in kwargs:
            automl_class = kwargs.pop("automl_class")
        else:
            raise ValueError(
                "Parameters must include a 'model_class' or 'automl_class' parameter."
            )

        target_column = kwargs.pop("target_column", "")
        tune_fit_kwargs = kwargs.pop("tune_fit_kwargs", {})
        parameters = kwargs.pop("tune_parameters", {})
        experiment_kwargs = kwargs.pop("experiment_kwargs", {})
        automl_kwargs = kwargs.pop("automl_kwargs", {})
        logger.info(parameters)

        training_df = context.sql(select)
        if not target_column:
            raise ValueError(
                "Unsupervised Algorithm cannot be tuned Automatically, "
                "Consider providing 'target column'"
            )
        non_target_columns = [
            col for col in training_df.columns if col != target_column
        ]
        X = training_df[non_target_columns]
        y = training_df[target_column]

        if model_class and experiment_class:
            # Short class names are resolved against the GPU or CPU lookup,
            # depending on the type of the training data
            if is_cudf_type(training_df):
                model_class = gpu_classes.get(model_class, model_class)
                experiment_class = gpu_classes.get(experiment_class, experiment_class)
            else:
                model_class = cpu_classes.get(model_class, model_class)
                experiment_class = cpu_classes.get(experiment_class, experiment_class)

            ModelClass = self._import_or_raise(model_class, "model")
            ExperimentClass = self._import_or_raise(experiment_class, "tuner")

            from dask_sql.physical.rel.custom.wrappers import ParallelPostFit

            model = ModelClass()

            search = ExperimentClass(model, {**parameters}, **experiment_kwargs)
            logger.info(tune_fit_kwargs)
            search.fit(
                X.to_dask_array(lengths=True),
                y.to_dask_array(lengths=True),
                **tune_fit_kwargs,
            )
            df = pd.DataFrame(search.cv_results_)
            df["model_class"] = model_class

            context.register_model(
                experiment_name,
                ParallelPostFit(estimator=search.best_estimator_),
                X.columns,
                schema_name=schema_name,
            )
        elif automl_class:
            AutoMLClass = self._import_or_raise(automl_class, "automl model")

            from dask_sql.physical.rel.custom.wrappers import ParallelPostFit

            automl = AutoMLClass(**automl_kwargs)
            # should be avoided if data doesn't fit in memory
            automl.fit(X.compute(), y.compute())
            df = (
                pd.DataFrame(automl.evaluated_individuals_)
                .T.reset_index()
                .rename({"index": "models"}, axis=1)
            )

            context.register_model(
                experiment_name,
                ParallelPostFit(estimator=automl.fitted_pipeline_),
                X.columns,
                schema_name=schema_name,
            )
        else:
            # Reached only when a class option was given as an empty value;
            # without this guard the code below fails on the unbound `df`
            raise ValueError(
                "The 'model_class' and 'experiment_class' parameters, "
                "or the 'automl_class' parameter, must not be empty."
            )

        context.register_experiment(
            experiment_name, experiment_results=df, schema_name=schema_name
        )
        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
================================================
FILE: dask_sql/physical/rel/custom/create_memory_table.py
================================================
import logging
from typing import TYPE_CHECKING
from dask_sql.datacontainer import DataContainer
from dask_sql.physical.rel.base import BaseRelPlugin
if TYPE_CHECKING:
import dask_sql
from dask_sql._datafusion_lib import LogicalPlan
logger = logging.getLogger(__name__)
class CreateMemoryTablePlugin(BaseRelPlugin):
"""
Create a table or view from the given SELECT query
and register it at the context.
The SQL call looks like
CREATE TABLE AS
It sends the select query through the normal parsing
and optimization and conversion before registering it.
Using this SQL is equivalent to just doing
df = context.sql("