Showing preview only (1,365K chars total). Download the full file or copy to clipboard to get everything.
Repository: apache/arrow-julia
Branch: main
Commit: 5ade01aca6de
Files: 80
Total size: 1.3 MB
Directory structure:
gitextract_uitozoi5/
├── .JuliaFormatter.toml
├── .asf.yaml
├── .github/
│ ├── dependabot.yml
│ └── workflows/
│ ├── CompatHelper.yml
│ ├── TagBot.yml
│ ├── ci.yml
│ └── ci_nightly.yml
├── .gitignore
├── LICENSE
├── NOTICE
├── Project.toml
├── README.md
├── codecov.yaml
├── dev/
│ └── release/
│ ├── .dir-locals.el
│ ├── .gitignore
│ ├── README.md
│ ├── check_rat_report.py
│ ├── rat_exclude_files.txt
│ ├── release.sh
│ ├── release_rc.sh
│ ├── run_rat.sh
│ └── verify_rc.sh
├── docs/
│ ├── .gitignore
│ ├── Project.toml
│ ├── make.jl
│ └── src/
│ ├── index.md
│ ├── manual.md
│ └── reference.md
├── src/
│ ├── Arrow.jl
│ ├── ArrowTypes/
│ │ ├── LICENSE.md
│ │ ├── Project.toml
│ │ ├── src/
│ │ │ └── ArrowTypes.jl
│ │ └── test/
│ │ ├── Project.toml
│ │ ├── runtests.jl
│ │ └── tests.jl
│ ├── FlatBuffers/
│ │ ├── FlatBuffers.jl
│ │ ├── builder.jl
│ │ └── table.jl
│ ├── append.jl
│ ├── arraytypes/
│ │ ├── arraytypes.jl
│ │ ├── bool.jl
│ │ ├── compressed.jl
│ │ ├── dictencoding.jl
│ │ ├── fixedsizelist.jl
│ │ ├── list.jl
│ │ ├── map.jl
│ │ ├── primitive.jl
│ │ ├── struct.jl
│ │ ├── unions.jl
│ │ └── views.jl
│ ├── eltypes.jl
│ ├── metadata/
│ │ ├── File.jl
│ │ ├── Flatbuf.jl
│ │ ├── Message.jl
│ │ └── Schema.jl
│ ├── show.jl
│ ├── table.jl
│ ├── utils.jl
│ └── write.jl
└── test/
├── Project.toml
├── arrowjson/
│ ├── datetime.json
│ ├── decimal.json
│ ├── dictionary.json
│ ├── dictionary_unsigned.json
│ ├── map.json
│ ├── nested.json
│ ├── primitive-empty.json
│ ├── primitive.json
│ └── primitive_no_batches.json
├── arrowjson.jl
├── dates.jl
├── integrationtest.jl
├── java_compress_len_neg_one.arrow
├── java_compressed_zero_length.arrow
├── old_zdt.arrow
├── pyarrow_roundtrip.jl
├── reject_reason_trimmed.arrow
├── runtests.jl
├── testappend.jl
└── testtables.jl
================================================
FILE CONTENTS
================================================
================================================
FILE: .JuliaFormatter.toml
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# https://github.com/domluna/JuliaFormatter.jl/blob/master/README.md
whitespace_ops_in_indices = true
remove_extra_newlines = true
whitespace_in_kwargs = false
================================================
FILE: .asf.yaml
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# https://cwiki.apache.org/confluence/display/INFRA/Git+-+.asf.yaml+features
github:
description: "Official Julia implementation of Apache Arrow"
homepage: https://arrow.apache.org/julia/
labels:
- apache-arrow
- julia
features:
issues: true
discussions: true
enabled_merge_buttons:
merge: false
rebase: true
squash: true
protected_branches:
main:
required_linear_history: true
notifications:
commits: commits@arrow.apache.org
issues_status: issues@arrow.apache.org
issues_comment: github@arrow.apache.org
pullrequests: github@arrow.apache.org
discussions: user@arrow.apache.org
# publishes the content of the `asf-site` branch to
# https://arrow.apache.org/julia/
publish:
whoami: asf-site
subdir: julia
================================================
FILE: .github/dependabot.yml
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
================================================
FILE: .github/workflows/CompatHelper.yml
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: CompatHelper
on:
schedule:
- cron: 0 0 * * *
workflow_dispatch:
permissions:
contents: write
pull-requests: write
jobs:
CompatHelper:
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- name: Check if Julia is already available in the PATH
id: julia_in_path
run: which julia
continue-on-error: true
- name: Install Julia, but only if it is not already available in the PATH
uses: julia-actions/setup-julia@v3
with:
version: '1'
arch: ${{ runner.arch }}
if: steps.julia_in_path.outcome != 'success'
- name: "Add the General registry via Git"
run: |
import Pkg
ENV["JULIA_PKG_SERVER"] = ""
Pkg.Registry.add("General")
shell: julia --color=yes {0}
- name: "Install CompatHelper"
run: |
import Pkg
name = "CompatHelper"
uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
version = "3"
Pkg.add(; name, uuid, version)
shell: julia --color=yes {0}
- name: "Run CompatHelper"
run: |
import CompatHelper
CompatHelper.main()
shell: julia --color=yes {0}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
# COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }}
================================================
FILE: .github/workflows/TagBot.yml
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: TagBot
on:
issue_comment:
types:
- created
workflow_dispatch:
jobs:
TagBot:
if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- uses: JuliaRegistries/TagBot@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}
ssh: ${{ secrets.DOCUMENTER_KEY }}
================================================
FILE: .github/workflows/ci.yml
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: CI
on:
pull_request:
push:
branches:
- '**'
- '!dependabot/**'
tags:
- '**'
jobs:
license:
name: Audit licenses
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- uses: actions/checkout@v6
- uses: actions/setup-python@v6
with:
python-version: '3.x'
- name: Run Release audit tool (Rat)
run: dev/release/run_rat.sh .
release:
name: Verify release - ${{ matrix.os }}
runs-on: ${{ matrix.os }}
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
os:
- macos-latest
- ubuntu-latest
steps:
- uses: actions/checkout@v6
- name: Install dependencies
if: matrix.os == 'macos-latest'
run: |
brew install julia subversion
- name: Install dependencies
if: matrix.os == 'ubuntu-latest'
run: |
sudo apt update
sudo apt install -y -V subversion
- name: Create
run: |
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
RELEASE_DEFAULT=0 dev/release/release_rc.sh 100
- uses: actions/cache@v5
with:
path: ~/.julia/artifacts
key: ${{ runner.os }}-release-${{ hashFiles('**/Project.toml') }}
restore-keys: |
${{ runner.os }}-release-
- name: Verify
run: |
version=$(grep -o '^version = ".*"' "Project.toml" | \
sed -e 's/^version = "//g' \
-e 's/"$//g')
VERIFY_DEFAULT=0 dev/release/verify_rc.sh ${version} 100
test:
name: ${{ matrix.pkg.name }} - Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.nthreads }} threads - ${{ github.event_name }}
runs-on: ${{ matrix.os }}
timeout-minutes: 45
strategy:
fail-fast: false
matrix:
pkg:
- name: Arrow.jl
dir: '.'
- name: ArrowTypes.jl
dir: './src/ArrowTypes'
version:
- 'min'
- 'lts'
- '1' # automatically expands to the latest stable 1.x release of Julia
- 'pre' # expands to latest alpha, beta or RC, if available, otherwise same as `1`
os:
- ubuntu-latest
- macos-latest
nthreads:
- 1
- 2
steps:
- uses: actions/checkout@v6
- uses: julia-actions/setup-julia@v3
with:
version: ${{ matrix.version }}
- uses: actions/cache@v5
env:
cache-name: cache-artifacts
with:
path: ~/.julia/artifacts
key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
restore-keys: |
${{ runner.os }}-test-${{ env.cache-name }}-
${{ runner.os }}-test-
${{ runner.os }}-
- uses: julia-actions/julia-buildpkg@v1.6
with:
project: ${{ matrix.pkg.dir }}
- name: Dev local ArrowTypes for Arrow.jl tests
if: matrix.pkg.name == 'Arrow.jl'
shell: julia --project=. {0}
run: |
using Pkg
Pkg.develop(PackageSpec(path="src/ArrowTypes"))
- uses: julia-actions/julia-runtest@v1
env:
JULIA_NUM_THREADS: ${{ matrix.nthreads }}
with:
project: ${{ matrix.pkg.dir }}
- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v6
with:
files: lcov.info
test_monorepo:
name: Monorepo dev - Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
runs-on: ${{ matrix.os }}
timeout-minutes: 30
strategy:
fail-fast: true
matrix:
version:
- '1' # automatically expands to the latest stable 1.x release of Julia
os:
- ubuntu-latest
arch:
- x64
steps:
- uses: actions/checkout@v6
- uses: julia-actions/setup-julia@v3
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- uses: actions/cache@v5
env:
cache-name: cache-artifacts
with:
path: ~/.julia/artifacts
key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
restore-keys: |
${{ runner.os }}-test-${{ env.cache-name }}-
${{ runner.os }}-test-
${{ runner.os }}-
- name: Dev monorepo dependencies
shell: julia --project=monorepo {0}
run: |
using Pkg;
Pkg.develop([PackageSpec(path="."), PackageSpec(path="src/ArrowTypes")])
- name: Run monorepo tests
continue-on-error: false
run: >
julia --color=yes --project=monorepo -e 'using Pkg; Pkg.test("Arrow")'
docs:
name: Documentation
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- uses: actions/checkout@v6
- uses: julia-actions/julia-buildpkg@latest
- uses: julia-actions/julia-docdeploy@latest
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
Format:
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- uses: julia-actions/setup-julia@v3
- uses: actions/checkout@v6
- name: Install JuliaFormatter and format
# This will use the latest version by default but you can set the version like so:
# julia -e 'using Pkg; Pkg.add(PackageSpec(name="JuliaFormatter", version="0.13.0"))'
run: |
julia -e 'using Pkg; Pkg.add(PackageSpec(name="JuliaFormatter"))'
julia -e 'using JuliaFormatter; format(".", verbose=true)'
- name: Format check
run: |
julia -e '
out = Cmd(`git diff --name-only`) |> read |> String
if out == ""
exit(0)
else
@error "Some files have not been formatted !!!"
write(stdout, out)
exit(1)
end'
================================================
FILE: .github/workflows/ci_nightly.yml
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name: Nightly
on:
schedule:
- cron: 0 0 * * 1 # run once a week on Mondays
workflow_dispatch:
jobs:
test:
name: ${{ matrix.pkg.name }} - Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ matrix.nthreads }} threads - ${{ github.event_name }}
runs-on: ${{ matrix.os }}
timeout-minutes: 45
strategy:
fail-fast: false
matrix:
pkg:
- name: Arrow.jl
dir: '.'
- name: ArrowTypes.jl
dir: './src/ArrowTypes'
version:
- 'nightly'
os:
- ubuntu-latest
arch:
- x64
nthreads: [1, 2]
steps:
- uses: actions/checkout@v6
- uses: julia-actions/setup-julia@v3
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- uses: actions/cache@v5
env:
cache-name: cache-artifacts
with:
path: ~/.julia/artifacts
key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
restore-keys: |
${{ runner.os }}-test-${{ env.cache-name }}-
${{ runner.os }}-test-
${{ runner.os }}-
- uses: julia-actions/julia-buildpkg@v1.6
with:
project: ${{ matrix.pkg.dir }}
- uses: julia-actions/julia-runtest@v1
env:
JULIA_NUM_THREADS: ${{ matrix.nthreads }}
with:
project: ${{ matrix.pkg.dir }}
- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v6
with:
files: lcov.info
test_monorepo:
name: Monorepo dev - Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
runs-on: ${{ matrix.os }}
timeout-minutes: 30
strategy:
fail-fast: true
matrix:
version:
- 'nightly'
os:
- ubuntu-latest
arch:
- x64
steps:
- uses: actions/checkout@v6
- uses: julia-actions/setup-julia@v3
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- uses: actions/cache@v5
env:
cache-name: cache-artifacts
with:
path: ~/.julia/artifacts
key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
restore-keys: |
${{ runner.os }}-test-${{ env.cache-name }}-
${{ runner.os }}-test-
${{ runner.os }}-
- name: Dev monorepo dependencies
shell: julia --project=monorepo {0}
run: |
using Pkg;
Pkg.develop([PackageSpec(path="."), PackageSpec(path="src/ArrowTypes")])
- name: Run monorepo tests
continue-on-error: false
run: >
julia --color=yes --project=monorepo -e 'using Pkg; Pkg.test("Arrow")'
================================================
FILE: .gitignore
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
Manifest.toml
Manifest-v*.toml
.DS_STORE
*.jl.cov
*.jl.*.cov
*.jl.mem
test/_scrap.jl
.DS_STORE
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: NOTICE
================================================
Apache Arrow Julia
Copyright 2016-2025 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (https://www.apache.org/).
================================================
FILE: Project.toml
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name = "Arrow"
uuid = "69666777-d1a9-59fb-9406-91d4454c9d45"
authors = ["quinnj <quinn.jacobd@gmail.com>"]
version = "2.8.1"
[deps]
ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd"
BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"
CodecLz4 = "5ba52731-8f18-5e0d-9241-30f10d1ec561"
CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
ConcurrentUtilities = "f0e56b4a-5159-44fe-b623-3e5288b988bb"
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53"
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[compat]
ArrowTypes = "1.1,2"
BitIntegers = "0.2, 0.3"
CodecLz4 = "0.4"
CodecZstd = "0.7, 0.8"
ConcurrentUtilities = "2"
DataAPI = "1"
EnumX = "1"
PooledArrays = "0.5, 1.0"
SentinelArrays = "1"
StringViews = "1"
Tables = "1.1"
TimeZones = "1"
TranscodingStreams = "0.9.12, 0.10, 0.11"
julia = "1.9"
================================================
FILE: README.md
================================================
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Arrow
[docs](https://arrow.apache.org/julia/)
[CI](https://github.com/apache/arrow-julia/actions?query=workflow%3ACI)
[codecov](https://app.codecov.io/gh/apache/arrow-julia)
[deps](https://juliahub.com/ui/Packages/Arrow/QnF3w?t=2)
[version](https://juliahub.com/ui/Packages/Arrow/QnF3w)
[pkgeval](https://juliahub.com/ui/Packages/Arrow/QnF3w)
This is a pure Julia implementation of the [Apache Arrow](https://arrow.apache.org) data standard. This package provides Julia `AbstractVector` objects for
referencing data that conforms to the Arrow standard. This allows users to seamlessly interface Arrow formatted data with a great deal of existing Julia code.
Please see this [document](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout) for a description of the Arrow memory layout.
## Installation
The package can be installed by typing the following in a Julia REPL:
```julia
julia> using Pkg; Pkg.add("Arrow")
```
## Local Development
When developing on Arrow.jl it is recommended that you run the following to ensure that any
changes to ArrowTypes.jl are immediately available to Arrow.jl without requiring a release:
```sh
julia --project -e 'using Pkg; Pkg.develop(path="src/ArrowTypes")'
```
## Format Support
This implementation supports the 1.0 version of the specification, including support for:
* All primitive data types
* All nested data types
* Dictionary encodings and messages
* Extension types
* Streaming, file, record batch, and replacement and isdelta dictionary messages
It currently doesn't include support for:
* Tensors or sparse tensors
* Flight RPC
* C data interface
Third-party data formats:
* CSV, Parquet, and Avro support via the existing [CSV.jl](https://github.com/JuliaData/CSV.jl), [Parquet.jl](https://github.com/JuliaIO/Parquet.jl) and [Avro.jl](https://github.com/JuliaData/Avro.jl) packages
* Other Tables.jl-compatible packages automatically supported ([DataFrames.jl](https://github.com/JuliaData/DataFrames.jl), [JSONTables.jl](https://github.com/JuliaData/JSONTables.jl), [JuliaDB.jl](https://github.com/JuliaData/JuliaDB.jl), [SQLite.jl](https://github.com/JuliaDatabases/SQLite.jl), [MySQL.jl](https://github.com/JuliaDatabases/MySQL.jl), [JDBC.jl](https://github.com/JuliaDatabases/JDBC.jl), [ODBC.jl](https://github.com/JuliaDatabases/ODBC.jl), [XLSX.jl](https://github.com/felipenoris/XLSX.jl), etc.)
* No current Julia packages support ORC
See the [full documentation](https://arrow.apache.org/julia/) for details on reading and writing arrow data.
================================================
FILE: codecov.yaml
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
---
codecov:
notify:
# Wait for all "test" matrix jobs
after_n_builds: 6
================================================
FILE: dev/release/.dir-locals.el
================================================
;;; Licensed to the Apache Software Foundation (ASF) under one
;;; or more contributor license agreements. See the NOTICE file
;;; distributed with this work for additional information
;;; regarding copyright ownership. The ASF licenses this file
;;; to you under the Apache License, Version 2.0 (the
;;; "License"); you may not use this file except in compliance
;;; with the License. You may obtain a copy of the License at
;;;
;;; http://www.apache.org/licenses/LICENSE-2.0
;;;
;;; Unless required by applicable law or agreed to in writing,
;;; software distributed under the License is distributed on an
;;; "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
;;; KIND, either express or implied. See the License for the
;;; specific language governing permissions and limitations
;;; under the License.
((sh-mode . ((indent-tabs-mode . nil)
(sh-basic-offset . 2))))
================================================
FILE: dev/release/.gitignore
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
/apache-rat-*.jar
/dist/
/filtered_rat.txt
/rat.xml
================================================
FILE: dev/release/README.md
================================================
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Release
## Overview
1. Test the revision to be released
2. Increment version number in `Project.toml`
3. Prepare RC and vote (detailed later)
4. Publish (detailed later)
### Prepare RC and vote
Run `dev/release/release_rc.sh` in a working copy of `git@github.com:apache/arrow-julia`, not your fork:
```console
$ git clone git@github.com:apache/arrow-julia.git
$ cd arrow-julia
$ dev/release/release_rc.sh ${RC}
(Send a vote email to dev@arrow.apache.org.
You can use a draft shown by release_rc.sh for the email.)
```
Here is an example to release RC1:
```console
$ dev/release/release_rc.sh 1
```
The argument of `release_rc.sh` is the RC number. If RC1 has a problem, we increment the RC number to RC2, RC3, and so on.
Requirements to run `release_rc.sh`:
* You must be an Apache Arrow committer or PMC member
* You must prepare your PGP key for signing
If you don't have a PGP key, https://infra.apache.org/release-signing.html#generate may be helpful.
Your PGP key must be registered in the following files:
* https://dist.apache.org/repos/dist/dev/arrow/KEYS
* https://dist.apache.org/repos/dist/release/arrow/KEYS
See the header comment of each file for how to add a PGP key.
Apache Arrow committers can update these files with a Subversion client using their ASF account, e.g.:
```console
$ svn co https://dist.apache.org/repos/dist/dev/arrow
$ cd arrow
$ editor KEYS
$ svn ci KEYS
```
### Publish
We need to do the following to publish a new release:
* Publish to apache.org
* Publish to the Julia General registry
Run `dev/release/release.sh` to publish to apache.org:
```console
$ dev/release/release.sh ${VERSION} ${RC}
```
Here is an example to release 2.2.1 RC1:
```console
$ dev/release/release.sh 2.2.1 1
```
Add the release to ASF's report database via [Apache Committee Report Helper](https://reporter.apache.org/addrelease.html?arrow).
To publish the release to the Julia General registry, navigate to the GitHub commit where the project version was incremented in the Project.toml file (step 2 above), then post a comment on the commit with the following:
```markdown
@JuliaRegistrator register
```
JuliaRegistrator will respond saying it has opened a pull request to the General registry and under normal circumstances, will be merged automatically.
If ArrowTypes is also registered, we also need to post a comment on the commit with the following:
```markdown
@JuliaRegistrator register subdir=src/ArrowTypes
```
### Verify
We have a script to verify a RC.
You must install the following commands to use the script:
* `curl`
* `gpg`
* `shasum` or `sha256sum`/`sha512sum`
* `tar`
You don't need to install Julia. If Julia isn't installed on the system, the latest Julia is downloaded automatically and used only for the verification.
To verify a RC, run the following command line:
```console
$ dev/release/verify_rc.sh ${VERSION} ${RC}
```
Here is an example to verify 2.2.1 RC1:
```console
$ dev/release/verify_rc.sh 2.2.1 1
```
If the verification succeeds, `RC looks good!` is shown.
================================================
FILE: dev/release/check_rat_report.py
================================================
#!/usr/bin/env python3
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import fnmatch
import re
import sys
import xml.etree.ElementTree as ET
def _load_exclude_globs(path):
    """Return the glob patterns listed in ``path``, one per line.

    Blank lines and lines starting with ``#`` (such as the license header at
    the top of the exclude list) are skipped so they are never used as globs.
    The file is closed as soon as it has been read.
    """
    with open(path, "r") as exclude_file:
        stripped = (line.strip() for line in exclude_file)
        return [glob for glob in stripped if glob and not glob.startswith("#")]


def _find_unapproved(resources, globs):
    """Yield ``(clean_name, raw_name, approval)`` for each rejected resource.

    A resource is rejected when it has a ``license-approval`` child whose
    ``name`` attribute is not ``'true'`` and its cleaned name matches none of
    ``globs``.
    """
    for resource in resources:
        approvals = resource.findall('license-approval')
        if not approvals or approvals[0].attrib['name'] == 'true':
            continue
        raw_name = resource.attrib['name']
        # Drop the first path component before matching against the globs.
        clean_name = re.sub('^[^/]+/', '', raw_name)
        if any(fnmatch.fnmatch(clean_name, glob) for glob in globs):
            continue
        yield clean_name, raw_name, approvals[0].attrib['name']


def main(argv):
    """Check a Rat XML report against a list of exclude globs.

    ``argv`` has the shape of ``sys.argv``: program name, exclude-globs file,
    Rat report XML file. One ``NOT APPROVED`` line is written to stdout per
    rejected resource.

    Returns the process exit status: 0 when every resource is approved or
    excluded (``OK`` is printed), 1 on a usage error or when at least one
    resource is rejected.
    """
    if len(argv) != 3:
        sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" %
                         argv[0])
        return 1

    globs = _load_exclude_globs(argv[1])
    root = ET.parse(argv[2]).getroot()

    all_ok = True
    for clean_name, raw_name, approval in _find_unapproved(
            root.findall('resource'), globs):
        sys.stdout.write("NOT APPROVED: %s (%s): %s\n" % (
            clean_name, raw_name, approval))
        all_ok = False

    if not all_ok:
        return 1

    print('OK')
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
================================================
FILE: dev/release/rat_exclude_files.txt
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
Manifest.toml
dev/release/apache-rat-*.jar
dev/release/filtered_rat.txt
dev/release/rat.xml
test/arrowjson/*.json
================================================
FILE: dev/release/release.sh
================================================
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Promote a voted release candidate to a release on dist.apache.org, then
# prune the remaining RCs and every release except the newest one.
#
# Usage: release.sh <version> <rc>
set -eu

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <version> <rc>"
  echo " e.g.: $0 2.2.1 1"
  exit 1
fi

version=$1
rc=$2

dist_dev_url=https://dist.apache.org/repos/dist/dev/arrow
dist_release_url=https://dist.apache.org/repos/dist/release/arrow
rc_id="apache-arrow-julia-${version}-rc${rc}"
release_id="arrow-julia-${version}"

echo "Move from dev/ to release/"
svn mv -m "Apache Arrow Julia ${version}" \
  ${dist_dev_url}/${rc_id} \
  ${dist_release_url}/${release_id}

# Everything still listed under dev/ with the Julia RC prefix is stale now.
echo "Remove all RCs"
old_rcs=$(svn ls ${dist_dev_url}/ |
  grep -E '^apache-arrow-julia-' |
  sort --version-sort --reverse)
for stale_rc in $old_rcs; do
  echo "Remove RC ${stale_rc}"
  svn delete \
    -m "Remove old Apache Arrow Julia RC: ${stale_rc}" \
    ${dist_dev_url}/${stale_rc}
done

# Sorted newest first; `tail -n +2` drops the first entry so it is kept.
echo "Keep only the latest versions"
old_releases=$(svn ls ${dist_release_url}/ |
  grep -E '^arrow-julia-' |
  sort --version-sort --reverse |
  tail -n +2)
for stale_release in $old_releases; do
  echo "Remove old release ${stale_release}"
  svn delete \
    -m "Remove old Apache Arrow Julia release: ${stale_release}" \
    ${dist_release_url}/${stale_release}
done

echo "Success! The release is available here:"
echo " https://dist.apache.org/repos/dist/release/arrow/${release_id}"
echo
echo "Add this release to ASF's report database:"
echo " https://reporter.apache.org/addrelease.html?arrow"
================================================
FILE: dev/release/release_rc.sh
================================================
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Cut a release candidate: tag the version found in Project.toml, build the
# source archive from that tag, run the Rat license check, sign and checksum
# the archive, upload it to the dev dist area and print a draft vote email.
#
# Usage: release_rc.sh <rc>
#
# Individual steps can be switched off with the RELEASE_* environment
# variables defined below.
set -eu
# Directory holding this script, and the directory two levels above it.
SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)"
# Exactly one argument, the RC number, is required.
if [ "$#" -ne 1 ]; then
echo "Usage: $0 <rc>"
echo " e.g.: $0 1"
exit 1
fi
rc=$1
# Step toggles. Each one falls back to RELEASE_DEFAULT (1 unless already set);
# a step runs only when its toggle is greater than 0.
: ${RELEASE_DEFAULT:=1}
: ${RELEASE_CLEANUP:=${RELEASE_DEFAULT}}
: ${RELEASE_PULL:=${RELEASE_DEFAULT}}
: ${RELEASE_PUSH_TAG:=${RELEASE_DEFAULT}}
: ${RELEASE_SIGN:=${RELEASE_DEFAULT}}
: ${RELEASE_UPLOAD:=${RELEASE_DEFAULT}}
# All relative paths below are resolved from SOURCE_TOP_DIR.
cd "${SOURCE_TOP_DIR}"
# Pulling or pushing a tag is only allowed when origin is exactly the
# apache/arrow-julia SSH URL; any other origin aborts the script.
if [ ${RELEASE_PULL} -gt 0 -o ${RELEASE_PUSH_TAG} -gt 0 ]; then
git_origin_url="$(git remote get-url origin)"
if [ "${git_origin_url}" != "git@github.com:apache/arrow-julia.git" ]; then
echo "This script must be ran with working copy of apache/arrow-julia."
echo "The origin's URL: ${git_origin_url}"
exit 1
fi
fi
if [ ${RELEASE_PULL} -gt 0 ]; then
echo "Ensure using the latest commit"
git checkout main
git pull --rebase --prune
fi
# Extract the value between the quotes of the `version = "..."` line in Project.toml.
version=$(grep -o '^version = ".*"' "Project.toml" | \
sed -e 's/^version = "//' \
-e 's/"$//')
rc_tag="v${version}-rc${rc}"
echo "Tagging for RC: ${rc_tag}"
# The annotated tag is always created locally; pushing it is optional.
git tag -a -m "${version} RC${rc}" "${rc_tag}"
if [ ${RELEASE_PUSH_TAG} -gt 0 ]; then
git push origin "${rc_tag}"
fi
# Commit hash the tag resolves to; used for the archive and in the email.
rc_hash="$(git rev-list --max-count=1 "${rc_tag}")"
# Names and locations of the artifacts. The archive itself is named after the
# version only; the RC number appears in the directory that contains it.
id="apache-arrow-julia-${version}"
rc_id="${id}-rc${rc}"
dev_dist_url="https://dist.apache.org/repos/dist/dev/arrow"
dev_dist_dir="dev/release/dist"
tar_gz="${id}.tar.gz"
tar_gz_path="${dev_dist_dir}/${rc_id}/${tar_gz}"
rc_url="${dev_dist_url}/${rc_id}/"
echo "Checking out ${dev_dist_url}"
# Start from a fresh, empty-depth checkout of the dev dist directory.
rm -rf "${dev_dist_dir}"
svn co --depth=empty "${dev_dist_url}" "${dev_dist_dir}"
echo "Attempting to create ${tar_gz} from tag ${rc_tag}"
mkdir -p "$(dirname "${tar_gz_path}")"
# Archive the tagged commit with every entry placed under the "${id}/" prefix.
git archive "${rc_hash}" --prefix "${id}/" --output "${tar_gz_path}"
# The remaining artifact steps run inside dev/release/dist/<rc_id>.
pushd "${dev_dist_dir}/${rc_id}"
echo "Running Rat license checker on ${tar_gz}"
# From dev/release/dist/<rc_id>, ../../ is dev/release, where run_rat.sh lives.
../../run_rat.sh ${tar_gz}
if [ ${RELEASE_SIGN} -gt 0 ]; then
echo "Signing tar.gz and creating checksums"
gpg --armor --output ${tar_gz}.asc --detach-sig ${tar_gz}
fi
# Use shasum when it is available, otherwise sha256sum/sha512sum.
if type shasum >/dev/null 2>&1; then
sha256_generate="shasum -a 256"
sha512_generate="shasum -a 512"
else
sha256_generate="sha256sum"
sha512_generate="sha512sum"
fi
# Checksums are written regardless of RELEASE_SIGN.
${sha256_generate} ${tar_gz} > ${tar_gz}.sha256
${sha512_generate} ${tar_gz} > ${tar_gz}.sha512
if [ ${RELEASE_UPLOAD} -gt 0 ]; then
echo "Uploading to ${rc_url}"
svn add .
svn ci -m "Apache Arrow Julia ${version} ${rc}"
fi
popd
if [ ${RELEASE_CLEANUP} -gt 0 ]; then
echo "Removing temporary directory"
rm -rf "${dev_dist_dir}"
fi
# Print a ready-to-send vote email; version, rc, rc_hash and rc_url are
# expanded inside the here-document.
echo "Draft email for dev@arrow.apache.org mailing list"
echo ""
echo "---------------------------------------------------------"
cat <<MAIL
To: dev@arrow.apache.org
Subject: [VOTE][Julia] Release Apache Arrow Julia ${version} RC${rc}
Hi,
I would like to propose the following release candidate (RC${rc}) of
Apache Arrow Julia version ${version}.
This release candidate is based on commit:
${rc_hash} [1]
The source release rc${rc} is hosted at [2].
Please download, verify checksums and signatures, run the unit tests,
and vote on the release. See [3] for how to validate a release candidate.
The vote will be open for at least 24 hours.
[ ] +1 Release this as Apache Arrow Julia ${version}
[ ] +0
[ ] -1 Do not release this as Apache Arrow Julia ${version} because...
[1]: https://github.com/apache/arrow-julia/tree/${rc_hash}
[2]: ${rc_url}
[3]: https://github.com/apache/arrow-julia/blob/main/dev/release/README.md#verify
MAIL
echo "---------------------------------------------------------"
================================================
FILE: dev/release/run_rat.sh
================================================
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Run the Apache Rat license checker on an archive and fail when it reports
# files that are neither approved nor listed in rat_exclude_files.txt.
#
# Usage: run_rat.sh <archive>
#
# The Rat jar is downloaded next to this script on first use. rat.xml and
# filtered_rat.txt are written next to this script as well.
set -eu

if [ "$#" -lt 1 ]; then
  echo "Usage: $0 <archive>"
  exit 1
fi

RELEASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

RAT_VERSION=0.13
RAT_JAR="${RELEASE_DIR}/apache-rat-${RAT_VERSION}.jar"
if [ ! -f "${RAT_JAR}" ]; then
  # Download under a temporary name and rename only after curl succeeded, so
  # an interrupted transfer can never be mistaken for a complete jar by the
  # `-f` test above on a later run.
  RAT_JAR_TMP="${RAT_JAR}.tmp"
  if ! curl \
         --fail \
         --output "${RAT_JAR_TMP}" \
         --show-error \
         --silent \
         "https://repo1.maven.org/maven2/org/apache/rat/apache-rat/${RAT_VERSION}/apache-rat-${RAT_VERSION}.jar"; then
    rm -f "${RAT_JAR_TMP}"
    exit 1
  fi
  mv "${RAT_JAR_TMP}" "${RAT_JAR}"
fi

# Keep the command as an array so paths containing spaces stay single words.
RAT=(java -jar "${RAT_JAR}" -x)
RAT_XML="${RELEASE_DIR}/rat.xml"
"${RAT[@]}" "$1" > "${RAT_XML}"

FILTERED_RAT_TXT="${RELEASE_DIR}/filtered_rat.txt"
if ${PYTHON:-python3} \
     "${RELEASE_DIR}/check_rat_report.py" \
     "${RELEASE_DIR}/rat_exclude_files.txt" \
     "${RAT_XML}" > \
     "${FILTERED_RAT_TXT}"; then
  echo "No unapproved licenses"
else
  cat "${FILTERED_RAT_TXT}"
  # grep -c exits non-zero when the count is 0; `|| true` keeps `set -e` from
  # aborting before the summary line is printed.
  N_UNAPPROVED=$(grep -c "NOT APPROVED" "${FILTERED_RAT_TXT}" || true)
  echo "${N_UNAPPROVED} unapproved licenses. Check Rat report: ${RAT_XML}"
  exit 1
fi
================================================
FILE: dev/release/verify_rc.sh
================================================
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Verify a release candidate: download it, check signature and checksums,
# and run the package tests from the unpacked source archive.
#
# Usage: verify_rc.sh <version> <rc>
set -eu

SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Every expansion is quoted so that a checkout path containing spaces is
# passed to dirname as a single operand.
TOP_SOURCE_DIR="$(dirname "$(dirname "${SOURCE_DIR}")")"

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <version> <rc>"
  echo " e.g.: $0 2.2.1 1"
  exit 1
fi

# Tracing and pipeline failure detection start only after the usage check.
set -o pipefail
set -x

VERSION="$1"
RC="$2"

ARROW_DIST_URL="https://dist.apache.org/repos/dist/dev/arrow"
ARCHIVE_BASE_NAME="apache-arrow-julia-${VERSION}"

# Toggles, overridable from the environment. VERIFY_DOWNLOAD and VERIFY_SIGN
# fall back to VERIFY_DEFAULT; a value greater than 0 enables the step.
: "${VERIFY_DEFAULT:=1}"
: "${VERIFY_DOWNLOAD:=${VERIFY_DEFAULT}}"
: "${VERIFY_FORCE_USE_JULIA_BINARY:=0}"
: "${VERIFY_SIGN:=${VERIFY_DEFAULT}}"
# Download "$1", a path relative to ARROW_DIST_URL, into the current
# directory under its remote file name.
download_dist_file() {
  local dist_file_url="${ARROW_DIST_URL}/$1"
  # -f fail on HTTP errors, -L follow redirects, -O keep the remote name,
  # -S show errors, -s otherwise silent.
  curl -f -L -O -S -s "${dist_file_url}"
}
# Fetch the RC artifact "$1" into the current directory: downloaded from the
# dist server when VERIFY_DOWNLOAD is greater than 0, otherwise copied from
# ${SOURCE_DIR}/dist.
download_rc_file() {
  local rc_relative_path="apache-arrow-julia-${VERSION}-rc${RC}/$1"
  if [ ${VERIFY_DOWNLOAD} -gt 0 ]; then
    download_dist_file "${rc_relative_path}"
    return
  fi
  cp "${SOURCE_DIR}/dist/${rc_relative_path}" "$1"
}
# Download the KEYS file and import it into gpg. Does nothing unless
# VERIFY_SIGN is greater than 0.
import_gpg_keys() {
  [ ${VERIFY_SIGN} -gt 0 ] || return 0
  download_dist_file KEYS
  gpg --import KEYS
}
# Pick the checksum verification commands: shasum when it is available,
# sha256sum/sha512sum otherwise.
if ! type shasum >/dev/null 2>&1; then
  sha256_verify="sha256sum -c"
  sha512_verify="sha512sum -c"
else
  sha256_verify="shasum -a 256 -c"
  sha512_verify="shasum -a 512 -c"
fi
# Fetch the source archive and check it: the detached signature first (only
# when VERIFY_SIGN is greater than 0), then the SHA-256 and SHA-512 checksums.
fetch_archive() {
  local archive="${ARCHIVE_BASE_NAME}.tar.gz"

  download_rc_file ${archive}
  if [ ${VERIFY_SIGN} -gt 0 ]; then
    download_rc_file ${archive}.asc
    gpg --verify ${archive}.asc ${archive}
  fi

  download_rc_file ${archive}.sha256
  ${sha256_verify} ${archive}.sha256

  download_rc_file ${archive}.sha512
  ${sha512_verify} ${archive}.sha512
}
# Prepare the working directory VERIFY_TMPDIR.
#
# When VERIFY_TMPDIR is already set it is created if needed and no trap is
# installed. Otherwise a fresh directory is made with mktemp using "$1" as the
# name template prefix, and an EXIT trap removes it after a successful run or
# reports its location after a failed one.
setup_tmpdir() {
  cleanup() {
    if [ "${VERIFY_SUCCESS}" != "yes" ]; then
      echo "Failed to verify release candidate. See ${VERIFY_TMPDIR} for details."
    else
      rm -rf "${VERIFY_TMPDIR}"
    fi
  }

  if [ -n "${VERIFY_TMPDIR:-}" ]; then
    mkdir -p "${VERIFY_TMPDIR}"
    return
  fi

  VERIFY_TMPDIR=$(mktemp -d -t "$1.XXXXX")
  trap cleanup EXIT
}
# Print the version of the latest Julia release, taken from the first
# "tag_name" entry of the GitHub releases API response with its leading "v"
# and surrounding quotes removed.
latest_julia_version() {
  local releases_api_url=https://api.github.com/repos/JuliaLang/julia/releases/latest
  curl -f -L -S -s ${releases_api_url} |
    grep -o '"tag_name": "v.*"' |
    head -n 1 |
    sed -e 's/^"tag_name": "v//g' -e 's/"$//g'
}
# Make a `julia` executable available on PATH.
#
# Unless VERIFY_FORCE_USE_JULIA_BINARY is greater than 0, an already working
# `julia` is used as is. Otherwise the latest released Julia binary for the
# current OS and architecture is downloaded into the current directory and
# unpacked, its bin directory is put at the front of PATH, and
# JULIA_DEPOT_PATH is pointed at ./.julia.
#
# Exits with status 1 when no download URL can be built for this platform.
ensure_julia() {
  if [ ${VERIFY_FORCE_USE_JULIA_BINARY} -le 0 ]; then
    if julia --version; then
      return
    fi
  fi

  local julia_binary_url=https://julialang-s3.julialang.org/bin
  # Declare and assign in two steps: `local var=$(cmd)` returns the status of
  # `local`, which would hide a failure of the version lookup from `set -e`.
  local julia_version
  julia_version=$(latest_julia_version)
  local julia_version_series=${julia_version%.*}
  case "$(uname)" in
    Darwin)
      julia_binary_url+="/mac"
      case "$(arch)" in
        arm64)
          julia_binary_url+="/aarch64"
          julia_binary_url+="/${julia_version_series}"
          julia_binary_url+="/julia-${julia_version}-macaarch64.tar.gz"
          ;;
        i386)
          julia_binary_url+="/x64"
          julia_binary_url+="/${julia_version_series}"
          julia_binary_url+="/julia-${julia_version}-mac64.tar.gz"
          ;;
        *)
          echo "You must install Julia manually on $(uname) $(arch)"
          # Stop here instead of downloading from an incomplete URL.
          exit 1
          ;;
      esac
      ;;
    Linux)
      julia_binary_url+="/linux"
      case "$(arch)" in
        aarch64)
          julia_binary_url+="/aarch64"
          ;;
        x86_64)
          julia_binary_url+="/x64"
          ;;
        *)
          echo "You must install Julia manually on $(uname) $(arch)"
          # Stop here instead of downloading from an incomplete URL.
          exit 1
          ;;
      esac
      julia_binary_url+="/${julia_version_series}"
      julia_binary_url+="/julia-${julia_version}-linux-$(arch).tar.gz"
      ;;
    *)
      echo "You must install Julia manually on $(uname)"
      exit 1
      ;;
  esac

  julia_binary_tar_gz=$(basename "${julia_binary_url}")
  curl \
    --fail \
    --location \
    --output "${julia_binary_tar_gz}" \
    --show-error \
    --silent \
    "${julia_binary_url}"
  tar xf "${julia_binary_tar_gz}"
  # The glob matches the bin/julia of the directory unpacked from the tarball.
  julia_path=$(echo julia-*/bin/julia)
  PATH="$(pwd)/$(dirname ${julia_path}):${PATH}"
  export JULIA_DEPOT_PATH="$(pwd)/.julia"
}
# Build and test ArrowTypes from src/ArrowTypes, then build and test the
# top-level package from the current directory.
test_source_distribution() {
  local arrow_types_script='import Pkg; Pkg.build(); Pkg.test()'
  local arrow_script='import Pkg; Pkg.develop(path="src/ArrowTypes"); Pkg.build(); Pkg.test()'

  pushd src/ArrowTypes
  julia --project -e "${arrow_types_script}"
  popd

  # Dev the local ArrowTypes so the version shipped in this release is used
  # instead of the one from the registry.
  julia --project -e "${arrow_script}"
}
# Main flow. VERIFY_SUCCESS stays "no" until every step below has passed; the
# cleanup function defined in setup_tmpdir reads it when it runs.
VERIFY_SUCCESS=no
setup_tmpdir "arrow-julia-${VERSION}-${RC}"
echo "Working in sandbox ${VERIFY_TMPDIR}"
# Everything from here on is downloaded and unpacked inside VERIFY_TMPDIR.
cd "${VERIFY_TMPDIR}"
# Import the signing keys first (a no-op unless VERIFY_SIGN is greater than 0),
# then fetch the archive and check it.
import_gpg_keys
fetch_archive
tar xf ${ARCHIVE_BASE_NAME}.tar.gz
# A julia executable must be on PATH before the tests are run.
ensure_julia
# Run the tests from inside the unpacked source tree.
pushd ${ARCHIVE_BASE_NAME}
test_source_distribution
popd
VERIFY_SUCCESS=yes
echo "RC looks good!"
================================================
FILE: docs/.gitignore
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
build/
site/
================================================
FILE: docs/Project.toml
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
[deps]
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
[compat]
Documenter = "1"
================================================
FILE: docs/make.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
using Documenter
using Arrow
# Build the HTML documentation for the Arrow module.
makedocs(;
    sitename="Arrow.jl",
    modules=[Arrow],
    repo=Remotes.GitHub("apache", "arrow-julia"),
    pages=[
        "Home" => "index.md",
        "User Manual" => "manual.md",
        "API Reference" => "reference.md",
    ],
    format=Documenter.HTML(;
        canonical="https://arrow.apache.org/julia/",
        assets=String[],
        # Pretty URLs are enabled only when the CI environment variable is "true".
        prettyurls=(get(ENV, "CI", "false") == "true"),
    ),
)

# Publish the built documentation to the asf-site branch; main is the
# development branch.
deploydocs(;
    branch="asf-site",
    devbranch="main",
    repo="github.com/apache/arrow-julia",
)
================================================
FILE: docs/src/index.md
================================================
```@raw html
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
```
# Arrow.jl
```@contents
Pages = ["manual.md", "reference.md"]
Depth = 3
```
```@docs
Arrow
```
================================================
FILE: docs/src/manual.md
================================================
```@raw html
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
```
# User Manual
The goal of this documentation is to provide a brief introduction to the arrow data format, then provide a walk-through of the functionality provided in the Arrow.jl Julia package, with an aim to expose a little of the machinery "under the hood" to help explain how things work and how that influences real-world use-cases for the arrow data format.
The best place to learn about the Apache arrow project is [the website itself](https://arrow.apache.org/), specifically the data format [specification](https://arrow.apache.org/docs/format/Columnar.html). Put briefly, the arrow project provides a formal specification for how columnar, "table" data can be laid out efficiently in memory to standardize and maximize the ability to share data across languages/platforms. In the current [apache/arrow GitHub repository](https://github.com/apache/arrow), language implementations exist for C++, Java, Go, Javascript, Rust, to name a few. Other database vendors and data processing frameworks/applications have also built support for the arrow format, allowing for a wide breadth of possibility for applications to "speak the data language" of arrow.
The [Arrow.jl](https://github.com/apache/arrow-julia) Julia package is another implementation, allowing the ability to both read and write data in the arrow format. As a data format, arrow specifies an exact memory layout to be used for columnar table data, and as such, "reading" involves custom Julia objects ([`Arrow.Table`](@ref) and [`Arrow.Stream`](@ref)), which read the *metadata* of an "arrow memory blob", then *wrap* the array data contained therein, having learned the type and size, amongst other properties, from the metadata. Let's take a closer look at what this "reading" of arrow memory really means/looks like.
## Support for generic path-like types
Arrow.jl attempts to support any path-like type wherever a function takes a path as an argument. The Arrow.jl API should generically work as long as the type supports:
- `Base.open(path, mode)::I where I <: IO`
When a custom `IO` subtype is returned (`I`) then the following methods also need to be defined:
- `Base.read(io::I, ::Type{UInt8})` or `Base.read(io::I)`
- `Base.write(io::I, x)`
## Reading arrow data
After installing the Arrow.jl Julia package (via `] add Arrow`), and if you have some arrow data, let's say a file named `data.arrow` generated from the [`pyarrow`](https://arrow.apache.org/docs/python/) library (a Python library for interfacing with arrow data), you can then read that arrow data into a Julia session by doing:
```julia
using Arrow
table = Arrow.Table("data.arrow")
```
### `Arrow.Table`
The type of `table` in this example will be an `Arrow.Table`. When "reading" the arrow data, `Arrow.Table` first ["mmapped"](https://en.wikipedia.org/wiki/Mmap) the `data.arrow` file, which is an important technique for dealing with data larger than available RAM on a system. By "mmapping" a file, the OS doesn't actually load the entire file contents into RAM at the same time, but file contents are "swapped" into RAM as different regions of a file are requested. Once "mmapped", `Arrow.Table` then inspected the metadata in the file to determine the number of columns, their names and types, at which byte offset each column begins in the file data, and even how many "batches" are included in this file (arrow tables may be partitioned into one or more "record batches" each containing portions of the data). Armed with all the appropriate metadata, `Arrow.Table` then created custom array objects ([`Arrow.ArrowVector`](@ref)), which act as "views" into the raw arrow memory bytes. This is a significant point in that no extra memory is allocated for "data" when reading arrow data. This is in contrast to if we wanted to read data from a csv file as columns into Julia structures; we would need to allocate those array structures ourselves, then parse the file, "filling in" each element of the array with the data we parsed from the file. Arrow data, on the other hand, is *already laid out in memory or on disk* in a binary format, and as long as we have the metadata to interpret the raw bytes, we can figure out whether to treat those bytes as a `Vector{Float64}`, etc. A sample of the kinds of arrow array types you might see when deserializing arrow data, include:
* [`Arrow.Primitive`](@ref): the most common array type for simple, fixed-size elements like integers, floats, time types, and decimals
* [`Arrow.List`](@ref): an array type where its own elements are also arrays of some kind, like string columns, where each element can be thought of as an array of characters
* [`Arrow.FixedSizeList`](@ref): similar to the `List` type, but where each array element has a fixed number of elements itself; you can think of this like a `Vector{NTuple{N, T}}`, where `N` is the fixed-size width
* [`Arrow.Map`](@ref): an array type where each element is like a Julia `Dict`; a list of key value pairs like a `Vector{Dict}`
* [`Arrow.Struct`](@ref): an array type where each element is an instance of a custom struct, i.e. an ordered collection of named & typed fields, kind of like a `Vector{NamedTuple}`
* [`Arrow.DenseUnion`](@ref): an array type where elements may be of several different types, stored compactly; can be thought of like `Vector{Union{A, B}}`
* [`Arrow.SparseUnion`](@ref): another array type where elements may be of several different types, but stored as if made up of identically lengthed child arrays for each possible type (less memory efficient than `DenseUnion`)
* [`Arrow.DictEncoded`](@ref): a special array type where values are "dictionary encoded", meaning the list of unique, possible values for an array are stored internally in an "encoding pool", whereas each stored element of the array is just an integer "code" to index into the encoding pool for the actual value.
And while these custom array types do subtype `AbstractArray`, there is no current support for `setindex!`. Remember, these arrays are "views" into the raw arrow bytes, so for array types other than `Arrow.Primitive`, it gets pretty tricky to allow manipulating those raw arrow bytes. Nevertheless, it's as simple as calling `copy(x)` where `x` is any `ArrowVector` type, and a normal Julia `Vector` type will be fully materialized (which would then allow mutating/manipulating values).
So, what can you do with an `Arrow.Table` full of data? Quite a bit actually!
Because `Arrow.Table` implements the [Tables.jl](https://juliadata.github.io/Tables.jl/stable/) interface, it opens up a world of integrations for using arrow data. A few examples include:
* `df = DataFrame(Arrow.Table(file))`: Build a [`DataFrame`](https://juliadata.github.io/DataFrames.jl/stable/), using the arrow vectors themselves; this allows utilizing a host of DataFrames.jl functionality directly on arrow data; grouping, joining, selecting, etc.
* `df = copy(DataFrame(Arrow.Table(file)))`: Build a [`DataFrame`](https://juliadata.github.io/DataFrames.jl/stable/), where the columns are regular in-memory vectors (specifically, `Base.Vector`s and/or `PooledVector`s). This requires that you have enough memory to load the entire `DataFrame` into memory.
* `Tables.datavaluerows(Arrow.Table(file)) |> @map(...) |> @filter(...) |> DataFrame`: use [`Query.jl`'s](https://www.queryverse.org/Query.jl/stable/standalonequerycommands/) row-processing utilities to map, group, filter, mutate, etc. directly over arrow data.
* `Arrow.Table(file) |> SQLite.load!(db, "arrow_table")`: load arrow data directly into an sqlite database/table, where sql queries can be executed on the data
* `Arrow.Table(file) |> CSV.write("arrow.csv")`: write arrow data out to a csv file
A full list of Julia packages leveraging the Tables.jl interface can be found [here](https://github.com/JuliaData/Tables.jl/blob/master/INTEGRATIONS.md).
Apart from letting other packages have all the fun, an `Arrow.Table` itself can be plenty useful. For example, with `tbl = Arrow.Table(file)`:
* `tbl[1]`: retrieve the first column via indexing; the number of columns can be queried via `length(tbl)`
* `tbl[:col1]` or `tbl.col1`: retrieve the column named `col1`, either via indexing with the column name given as a `Symbol`, or via "dot-access"
* `for col in tbl`: iterate through columns in the table
* `AbstractDict` methods like `haskey(tbl, :col1)`, `get(tbl, :col1, nothing)`, `keys(tbl)`, or `values(tbl)`
### Arrow types
In the arrow data format, specific logical types are supported, a list of which can be found [here](https://arrow.apache.org/docs/status.html#data-types). These include booleans, integers of various bit widths, floats, decimals, time types, and binary/string. While most of these map naturally to types builtin to Julia itself, there are a few cases where the definitions are slightly different, and in these cases, by default, they are converted to more "friendly" Julia types (this auto conversion can be avoided by passing `convert=false` to `Arrow.Table`, like `Arrow.Table(file; convert=false)`). Examples of arrow to julia type mappings include:
* `Date`, `Time`, `Timestamp`, and `Duration` all have natural Julia definitions in `Dates.Date`, `Dates.Time`, `TimeZones.ZonedDateTime`, and `Dates.Period` subtypes, respectively.
* `Char` and `Symbol` Julia types are mapped to arrow string types, with additional metadata of the original Julia type; this allows deserializing directly to `Char` and `Symbol` in Julia, while other language implementations will see these columns as just strings
* Similarly to the above, the `UUID` Julia type is mapped to a 128-bit `FixedSizeBinary` arrow type.
* `Decimal128` and `Decimal256` have no corresponding builtin Julia types, so they're deserialized using a compatible type definition in Arrow.jl itself: `Arrow.Decimal`
Note that when `convert=false` is passed, data will be returned in Arrow.jl-defined types that exactly match the arrow definitions of those types; the authoritative source for how each type represents its data can be found in the arrow [`Schema.fbs`](https://github.com/apache/arrow/blob/master/format/Schema.fbs) file.
One note on performance: when writing `TimeZones.ZonedDateTime` columns to the arrow format (via `Arrow.write`), it is preferable to "wrap" the columns in `Arrow.ToTimestamp(col)`, as long
as the column has `ZonedDateTime` elements that all share a common timezone. This ensures the writing process can know "upfront" which timezone will be encoded and is thus much more
efficient and performant.
#### Custom types
To support writing your custom Julia struct, Arrow.jl utilizes the format's mechanism for "extension types" by allowing the storing of Julia type name and metadata in the field metadata. To "hook in" to this machinery, custom types can utilize the interface methods defined in the `Arrow.ArrowTypes` submodule. For example:
```julia
using Arrow
struct Person
id::Int
name::String
end
# overload interface method for custom type Person; return a symbol as the "name"
# this instructs Arrow.write what "label" to include with a column with this custom type
const NAME = Symbol("JuliaLang.MyPackage.Person")
ArrowTypes.arrowname(::Type{Person}) = NAME
# overload JuliaType on `Val{NAME}`, which is like a dispatchable string
# return our custom *type* Person; this enables Arrow.Table to know how the "label"
# on a custom column should be mapped to a Julia type and deserialized
ArrowTypes.JuliaType(::Val{NAME}) = Person
table = (col1=[Person(1, "Bob"), Person(2, "Jane")],)
io = IOBuffer()
Arrow.write(io, table)
seekstart(io)
table2 = Arrow.Table(io)
```
In this example, we're writing our `table`, which is a NamedTuple with one column named `col1`, which has two
elements which are instances of our custom `Person` struct. We overload `ArrowTypes.arrowname` so that
Arrow.jl knows how to serialize our `Person` struct. We then overload `ArrowTypes.JuliaType` so the deserialization process knows how to map from our type label back to our `Person` struct type. We can then write our data in the arrow format to an in-memory `IOBuffer`, then read the table back in using `Arrow.Table`.
The table we get back will be an `Arrow.Table`, with a single `Arrow.Struct` column with element type `Person`.
Note that without overloading `ArrowTypes.JuliaType`, we may get into a weird limbo state where we've written
our table with `Person` structs out as a table, but when reading back in, Arrow.jl doesn't know what a `Person` is;
deserialization won't fail, but we'll just get a `NamedTuple{(:id, :name), Tuple{Int, String}}` back instead of `Person`.
While this example is very simple, it shows the basics to allow a custom type to be serialized/deserialized. But the `ArrowTypes` module offers even more powerful functionality for "hooking" non-native arrow types into the serialization/deserialization processes. Let's walk through a couple more examples; if you've had enough custom type shenanigans, feel free to skip to the next section.
Let's take a look at how Arrow.jl allows serializing the `nothing` value, which is often referred to as the "software engineer's NULL" in Julia. While Arrow.jl treats `missing` as the default arrow NULL value, `nothing` is pretty similar, but we'd still like to treat it separately if possible. Here's how we enable serialization/deserialization in the `ArrowTypes` module:
```julia
ArrowTypes.ArrowKind(::Type{Nothing}) = ArrowTypes.NullKind()
ArrowTypes.ArrowType(::Type{Nothing}) = Missing
ArrowTypes.toarrow(::Nothing) = missing
const NOTHING = Symbol("JuliaLang.Nothing")
ArrowTypes.arrowname(::Type{Nothing}) = NOTHING
ArrowTypes.JuliaType(::Val{NOTHING}) = Nothing
ArrowTypes.fromarrow(::Type{Nothing}, ::Missing) = nothing
```
Let's walk through what's going on here, line-by-line:
* `ArrowKind` overload: `ArrowKind`s are generic "categories" of types supported by the arrow format, like `PrimitiveKind`, `ListKind`, etc. They each correspond to a different data layout strategy supported in the arrow format. Here, we define `nothing`'s kind to be `NullKind`, which means no actual memory is needed for storage, it's strictly a "metadata" type where we store the type and # of elements. In our `Person` example, we didn't need to overload this since types declared like `struct T` or `mutable struct T` are defined as `ArrowTypes.StructKind` by default
* `ArrowType` overload: here we're signaling that our type (`Nothing`) maps to the natively supported arrow type of `Missing`; this is important for the serializer so it knows which arrow type it will be serializing. Again, we didn't need to overload this for `Person` since the serializer knows how to serialize custom structs automatically by using reflection methods like `fieldnames(T)` and `getfield(x, i)`.
* `ArrowTypes.toarrow` overload: this is a sister method to `ArrowType`; we said our type will map to the `Missing` arrow type, so here we actually define ___how___ it converts to the arrow type; and in this case, it just returns `missing`. This is yet another method that didn't show up for `Person`; why? Well, as we noted in `ArrowType`, the serializer already knows how to serialize custom structs by using all their fields; if, for some reason, we wanted to omit some fields or otherwise transform things, then we could define corresponding `ArrowType` and `toarrow` methods
* `arrowname` overload: similar to our `Person` example, we need to instruct the serializer how to label our custom type in the arrow type metadata; here we give it the symbol `Symbol("JuliaLang.Nothing")`. Note that while this will ultimately allow us to disambiguate `nothing` from `missing` when reading arrow data, if we pass this data to other language implementations, they will only treat the data as `missing` since they (probably) won't know how to "understand" the `JuliaLang.Nothing` type label
* `JuliaType` overload: again, like our `Person` example, we instruct the deserializer that when it encounters the `JuliaLang.Nothing` type label, it should treat those values as `Nothing` type.
* And finally, `fromarrow` overload: this allows specifying how the native-arrow data should be converted back to our custom type. `fromarrow(T, x...)` by default will call `T(x...)`, which is why we didn't need this overload for `Person`, but in this example, `Nothing(missing)` won't work, so we define our own custom conversion.
Let's run through one more complex example, just for fun and to really see how far the system can be pushed:
```julia
using Intervals
table = (col = [
Interval{Closed,Unbounded}(1,nothing),
],)
const NAME = Symbol("JuliaLang.Intervals.Interval")
ArrowTypes.arrowname(::Type{Interval{T, L, R}}) where {T, L, R} = NAME
const LOOKUP = Dict(
"Closed" => Closed,
"Unbounded" => Unbounded
)
ArrowTypes.arrowmetadata(::Type{Interval{T, L, R}}) where {T, L, R} = string(L, ".", R)
function ArrowTypes.JuliaType(::Val{NAME}, ::Type{NamedTuple{names, types}}, meta) where {names, types}
L, R = split(meta, ".")
return Interval{fieldtype(types, 1), LOOKUP[L], LOOKUP[R]}
end
ArrowTypes.fromarrow(::Type{Interval{T, L, R}}, first, last) where {T, L, R} = Interval{L, R}(first, R == Unbounded ? nothing : last)
io = Arrow.tobuffer(table)
tbl = Arrow.Table(io)
```
Again, let's break down what's going on here:
* Here we're trying to save an `Interval` type in the arrow format; this type is unique in that it has two type parameters (`Closed` and `Unbounded`) that are not inferred/based on fields, but are just "type tags" on the type itself
* Note that we define a generic `arrowname` method on all `Interval`s, regardless of type parameters. We just want to let arrow know which general type we're dealing with here
* Next we use a new method `ArrowTypes.arrowmetadata` to encode the two non-field-based type parameters as a string with a dot delimiter; we encode this information here because remember, we have to match our `arrowname` Symbol typename in our `JuliaType(::Val{name})` definition in order to dispatch correctly; if we encoded the type parameters in `arrowname`, we would need separate `arrowname` definitions for each unique combination of those two type parameters, and corresponding `JuliaType` definitions for each as well; yuck. Instead, we let `arrowname` be generic to our type, and store the type parameters *for this specific column* using `arrowmetadata`
* Now in `JuliaType`, note we're using the 3-argument overload; we want the `NamedTuple` type that is the native arrow type our `Interval` is being serialized as; we use this to retrieve the 1st type parameter for our `Interval`, which is simply the type of the two `first` and `last` fields. Then we use the 3rd argument, which is whatever string we returned from `arrowmetadata`. We call `L, R = split(meta, ".")` to parse the two type parameters (in this case `Closed` and `Unbounded`), then do a lookup on those strings from a predefined `LOOKUP` Dict that matches the type parameter name as string to the actual type. We then have all the information to recreate the full `Interval` type. Neat!
* The one final wrinkle is in our `fromarrow` method; `Interval`s that are `Unbounded`, actually take `nothing` as the 2nd argument. So letting the default `fromarrow` definition call `Interval{T, L, R}(first, last)`, where `first` and `last` are both integers isn't going to work. Instead, we check if the `R` type parameter is `Unbounded` and if so, pass `nothing` as the 2nd arg, otherwise we can pass `last`.
This stuff can definitely make your eyes glaze over if you stare at it long enough. As always, don't hesitate to reach out for quick questions on the [#data](https://julialang.slack.com/messages/data/) slack channel, or [open a new issue](https://github.com/apache/arrow-julia/issues/new) detailing what you're trying to do.
### `Arrow.Stream`
In addition to `Arrow.Table`, the Arrow.jl package also provides `Arrow.Stream` for processing arrow data. While `Arrow.Table` will iterate all record batches in an arrow file/stream, concatenating columns, `Arrow.Stream` provides a way to *iterate* through record batches, one at a time. Each iteration yields an `Arrow.Table` instance, with columns/data for a single record batch. This allows, if so desired, "batch processing" of arrow data, one record batch at a time, instead of creating a single long table via `Arrow.Table`.
### Custom application metadata
The Arrow format allows data producers to [attach custom metadata](https://arrow.apache.org/docs/format/Columnar.html#custom-application-metadata) to various Arrow objects.
Arrow.jl provides a convenient accessor for this metadata via [`Arrow.getmetadata`](@ref). `Arrow.getmetadata(t::Arrow.Table)` will return an immutable `AbstractDict{String,String}` that represents the [`custom_metadata` of the table's associated `Schema`](https://github.com/apache/arrow/blob/85d8175ea24b4dd99f108a673e9b63996d4f88cc/format/Schema.fbs#L515) (or `nothing` if no such metadata exists), while `Arrow.getmetadata(c::Arrow.ArrowVector)` will return a similar representation of [the column's associated `Field` `custom_metadata`](https://github.com/apache/arrow/blob/85d8175ea24b4dd99f108a673e9b63996d4f88cc/format/Schema.fbs#L480) (or `nothing` if no such metadata exists).
To attach custom schema/column metadata to Arrow tables at serialization time, see the `metadata` and `colmetadata` keyword arguments to [`Arrow.write`](@ref).
## Writing arrow data
Ok, so that's a pretty good rundown of *reading* arrow data, but how do you *produce* arrow data? Enter `Arrow.write`.
### `Arrow.write`
With `Arrow.write`, you provide either an `io::IO` argument or a [`file_path`](#support-for-generic-path-like-types) to write the arrow data to, as well as a Tables.jl-compatible source that contains the data to be written.
What are some examples of Tables.jl-compatible sources? A few examples include:
* `Arrow.write(io, df::DataFrame)`: A `DataFrame` is a collection of indexable columns
* `Arrow.write(io, CSV.File(file))`: read data from a csv file and write out to arrow format
* `Arrow.write(io, DBInterface.execute(db, sql_query))`: Execute an SQL query against a database via the [`DBInterface.jl`](https://github.com/JuliaDatabases/DBInterface.jl) interface, and write the query resultset out directly in the arrow format. Packages that implement DBInterface include [SQLite.jl](https://juliadatabases.github.io/SQLite.jl/stable/), [MySQL.jl](https://juliadatabases.github.io/MySQL.jl/dev/), and [ODBC.jl](http://juliadatabases.github.io/ODBC.jl/latest/).
* `df |> @map(...) |> Arrow.write(io)`: Write the results of a [Query.jl](https://www.queryverse.org/Query.jl/stable/) chain of operations directly out as arrow data
* `jsontable(json) |> Arrow.write(io)`: Treat a json array of objects or object of arrays as a "table" and write it out as arrow data using the [JSONTables.jl](https://github.com/JuliaData/JSONTables.jl) package
* `Arrow.write(io, (col1=data1, col2=data2, ...))`: a `NamedTuple` of `AbstractVector`s or an `AbstractVector` of `NamedTuple`s are both considered tables by default, so they can be quickly constructed for easy writing of arrow data if you already have columns of data
And these are just a few examples of the numerous [integrations](https://github.com/JuliaData/Tables.jl/blob/master/INTEGRATIONS.md).
In addition to just writing out a single "table" of data as a single arrow record batch, `Arrow.write` also supports writing out multiple record batches when the input supports the `Tables.partitions` functionality. One immediate, though perhaps not incredibly useful example, is `Arrow.Stream`. `Arrow.Stream` implements `Tables.partitions` in that it iterates "tables" (specifically `Arrow.Table`), and as such, `Arrow.write` will iterate an `Arrow.Stream`, and write out each `Arrow.Table` as a separate record batch. Another important point for why this example works is because an `Arrow.Stream` iterates `Arrow.Table`s that all have the same schema. This is important because when writing arrow data, a "schema" message is always written first, with all subsequent record batches written with data matching the initial schema.
In addition to inputs that support `Tables.partitions`, note that Tables.jl itself provides the `Tables.partitioner` function, which allows providing your own separate instances of similarly-schema-ed tables as "partitions", like:
```julia
# treat 2 separate NamedTuples of vectors with same schema as 1 table, 2 partitions
tbl_parts = Tables.partitioner([(col1=data1, col2=data2), (col1=data3, col2=data4)])
Arrow.write(io, tbl_parts)
# treat an array of csv files with same schema where each file is a partition
# in this form, a function `CSV.File` is applied to each element of 2nd argument
csv_parts = Tables.partitioner(CSV.File, csv_files)
Arrow.write(io, csv_parts)
```
### `Arrow.Writer`
With `Arrow.Writer`, you instantiate an `Arrow.Writer` object, write sources using it, and then close it. This allows for incremental writes to the same sink. It is similar to `Arrow.append` without having to close and re-open the sink in between writes and without the limitation of only supporting the IPC stream format.
### Multithreaded writing
By default, `Arrow.write` will use multiple threads to write multiple
record batches simultaneously (e.g. if julia is started with `julia -t 8` or the `JULIA_NUM_THREADS` environment variable is set). The number of concurrent tasks to use when writing can be controlled by passing the `ntasks` keyword argument to `Arrow.write`. Passing `ntasks=1` avoids any multithreading when writing.
### Compression
Compression is supported when writing via the `compress` keyword argument. Possible values include `:lz4`, `:zstd`, or your own initialized `LZ4FrameCompressor` or `ZstdCompressor` objects; this will cause all buffers in each record batch to use the respective compression encoding or compressor.
================================================
FILE: docs/src/reference.md
================================================
```@raw html
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
```
# API Reference
```@autodocs
Modules = [Arrow]
Order = [:type, :function]
```
## Internals: `Arrow.FlatBuffers`
The `FlatBuffers` module is not part of Arrow.jl's public API, and these functions may change without notice.
```@autodocs
Modules = [Arrow.FlatBuffers]
```
================================================
FILE: src/Arrow.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Arrow.jl
A pure Julia implementation of the [apache arrow](https://arrow.apache.org/) memory format specification.
This implementation supports the 1.0 version of the specification, including support for:
* All primitive data types
* All nested data types
* Dictionary encodings, nested dictionary encodings, and messages
* Extension types
* Streaming, file, record batch, and replacement and isdelta dictionary messages
* Buffer compression/decompression via the standard LZ4 frame and Zstd formats
It currently doesn't include support for:
* Tensors or sparse tensors
* Flight RPC
* C data interface
Third-party data formats:
* csv and parquet support via the existing [CSV.jl](https://github.com/JuliaData/CSV.jl) and [Parquet.jl](https://github.com/JuliaIO/Parquet.jl) packages
* Other [Tables.jl](https://github.com/JuliaData/Tables.jl)-compatible packages automatically supported ([DataFrames.jl](https://github.com/JuliaData/DataFrames.jl), [JSONTables.jl](https://github.com/JuliaData/JSONTables.jl), [JuliaDB.jl](https://github.com/JuliaData/JuliaDB.jl), [SQLite.jl](https://github.com/JuliaDatabases/SQLite.jl), [MySQL.jl](https://github.com/JuliaDatabases/MySQL.jl), [JDBC.jl](https://github.com/JuliaDatabases/JDBC.jl), [ODBC.jl](https://github.com/JuliaDatabases/ODBC.jl), [XLSX.jl](https://github.com/felipenoris/XLSX.jl), etc.)
* No current Julia packages support ORC or Avro data formats
See docs for official Arrow.jl API with the [User Manual](@ref) and reference docs for [`Arrow.Table`](@ref), [`Arrow.write`](@ref), and [`Arrow.Stream`](@ref).
"""
module Arrow
using Base.Iterators
using Mmap
import Dates
using DataAPI,
Tables,
SentinelArrays,
PooledArrays,
CodecLz4,
CodecZstd,
TimeZones,
BitIntegers,
ConcurrentUtilities,
StringViews
export ArrowTypes
using Base: @propagate_inbounds
import Base: ==
# Byte string "ARROW1". Judging by the name, this is the magic marker of the Arrow
# file format; its use sites are in the included files — confirm there.
const FILE_FORMAT_MAGIC_BYTES = b"ARROW1"
# All-ones 32-bit value. Judging by the name, this is the continuation indicator
# used in message framing — NOTE(review): confirm against the read/write code.
const CONTINUATION_INDICATOR_BYTES = 0xffffffff
# vendored flatbuffers code for now
include("FlatBuffers/FlatBuffers.jl")
using .FlatBuffers
include("metadata/Flatbuf.jl")
using .Flatbuf
const Meta = Flatbuf
using ArrowTypes
include("utils.jl")
include("arraytypes/arraytypes.jl")
include("eltypes.jl")
include("table.jl")
include("write.jl")
include("append.jl")
include("show.jl")
# Codec pools, one vector per codec kind. `__init__` resizes each vector to the
# thread count, and the accessor functions below (`zstd_compressor()` etc.) go
# through `access_threaded`, which indexes the pool by `Threads.threadid()` and
# fills the slot with a `Lockable`-wrapped codec on first use.
const ZSTD_COMPRESSOR = Lockable{ZstdCompressor}[]
const ZSTD_DECOMPRESSOR = Lockable{ZstdDecompressor}[]
const LZ4_FRAME_COMPRESSOR = Lockable{LZ4FrameCompressor}[]
const LZ4_FRAME_DECOMPRESSOR = Lockable{LZ4FrameDecompressor}[]
# Build a Zstd compressor at level 3, run TranscodingStreams' `initialize` on it,
# and return it wrapped in a `Lockable`.
function init_zstd_compressor()
    codec = ZstdCompressor(; level=3)
    CodecZstd.TranscodingStreams.initialize(codec)
    return Lockable(codec)
end
# Build a Zstd decompressor, run TranscodingStreams' `initialize` on it,
# and return it wrapped in a `Lockable`.
function init_zstd_decompressor()
    codec = ZstdDecompressor()
    CodecZstd.TranscodingStreams.initialize(codec)
    return Lockable(codec)
end
# Build an LZ4 frame compressor at compression level 4, run TranscodingStreams'
# `initialize` on it, and return it wrapped in a `Lockable`.
function init_lz4_frame_compressor()
    codec = LZ4FrameCompressor(; compressionlevel=4)
    CodecLz4.TranscodingStreams.initialize(codec)
    return Lockable(codec)
end
# Build an LZ4 frame decompressor, run TranscodingStreams' `initialize` on it,
# and return it wrapped in a `Lockable`.
function init_lz4_frame_decompressor()
    codec = LZ4FrameDecompressor()
    CodecLz4.TranscodingStreams.initialize(codec)
    return Lockable(codec)
end
# Return the element of `v` stored at the current thread's id, creating it with
# `f()` and storing it first if that slot is not yet assigned.
#
# Arguments:
#   f - zero-argument callable that produces the value for an empty slot
#   v - vector indexed by `Threads.threadid()`
#
# Throws an `AssertionError` when the current thread id is outside `1:length(v)`.
function access_threaded(f, v::Vector)
    tid = Threads.threadid()
    # This range check is what makes the `@inbounds` accesses below safe, so it
    # must never be elided.
    0 < tid <= length(v) || _length_assert()
    if @inbounds isassigned(v, tid)
        @inbounds x = v[tid]
    else
        x = f()
        @inbounds v[tid] = x
    end
    return x
end
# Out-of-line failure path for `access_threaded`. Throws explicitly instead of
# using `@assert`, which Julia documents as possibly disabled at some
# optimization levels and therefore unsuitable for guarding array bounds.
@noinline _length_assert() = throw(AssertionError("0 < tid <= v"))
# Accessors for the current thread's codec. Each one looks up the matching pool
# through `access_threaded`, passing the constructor to use when the slot is empty.
function zstd_compressor()
    return access_threaded(init_zstd_compressor, ZSTD_COMPRESSOR)
end
function zstd_decompressor()
    return access_threaded(init_zstd_decompressor, ZSTD_DECOMPRESSOR)
end
function lz4_frame_compressor()
    return access_threaded(init_lz4_frame_compressor, LZ4_FRAME_COMPRESSOR)
end
function lz4_frame_decompressor()
    return access_threaded(init_lz4_frame_decompressor, LZ4_FRAME_DECOMPRESSOR)
end
# Module initializer: empty every codec pool and resize it to one slot per thread
# id, using `Threads.maxthreadid()` when this Julia defines it and
# `Threads.nthreads()` otherwise.
function __init__()
    nslots = @static if isdefined(Base.Threads, :maxthreadid)
        Threads.maxthreadid()
    else
        Threads.nthreads()
    end
    pools = (
        LZ4_FRAME_COMPRESSOR,
        ZSTD_COMPRESSOR,
        LZ4_FRAME_DECOMPRESSOR,
        ZSTD_DECOMPRESSOR,
    )
    for pool in pools
        empty!(pool)
        resize!(pool, nslots)
    end
    return
end
end # module Arrow
================================================
FILE: src/ArrowTypes/LICENSE.md
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
================================================
FILE: src/ArrowTypes/Project.toml
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
name = "ArrowTypes"
uuid = "31f734f8-188a-4ce0-8406-c8a06bd891cd"
authors = ["quinnj <quinn.jacobd@gmail.com>"]
version = "2.3.0"
[deps]
Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[compat]
julia = "1.0"
================================================
FILE: src/ArrowTypes/src/ArrowTypes.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The ArrowTypes module provides the [`ArrowTypes.ArrowType`](@ref) interface trait that objects can define
in order to signal how they should be serialized in the arrow format.
"""
module ArrowTypes
using Sockets
using UUIDs
export ArrowKind,
NullKind,
PrimitiveKind,
BoolKind,
ListKind,
FixedSizeListKind,
MapKind,
StructKind,
UnionKind,
DictEncodedKind,
toarrow,
arrowname,
fromarrow,
ToArrow
"""
    ArrowTypes.ArrowKind(T)

For a given type `T`, define its "arrow type kind", i.e. the general category of arrow types it should be treated as. Must be one of:
  * [`ArrowTypes.NullKind`](@ref): `Missing` (and `Nothing`, which is serialized as `Missing`)
  * [`ArrowTypes.PrimitiveKind`](@ref): `<:Integer`, `<:AbstractFloat`, along with `Arrow.Decimal`, and the various `Arrow.ArrowTimeType` subtypes
  * [`ArrowTypes.BoolKind`](@ref): only `Bool`
  * [`ArrowTypes.ListKind`](@ref): any `AbstractString` or `AbstractArray`
  * [`ArrowTypes.FixedSizeListKind`](@ref): `NTuple{N, T}`
  * [`ArrowTypes.MapKind`](@ref): any `AbstractDict`
  * [`ArrowTypes.StructKind`](@ref): any `NamedTuple` or plain struct (mutable or otherwise)
  * [`ArrowTypes.UnionKind`](@ref): any `Union`
  * [`ArrowTypes.DictEncodedKind`](@ref): array types that implement the `DataAPI.refpool` interface

Each `ArrowKind` listed above corresponds to a different way of physically storing data in the arrow data format.
See the docs of each kind for an idea of whether it might be an appropriate fit for a custom type.
Note that custom types need to satisfy any additional "interface methods" required by the chosen `ArrowKind`.
By default, a type declared like `primitive type ...` is considered a `PrimitiveKind`, and a type declared with
`struct` or `mutable struct` is considered a `StructKind`. Also note that types will rarely need to define `ArrowKind`;
it is much more common to define `ArrowType(T)` and `toarrow(x::T)` to transform `T` into a natively supported arrow type,
which will already have its `ArrowKind` defined.
"""
abstract type ArrowKind end

# Asking for the kind of a value is the same as asking for the kind of its type.
function ArrowKind(x::T) where {T}
    return ArrowKind(T)
end

# Fallback for types without a dedicated method: `primitive type`s are stored
# as plain bits, everything else is stored field-by-field as a struct.
function ArrowKind(::Type{T}) where {T}
    isprimitivetype(T) && return PrimitiveKind()
    return StructKind()
end
"""
    ArrowTypes.ArrowType(T) = S

Interface method to define the natively supported arrow type `S` that a given type `T` should be converted to before serializing.
Useful when a custom type wants a "serialization hook" or otherwise needs to be transformed/converted into a natively
supported arrow type for serialization. If a type defines `ArrowType`, it must also define a corresponding
[`ArrowTypes.toarrow(x::T)`](@ref) method which does the actual conversion from `T` to `S`.

Note that custom structs defined like `struct T` or `mutable struct T` are natively supported in serialization, so unless
_additional_ transformation/customization is desired, a custom type `T` can serialize with no `ArrowType` definition (by default,
each field of a struct is serialized, using the results of `fieldnames(T)` and `getfield(x, i)`).

Note that these methods only deal with custom _serialization_ to the arrow format; to be able to _deserialize_ custom
types at all, see the docs for [`ArrowTypes.arrowname`](@ref), [`ArrowTypes.arrowmetadata`](@ref), [`ArrowTypes.JuliaType`](@ref),
and [`ArrowTypes.fromarrow`](@ref).
"""
function ArrowType end

# Unless told otherwise, a type is serialized as itself.
function ArrowType(::Type{T}) where {T}
    return T
end

# Explicit methods for `Any` and `Missing`, both mapping to themselves.
ArrowType(::Type{Any}) = Any
ArrowType(::Type{Missing}) = Missing

# Missingness is kept as-is; only the non-missing part of the union is mapped.
function ArrowType(::Type{Union{Missing,T}}) where {T}
    return Union{Missing,ArrowType(T)}
end
"""
    ArrowTypes.toarrow(x::T) => S

Interface method that performs the actual conversion of an object `x` of type `T` to the type `S`. `T` and `S` must match the
types used when defining `ArrowTypes.ArrowType(::Type{T}) = S`; that is, `S` is the natively supported arrow type that `T`
is converted to in order to be serialized. See the [`ArrowTypes.ArrowType`](@ref) docs for more details.

This enables custom objects to be serialized as a natively supported arrow data type.
"""
function toarrow end

# Values are passed through unchanged unless a more specific method exists.
function toarrow(x)
    return x
end
"""
    ArrowTypes.arrowname(T) = Symbol(name)

Interface method to define the logical type "label" for a custom Julia type `T`. Names are global for an entire arrow dataset,
and conventionally, custom types just use their type name along with a Julia- and package-specific prefix; for example,
for a custom type `Foo`, one would define `ArrowTypes.arrowname(::Type{Foo}) = Symbol("JuliaLang.MyPackage.Foo")`.
This ensures other language implementations won't get confused and are safe to ignore the logical type label.

When arrow stores non-native data, it must still be _stored_ as a native data type, but it can have type metadata tied to the data that
labels the original _logical_ type it originated from. This enables the conversion of native data back to the logical type when
deserializing, as long as the deserializer has the same definitions as when the data was serialized. Namely, the current Julia
session will need the appropriate [`ArrowTypes.JuliaType`](@ref) and [`ArrowTypes.fromarrow`](@ref) definitions in order to know
how to convert the native data to the original logical type. See the docs for those interface methods in order to ensure a complete
implementation. Also see the accompanying [`ArrowTypes.arrowmetadata`](@ref) docs around providing additional metadata about a custom
logical type that may be necessary to fully re-create a Julia type (e.g. non-field-based type parameters).
"""
function arrowname end

# Sentinel label meaning "this type has no logical type name".
const EMPTY_SYMBOL = Symbol()

# No label by default, nor for `Any` or `Missing`.
arrowname(T) = EMPTY_SYMBOL
arrowname(::Type{Any}) = EMPTY_SYMBOL
arrowname(::Type{Missing}) = EMPTY_SYMBOL

# For `Union{T,Missing}` / `Union{T,Nothing}` the label of `T` is used.
function arrowname(::Type{Union{T,Missing}}) where {T}
    return arrowname(T)
end
function arrowname(::Type{Union{T,Nothing}}) where {T}
    return arrowname(T)
end

# `true` when `T` carries a (non-empty) logical type label.
function hasarrowname(T)
    return arrowname(T) !== EMPTY_SYMBOL
end
"""
    ArrowTypes.arrowmetadata(T) => String

Interface method to provide additional logical type metadata when serializing extension types. [`ArrowTypes.arrowname`](@ref)
provides the logical type _name_, which may be all that's needed to return a proper Julia type from [`ArrowTypes.JuliaType`](@ref),
but some custom types may, for example, have type parameters that aren't inferred from/based on fields. In order to fully recreate these
kinds of types when deserializing, these type parameters can be stored by defining `ArrowTypes.arrowmetadata(::Type{T}) = "type_param"`.
The stored string is then available by overloading `ArrowTypes.JuliaType(::Val{Symbol(name)}, S, arrowmetadata::String)`.
"""
function arrowmetadata end

# Shared "no metadata" value.
const EMPTY_STRING = ""

# No metadata by default, nor for `Any`, `Missing` or `Nothing`.
arrowmetadata(T) = EMPTY_STRING
arrowmetadata(::Type{Any}) = EMPTY_STRING
arrowmetadata(::Type{Missing}) = EMPTY_STRING
arrowmetadata(::Type{Nothing}) = EMPTY_STRING

# For `Union{T,Missing}` / `Union{T,Nothing}` the metadata of `T` is used.
function arrowmetadata(::Type{Union{T,Missing}}) where {T}
    return arrowmetadata(T)
end
function arrowmetadata(::Type{Union{T,Nothing}}) where {T}
    return arrowmetadata(T)
end
"""
    ArrowTypes.JuliaType(::Val{Symbol(name)}, ::Type{S}, arrowmetadata::String) = T

Interface method to define the custom Julia logical type `T` that a serialized metadata label should be converted to when
deserializing. When reading arrow data, and a logical type label is encountered for a column, it will call
`ArrowTypes.JuliaType(Val(Symbol(name)), S, arrowmetadata)` to see if a Julia type has been "registered" for deserialization. The `name`
used when defining the method *must* correspond to the same `name` used when defining `ArrowTypes.arrowname(::Type{T}) = Symbol(name)`.
The use of `Val(Symbol(...))` allows overloading a method on a specific logical type label. The 2nd argument `S` passed to
`JuliaType` is the native arrow serialized type. This can be useful for parametric Julia types that wish to correctly parameterize
their custom type based on what was serialized. The 3rd argument `arrowmetadata` is any metadata that was stored when the logical
type was serialized, i.e. the result of calling `ArrowTypes.arrowmetadata(T)`. Note the 2nd and 3rd arguments are optional when
overloading if unneeded.

When defining [`ArrowTypes.arrowname`](@ref) and `ArrowTypes.JuliaType`, you may also want to implement [`ArrowTypes.fromarrow`](@ref)
in order to customize how a custom type `T` should be constructed from the native arrow data type. See its docs for more details.
"""
function JuliaType end

# `nothing` means "no Julia type registered for this label".
function JuliaType(val)
    return nothing
end

# The 2- and 3-argument forms fall back to the next shorter form, so an
# overload only needs to accept the arguments it actually uses.
function JuliaType(val, S)
    return JuliaType(val)
end
function JuliaType(val, S, meta)
    return JuliaType(val, S)
end
"""
    ArrowTypes.fromarrow(::Type{T}, x::S) => T

Interface method that provides a "deserialization hook" for a custom type `T` to be constructed from the native arrow type `S`.
The `T` and `S` types must correspond to the definitions used in `ArrowTypes.ArrowType(::Type{T}) = S`. This is the paired method
of [`ArrowTypes.toarrow`](@ref).

The default definition is `ArrowTypes.fromarrow(::Type{T}, x) = T(x)`, so if that already works for a custom type, no additional
overload is necessary.
A few `ArrowKind`s have/allow slightly more custom overloads for their `fromarrow` methods:
  * `ListKind{true}`: for `String` types, they may overload `fromarrow(::Type{T}, ptr::Ptr{UInt8}, len::Int) = ...` to avoid
     materializing a `String`
  * `StructKind`:
    * May overload `fromarrow(::Type{T}, x...)` where individual fields are passed as separate
      positional arguments; so if a custom type `Interval` has two fields `first` and `last`, the overload would look like
      `ArrowTypes.fromarrow(::Type{Interval}, first, last) = ...`. Note the default implementation is
      `ArrowTypes.fromarrow(::Type{T}, x...) = T(x...)`, so if your type already accepts all arguments in a constructor
      no additional `fromarrow` method should be necessary (default struct constructors have this behavior).
    * Alternatively, may overload `fromarrowstruct(::Type{T}, ::Val{fnames}, x...)`, where `fnames` is a tuple of the
      field names corresponding to the values in `x`. This approach is useful when you need to implement deserialization
      in a manner that is agnostic to the field order used by the serializer. When implemented, `fromarrowstruct` takes
      precedence over `fromarrow` in `StructKind` deserialization.
"""
function fromarrow end

# A value that already has the requested type is returned untouched.
function fromarrow(::Type{T}, x::T) where {T}
    return x
end

# Otherwise hand all arguments to the constructor of `T`.
function fromarrow(::Type{T}, x...) where {T}
    return T(x...)
end

# `Union{Missing,T}` targets: `missing` stays `missing` ...
function fromarrow(::Type{Union{Missing,T}}, ::Missing) where {T}
    return missing
end

# ... a value that is already a `T` is returned as-is ...
function fromarrow(::Type{Union{Missing,T}}, x::T) where {T}
    return x
end

# (ambiguity fix)
function fromarrow(::Type{Union{Missing,T}}, x::T) where {T<:NamedTuple}
    return x
end

# ... and anything else is converted as if the target were plain `T`.
function fromarrow(::Type{Union{Missing,T}}, x) where {T}
    return fromarrow(T, x)
end
"NullKind data is actually not physically stored since the data is constant; just the length is needed"
struct NullKind <: ArrowKind end

# Both `Missing` and `Nothing` are null kinds.
ArrowKind(::Type{Missing}) = NullKind()
ArrowKind(::Type{Nothing}) = NullKind()

# `nothing` is written as `missing`; the logical type label below allows it
# to be mapped back to `Nothing` when reading.
const NOTHING = Symbol("JuliaLang.Nothing")
ArrowType(::Type{Nothing}) = Missing
arrowname(::Type{Nothing}) = NOTHING
JuliaType(::Val{NOTHING}) = Nothing

function toarrow(::Nothing)
    return missing
end

function fromarrow(::Type{Nothing}, ::Missing)
    return nothing
end
"PrimitiveKind data is stored as plain bits in a single contiguous buffer"
struct PrimitiveKind <: ArrowKind end

# All integer and floating point types are primitive.
ArrowKind(::Type{<:Integer}) = PrimitiveKind()
ArrowKind(::Type{<:AbstractFloat}) = PrimitiveKind()

# `Char` is written as a `UInt32` and labelled so it can be restored on read.
const CHAR = Symbol("JuliaLang.Char")
ArrowType(::Type{Char}) = UInt32
arrowname(::Type{Char}) = CHAR
JuliaType(::Val{CHAR}) = Char

function toarrow(x::Char)
    return convert(UInt32, x)
end

function fromarrow(::Type{Char}, x::UInt32)
    return Char(x)
end

"BoolKind data is stored with values packed down to individual bits; so instead of a traditional Bool being 1 byte/8 bits, 8 Bool values would be packed into a single byte"
struct BoolKind <: ArrowKind end

ArrowKind(::Type{Bool}) = BoolKind()
"ListKind data are stored in two separate buffers; one buffer contains all the original data elements flattened into one long buffer; the 2nd buffer contains an offset into the 1st buffer for how many elements make up the original array element"
struct ListKind{stringtype} <: ArrowKind end

# A `ListKind` built without a parameter is a non-string list.
ListKind() = ListKind{false}()

# Read the `stringtype` flag back from either a kind instance or the kind type.
function isstringtype(::ListKind{stringtype}) where {stringtype}
    return stringtype
end
function isstringtype(::Type{ListKind{stringtype}}) where {stringtype}
    return stringtype
end

ArrowKind(::Type{<:AbstractString}) = ListKind{true}()
# Treat Base.CodeUnits as Binary arrow type
ArrowKind(::Type{<:Base.CodeUnits}) = ListKind{true}()

# Pointer + length form: materialize a `String` first, then convert that to `T`.
function fromarrow(::Type{T}, ptr::Ptr{UInt8}, len::Int) where {T}
    str = unsafe_string(ptr, len)
    return fromarrow(T, str)
end

function fromarrow(::Type{T}, x) where {T<:Base.CodeUnits}
    return Base.CodeUnits(x)
end

function fromarrow(::Type{Union{Missing,Base.CodeUnits}}, x)
    if x === missing
        return missing
    else
        return Base.CodeUnits(x)
    end
end

# `Symbol` is written as a `String` and labelled so it can be restored on read.
const SYMBOL = Symbol("JuliaLang.Symbol")
ArrowType(::Type{Symbol}) = String
arrowname(::Type{Symbol}) = SYMBOL
JuliaType(::Val{SYMBOL}) = Symbol

function toarrow(x::Symbol)
    return String(x)
end

# Build the `Symbol` directly from the raw bytes via the runtime's `jl_symbol_n`.
function _symbol(ptr, len)
    return ccall(:jl_symbol_n, Ref{Symbol}, (Ptr{UInt8}, Int), ptr, len)
end

function fromarrow(::Type{Symbol}, ptr::Ptr{UInt8}, len::Int)
    return _symbol(ptr, len)
end
# Arrays are stored as (non-string) lists.
ArrowKind(::Type{<:AbstractArray}) = ListKind()

# A vector that already has the requested type is returned as-is ...
function fromarrow(::Type{A}, x::A) where {A<:AbstractVector{T}} where {T}
    return x
end

# ... while another vector with the same element type is converted to `A`.
function fromarrow(::Type{A}, x::AbstractVector{T}) where {A<:AbstractVector{T}} where {T}
    return convert(A, x)
end

# Sets are collected into a `Vector` for writing, and labelled so that they
# can be turned back into a set when reading.
const SET = Symbol("JuliaLang.Set")
ArrowKind(::Type{<:AbstractSet}) = ListKind()
arrowname(::Type{<:AbstractSet}) = SET

function ArrowType(::Type{T}) where {T<:AbstractSet{S}} where {S}
    return Vector{S}
end

function toarrow(x::AbstractSet)
    return collect(x)
end

# A stored vector with element type `S` is read back as a `Set{S}`.
function JuliaType(::Val{SET}, ::Type{T}) where {T<:AbstractVector{S}} where {S}
    return Set{S}
end

function fromarrow(::Type{T}, x) where {T<:AbstractSet}
    return T(x)
end
"FixedSizeListKind data are stored in a single contiguous buffer; individual elements can be computed based on the fixed size of the lists"
struct FixedSizeListKind{N,T} <: ArrowKind end

# Accessors for the element type `T` and the fixed length `N` of a kind.
function gettype(::FixedSizeListKind{N,T}) where {N,T}
    return T
end
function getsize(::FixedSizeListKind{N,T}) where {N,T}
    return N
end

# Homogeneous tuples are fixed-size lists of their element type.
function ArrowKind(::Type{NTuple{N,T}}) where {N,T}
    return FixedSizeListKind{N,T}()
end

# UUID: the 128-bit `value` field is written as 16 raw bytes.
const UUIDSYMBOL = Symbol("JuliaLang.UUID")
ArrowKind(::Type{UUID}) = FixedSizeListKind{16,UInt8}()
ArrowType(::Type{UUID}) = NTuple{16,UInt8}
arrowname(::Type{UUID}) = UUIDSYMBOL
JuliaType(::Val{UUIDSYMBOL}) = UUID

function toarrow(x::UUID)
    return _cast(NTuple{16,UInt8}, x.value)
end

function fromarrow(::Type{UUID}, x::NTuple{16,UInt8})
    return UUID(_cast(UInt128, x))
end

# IPv4: the `host` field is written as a `UInt32` primitive.
const IPV4_SYMBOL = Symbol("JuliaLang.IPv4")
ArrowKind(::Type{IPv4}) = PrimitiveKind()
ArrowType(::Type{IPv4}) = UInt32
arrowname(::Type{IPv4}) = IPV4_SYMBOL
JuliaType(::Val{IPV4_SYMBOL}) = IPv4

function toarrow(x::IPv4)
    return x.host
end

function fromarrow(::Type{IPv4}, x::Integer)
    return IPv4(x)
end

# IPv6: the 128-bit `host` field is written as 16 raw bytes.
const IPV6_SYMBOL = Symbol("JuliaLang.IPv6")
ArrowKind(::Type{IPv6}) = FixedSizeListKind{16,UInt8}()
ArrowType(::Type{IPv6}) = NTuple{16,UInt8}
arrowname(::Type{IPv6}) = IPV6_SYMBOL
JuliaType(::Val{IPV6_SYMBOL}) = IPv6

function toarrow(x::IPv6)
    return _cast(NTuple{16,UInt8}, x.host)
end

function fromarrow(::Type{IPv6}, x::NTuple{16,UInt8})
    return IPv6(_cast(UInt128, x))
end
"""
    _cast(::Type{Y}, x) -> Y

Reinterpret the in-memory bytes of `x` as a value of type `Y` (used in this module to
go back and forth between `UInt128` and `NTuple{16,UInt8}`).

`sizeof(x)` bytes are copied into freshly allocated storage for a single `Y`.
An `ArgumentError` is thrown when `x` is larger than `Y`, since the copy would
otherwise write past the end of that storage. When `x` is smaller than `Y`, the
remaining bytes of the result are not written by this function.
"""
function _cast(::Type{Y}, x)::Y where {Y}
    # Guard the raw memory copy below against writing out of bounds.
    if sizeof(x) > sizeof(Y)
        throw(
            ArgumentError(
                "cannot cast a $(sizeof(x))-byte $(typeof(x)) to the $(sizeof(Y))-byte type $Y",
            ),
        )
    end
    y = Ref{Y}()
    _unsafe_cast!(y, Ref(x), 1)
    return y[]
end

"""
    _unsafe_cast!(y::Ref{Y}, x::Ref, n::Integer) -> y

Copy `n` elements of type `eltype(x)` from the memory behind `x` to the memory behind `y`
and return `y`. No size or type checks are performed; the caller must make sure the
destination is large enough.
"""
function _unsafe_cast!(y::Ref{Y}, x::Ref, n::Integer) where {Y}
    X = eltype(x)
    # Keep both references rooted while raw pointers into them are in use.
    GC.@preserve x y begin
        ptr_x = Base.unsafe_convert(Ptr{X}, x)
        ptr_y = Base.unsafe_convert(Ptr{Y}, y)
        # View the destination as `X` so that the copy is counted in elements of `X`.
        unsafe_copyto!(Ptr{X}(ptr_y), ptr_x, n)
    end
    return y
end
"StructKind data are stored in separate buffers for each field of the struct"
struct StructKind <: ArrowKind end

ArrowKind(::Type{<:NamedTuple}) = StructKind()

# Field-name aware hook; by default the names are ignored and the field values
# are forwarded positionally to `fromarrow`.
@inline function fromarrowstruct(T::Type, ::Val, x...)
    return fromarrow(T, x...)
end

# A named tuple that already has exactly the requested type is returned as-is.
function fromarrow(
    ::Type{NamedTuple{names,types}},
    x::NamedTuple{names,types},
) where {names,types<:Tuple}
    return x
end

# Any other target type receives the named tuple's values as positional arguments.
function fromarrow(::Type{T}, x::NamedTuple) where {T}
    values = Tuple(x)
    return fromarrow(T, values...)
end

# Tuples are stored as structs and labelled so they can be restored on read.
const TUPLE = Symbol("JuliaLang.Tuple")
ArrowKind(::Type{<:Tuple}) = StructKind()
ArrowKind(::Type{Tuple{}}) = StructKind()

# needed to disambiguate the FixedSizeList case for NTuple
arrowname(::Type{NTuple{N,T}}) where {N,T} = EMPTY_SYMBOL
arrowname(::Type{T}) where {T<:Tuple} = TUPLE
arrowname(::Type{Tuple{}}) = TUPLE

# The Julia tuple type is the field-types parameter of the stored named tuple type.
function JuliaType(::Val{TUPLE}, ::Type{NamedTuple{names,types}}) where {names,types<:Tuple}
    return types
end

function fromarrow(::Type{T}, x::NamedTuple) where {T<:Tuple}
    return Tuple(x)
end
# VersionNumber: stored as a struct and labelled so it can be restored on read
const VERSION_NUMBER = Symbol("JuliaLang.VersionNumber")

ArrowKind(::Type{VersionNumber}) = StructKind()
arrowname(::Type{VersionNumber}) = VERSION_NUMBER
JuliaType(::Val{VERSION_NUMBER}) = VersionNumber

# Placeholder value used where a dummy `VersionNumber` has to be written.
default(::Type{VersionNumber}) = v"0"

# Rebuild the version from the five named fields of the stored struct.
function fromarrow(::Type{VersionNumber}, v::NamedTuple)
    major, minor, patch = v.major, v.minor, v.patch
    return VersionNumber(major, minor, patch, v.prerelease, v.build)
end
"MapKind data are stored similarly to ListKind, where elements are flattened, and a 2nd offsets buffer contains the individual list element length data"
struct MapKind <: ArrowKind end

function ArrowKind(::Type{<:AbstractDict})
    return MapKind()
end

"UnionKind data are stored either in a separate, compacted buffer for each union type (dense), or in full-length buffers for each union type (sparse)"
struct UnionKind <: ArrowKind end

# The argument here is a `Union` type object itself, e.g. `Union{String,Int}`.
function ArrowKind(::Union)
    return UnionKind()
end

"DictEncodedKind store a small pool of unique values in one buffer, with a full-length buffer of integer offsets into the small value pool"
struct DictEncodedKind <: ArrowKind end
"""
    ArrowTypes.default(T)

There are a couple of places when writing arrow buffers where
we need to write a "dummy" value; it doesn't really matter
what we write, but we need to write something of a specific
type. So each supported writing type needs to define `default`.
"""
function default end

# Generic fallback: whatever `zero` gives for the type.
default(T) = zero(T)

# Types without a meaningful `zero`.
default(::Type{Symbol}) = Symbol()
default(::Type{Char}) = '\0'
default(::Type{<:AbstractString}) = ""
default(::Type{Any}) = nothing
default(::Type{Missing}) = missing
default(::Type{Nothing}) = nothing

# For unions with `Missing` and/or `Nothing`, use the default of the remaining type.
function default(::Type{Union{T,Missing}}) where {T}
    return default(T)
end
function default(::Type{Union{T,Nothing}}) where {T}
    return default(T)
end
function default(::Type{Union{T,Missing,Nothing}}) where {T}
    return default(T)
end

# Vectors: a one-element vector holding the element type's default.
function default(::Type{A}) where {A<:AbstractVector{T}} where {T}
    placeholder = similar(A, 1)
    placeholder[1] = default(T)
    return placeholder
end

# Views: an empty view into the parent type's default.
function default(::Type{SubArray{T,N,P,I,L}}) where {T,N,P,I,L}
    parent_default = default(P)
    return view(parent_default, 0:-1)
end

# Homogeneous tuples: `N` copies of the element default.
function default(::Type{NTuple{N,T}}) where {N,T}
    return ntuple(_ -> default(T), N)
end

default(::Type{Tuple{}}) = ()

# General tuples: one default per fixed field; a trailing `Vararg` contributes nothing.
function default(::Type{T}) where {T<:Tuple}
    T === Tuple{} && return ()
    params = T.parameters
    nfixed = Base.isvarargtype(params[end]) ? length(params) - 1 : fieldcount(T)
    return ntuple(i -> default(fieldtype(T, i)), nfixed)
end

# Dictionaries: an empty instance.
function default(::Type{T}) where {T<:AbstractDict}
    return T()
end

# Named tuples: the default of every field type, under the same names.
function default(::Type{NamedTuple{names,types}}) where {names,types}
    field_defaults = ntuple(i -> default(fieldtype(types, i)), length(names))
    return NamedTuple{names}(field_defaults)
end
# Combine two types: use their `promote_type` unless that is an abstract type,
# in which case the plain `Union{T,S}` is used instead.
function promoteunion(T, S)
    promoted = promote_type(T, S)
    if isabstracttype(promoted)
        return Union{T,S}
    end
    return promoted
end
# lazily call toarrow(x) on getindex for each x in data
# `T` is the element type reported by the wrapper (see `Base.eltype` for `ToArrow`);
# `A` is the type of the wrapped collection stored in `data`.
struct ToArrow{T,A} <: AbstractVector{T}
data::A
end
# `true` if `T` is a concrete type, or a `Union` whose two halves (`T.a`, `T.b`)
# both satisfy this same condition recursively.
function concrete_or_concreteunion(T)
    isconcretetype(T) && return true
    T isa Union || return false
    return concrete_or_concreteunion(T.a) && concrete_or_concreteunion(T.b)
end
# Wrap `x` so that indexing yields `toarrow`-converted elements.
#
# * `x` itself is returned when its element type is already its own arrow type,
#   is concrete (or a union of concrete types), and `x` is 1-based.
# * When the declared arrow type is not concrete, a concrete element type is
#   computed from the converted elements; an empty `x` yields `Missing[]`.
function ToArrow(x::A) where {A}
    S = eltype(A)
    T = ArrowType(S)
    fi = firstindex(x)
    if S === T && concrete_or_concreteunion(S) && fi == 1
        return x
    end
    if !concrete_or_concreteunion(T)
        # arrow needs concrete types, so try to find a concrete common type, preferring unions
        isempty(x) && return Missing[]
        T = mapreduce(el -> typeof(toarrow(el)), promoteunion, x)
        # Only missings were seen: if the source element type is concrete, add the
        # arrow type of its default value to the result.
        if T === Missing && concrete_or_concreteunion(S)
            fallback = typeof(toarrow(default(S)))
            T = promoteunion(T, fallback)
        end
    end
    return ToArrow{T,A}(x)
end
# AbstractVector interface for `ToArrow`: linear indexing, length taken from
# the wrapped data, element type taken from the first type parameter.
function Base.IndexStyle(::Type{<:ToArrow})
    return Base.IndexLinear()
end

function Base.size(x::ToArrow)
    return (length(x.data),)
end

function Base.eltype(::Type{TA}) where {T,A,TA<:ToArrow{T,A}}
    return T
end
# Convert `x` to `T`, where `T` may be a promoted `Union`.
function _convert(::Type{T}, x) where {T}
    # Nothing to do when `x` already is a `T`.
    x isa T && return x
    # Non-union targets go through the regular `convert`.
    T isa Union || return convert(T, x)
    # T was a promoted Union and x is not already one of
    # the concrete Union types, so we need to just try
    # to convert, recursively, to one of the Union types
    # unfortunately not much we can do more efficiently here
    try
        return _convert(T.a, x)
    catch
        return _convert(T.b, x)
    end
end
# Fetch element `i` (1-based) of the wrapper: shift `i` by the wrapped data's
# first index, convert the element with `toarrow`, then coerce it to `T`.
function Base.getindex(x::ToArrow{T}, i::Int) where {T}
    element = getindex(x.data, i + firstindex(x.data) - 1)
    return _convert(T, toarrow(element))
end
end # module ArrowTypes
================================================
FILE: src/ArrowTypes/test/Project.toml
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
[deps]
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
================================================
FILE: src/ArrowTypes/test/runtests.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
using Test, ArrowTypes, UUIDs, Sockets, OffsetArrays
include("tests.jl")
================================================
FILE: src/ArrowTypes/test/tests.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Test fixtures: a custom primitive type and a plain struct, used below to
# exercise the default `ArrowKind` fallback and the default `fromarrow`.
primitive type MyInt 32 end
struct Person
id::Int
name::String
end
@testset "ArrowTypes" begin
# Defaults of the generic interface methods
@test ArrowTypes.ArrowKind(MyInt) == ArrowTypes.PrimitiveKind()
@test ArrowTypes.ArrowKind(Person) == ArrowTypes.StructKind()
@test ArrowTypes.ArrowKind(Person(0, "bob")) == ArrowTypes.StructKind()
@test ArrowTypes.ArrowType(Int) == Int
@test ArrowTypes.ArrowType(Union{Int,Missing}) == Union{Int,Missing}
@test ArrowTypes.ArrowType(Missing) == Missing
@test ArrowTypes.toarrow(1) === 1
@test ArrowTypes.arrowname(Int) == Symbol()
@test !ArrowTypes.hasarrowname(Int)
@test ArrowTypes.arrowmetadata(Int) == ""
@test ArrowTypes.arrowmetadata(Union{Nothing,Int}) == ""
@test ArrowTypes.arrowmetadata(Union{Missing,Int}) == ""
@test ArrowTypes.JuliaType(1) === nothing
@test ArrowTypes.JuliaType(1, Int) === nothing
@test ArrowTypes.JuliaType(1, Int, nothing) === nothing
@test ArrowTypes.fromarrow(Int, 1) === 1
@test ArrowTypes.fromarrow(Person, 1, "bob") == Person(1, "bob")
@test ArrowTypes.fromarrow(Union{Int,Missing}, missing) === missing
@test ArrowTypes.fromarrow(Union{Int,Missing}, 1) === 1
@test ArrowTypes.fromarrow(Union{Float64,Missing}, 1) === 1.0
# NullKind: Missing and Nothing
@test ArrowTypes.ArrowKind(Missing) == ArrowTypes.NullKind()
@test ArrowTypes.ArrowKind(Nothing) == ArrowTypes.NullKind()
@test ArrowTypes.ArrowType(Nothing) == Missing
@test ArrowTypes.toarrow(nothing) === missing
@test ArrowTypes.arrowname(Nothing) == ArrowTypes.NOTHING
@test ArrowTypes.JuliaType(Val(ArrowTypes.NOTHING)) == Nothing
@test ArrowTypes.fromarrow(Nothing, missing) === nothing
# PrimitiveKind: numbers and Char
@test ArrowTypes.ArrowKind(Int) == ArrowTypes.PrimitiveKind()
@test ArrowTypes.ArrowKind(Float64) == ArrowTypes.PrimitiveKind()
@test ArrowTypes.ArrowType(Char) == UInt32
@test ArrowTypes.toarrow('1') == UInt32('1')
@test ArrowTypes.arrowname(Char) == ArrowTypes.CHAR
@test ArrowTypes.JuliaType(Val(ArrowTypes.CHAR)) == Char
@test ArrowTypes.fromarrow(Char, UInt32('1')) == '1'
# BoolKind
@test ArrowTypes.ArrowKind(Bool) == ArrowTypes.BoolKind()
# ListKind: strings, code units, symbols, arrays and sets
@test ArrowTypes.ListKind() == ArrowTypes.ListKind{false}()
@test !ArrowTypes.isstringtype(ArrowTypes.ListKind())
@test !ArrowTypes.isstringtype(typeof(ArrowTypes.ListKind()))
@test ArrowTypes.ArrowKind(String) == ArrowTypes.ListKind{true}()
@test ArrowTypes.ArrowKind(Base.CodeUnits) == ArrowTypes.ListKind{true}()
# raw bytes used to exercise the pointer + length `fromarrow` methods
hey = collect(b"hey")
@test ArrowTypes.fromarrow(String, pointer(hey), 3) == "hey"
@test ArrowTypes.fromarrow(Base.CodeUnits, pointer(hey), 3) == b"hey"
@test ArrowTypes.fromarrow(Union{Base.CodeUnits,Missing}, pointer(hey), 3) == b"hey"
@test ArrowTypes.ArrowType(Symbol) == String
@test ArrowTypes.toarrow(:hey) == "hey"
@test ArrowTypes.arrowname(Symbol) == ArrowTypes.SYMBOL
@test ArrowTypes.JuliaType(Val(ArrowTypes.SYMBOL)) == Symbol
@test ArrowTypes.fromarrow(Symbol, pointer(hey), 3) == :hey
@test ArrowTypes.ArrowKind(Vector{Int}) == ArrowTypes.ListKind()
@test ArrowTypes.ArrowKind(Set{Int}) == ArrowTypes.ListKind()
@test ArrowTypes.ArrowType(Set{Int}) == Vector{Int}
@test typeof(ArrowTypes.toarrow(Set([1, 2, 3]))) <: Vector{Int}
@test ArrowTypes.arrowname(Set{Int}) == ArrowTypes.SET
@test ArrowTypes.JuliaType(Val(ArrowTypes.SET), Vector{Int}) == Set{Int}
@test ArrowTypes.fromarrow(Set{Int}, [1, 2, 3]) == Set([1, 2, 3])
# FixedSizeListKind: NTuple, UUID
K = ArrowTypes.ArrowKind(NTuple{3,UInt8})
@test ArrowTypes.gettype(K) == UInt8
@test ArrowTypes.getsize(K) == 3
@test K == ArrowTypes.FixedSizeListKind{3,UInt8}()
u = UUID(rand(UInt128))
ubytes = ArrowTypes._cast(NTuple{16,UInt8}, u.value)
@test ArrowTypes.ArrowKind(u) == ArrowTypes.FixedSizeListKind{16,UInt8}()
@test ArrowTypes.ArrowType(UUID) == NTuple{16,UInt8}
@test ArrowTypes.toarrow(u) == ubytes
@test ArrowTypes.arrowname(UUID) == ArrowTypes.UUIDSYMBOL
@test ArrowTypes.JuliaType(Val(ArrowTypes.UUIDSYMBOL)) == UUID
@test ArrowTypes.fromarrow(UUID, ubytes) == u
# IPv4 (primitive UInt32) and IPv6 (16 bytes)
ip4 = IPv4(rand(UInt32))
@test ArrowTypes.ArrowKind(ip4) == PrimitiveKind()
@test ArrowTypes.ArrowType(IPv4) == UInt32
@test ArrowTypes.toarrow(ip4) == ip4.host
@test ArrowTypes.arrowname(IPv4) == ArrowTypes.IPV4_SYMBOL
@test ArrowTypes.JuliaType(Val(ArrowTypes.IPV4_SYMBOL)) == IPv4
@test ArrowTypes.fromarrow(IPv4, ip4.host) == ip4
ip6 = IPv6(rand(UInt128))
ip6_ubytes = ArrowTypes._cast(NTuple{16,UInt8}, ip6.host)
@test ArrowTypes.ArrowKind(ip6) == ArrowTypes.FixedSizeListKind{16,UInt8}()
@test ArrowTypes.ArrowType(IPv6) == NTuple{16,UInt8}
@test ArrowTypes.toarrow(ip6) == ip6_ubytes
@test ArrowTypes.arrowname(IPv6) == ArrowTypes.IPV6_SYMBOL
@test ArrowTypes.JuliaType(Val(ArrowTypes.IPV6_SYMBOL)) == IPv6
@test ArrowTypes.fromarrow(IPv6, ip6_ubytes) == ip6
# StructKind: named tuples, tuples, VersionNumber
nt = (id=1, name="bob")
@test ArrowTypes.ArrowKind(NamedTuple) == ArrowTypes.StructKind()
@test ArrowTypes.fromarrow(typeof(nt), nt) === nt
@test ArrowTypes.fromarrow(Person, nt) == Person(1, "bob")
@test ArrowTypes.ArrowKind(Tuple) == ArrowTypes.StructKind()
@test ArrowTypes.ArrowKind(Tuple{}) == ArrowTypes.StructKind()
@test ArrowTypes.arrowname(Tuple{Int,String}) == ArrowTypes.TUPLE
@test ArrowTypes.arrowname(Tuple{}) == ArrowTypes.TUPLE
@test ArrowTypes.JuliaType(
Val(ArrowTypes.TUPLE),
NamedTuple{(Symbol("1"), Symbol("2")),Tuple{Int,String}},
) == Tuple{Int,String}
@test ArrowTypes.fromarrow(Tuple{Int,String}, nt) == (1, "bob")
@test ArrowTypes.fromarrow(Union{Missing,typeof(nt)}, nt) == nt
# #461
@test ArrowTypes.default(Tuple{}) == ()
@test ArrowTypes.default(Tuple{Vararg{Int}}) == ()
@test ArrowTypes.default(Tuple{String,Vararg{Int}}) == ("",)
v = v"1"
v_nt = (major=1, minor=0, patch=0, prerelease=(), build=())
@test ArrowTypes.ArrowKind(VersionNumber) == ArrowTypes.StructKind()
@test ArrowTypes.arrowname(VersionNumber) == ArrowTypes.VERSION_NUMBER
@test ArrowTypes.JuliaType(Val(ArrowTypes.VERSION_NUMBER)) == VersionNumber
@test ArrowTypes.fromarrow(typeof(v), v_nt) == v
@test ArrowTypes.default(VersionNumber) == v"0"
# MapKind and UnionKind
@test ArrowTypes.ArrowKind(Dict{String,Int}) == ArrowTypes.MapKind()
@test ArrowTypes.ArrowKind(Union{String,Int}) == ArrowTypes.UnionKind()
# `default` placeholder values
@test ArrowTypes.default(Int) == Int(0)
@test ArrowTypes.default(Symbol) == Symbol()
@test ArrowTypes.default(Char) == '\0'
@test ArrowTypes.default(String) == ""
@test ArrowTypes.default(Missing) === missing
@test ArrowTypes.default(Nothing) === nothing
@test ArrowTypes.default(Union{Int,Missing}) == Int(0)
@test ArrowTypes.default(Union{Int,Nothing}) == Int(0)
@test ArrowTypes.default(Union{Int,Missing,Nothing}) == Int(0)
# Type promotion helpers used by `ToArrow`
@test ArrowTypes.promoteunion(Int, Float64) == Float64
@test ArrowTypes.promoteunion(Int, String) == Union{Int,String}
@test ArrowTypes.concrete_or_concreteunion(Int)
@test !ArrowTypes.concrete_or_concreteunion(Union{Real,String})
@test !ArrowTypes.concrete_or_concreteunion(Any)
@testset "ToArrow" begin
# already arrow-native, concrete and 1-based: the input itself comes back
x = ArrowTypes.ToArrow([1, 2, 3])
@test x isa Vector{Int}
@test x == [1, 2, 3]
# element type with a custom ArrowType: lazily converted wrapper
x = ArrowTypes.ToArrow([:hey, :ho])
@test x isa ArrowTypes.ToArrow{String,Vector{Symbol}}
@test eltype(x) == String
@test x == ["hey", "ho"]
# non-concrete element type: a concrete type is computed from the elements
x = ArrowTypes.ToArrow(Any[1, 3.14])
@test x isa ArrowTypes.ToArrow{Float64,Vector{Any}}
@test eltype(x) == Float64
@test x == [1.0, 3.14]
x = ArrowTypes.ToArrow(Any[1, 3.14, "hey"])
@test x isa ArrowTypes.ToArrow{Union{Float64,String},Vector{Any}}
@test eltype(x) == Union{Float64,String}
@test x == [1.0, 3.14, "hey"]
# inputs that are not 1-based are wrapped and indexed from 1
x = ArrowTypes.ToArrow(OffsetArray([1, 2, 3], -3:-1))
@test x isa ArrowTypes.ToArrow{Int,OffsetVector{Int,Vector{Int}}}
@test eltype(x) == Int
@test x == [1, 2, 3]
x = ArrowTypes.ToArrow(OffsetArray(Any[1, 3.14], -3:-2))
@test x isa ArrowTypes.ToArrow{Float64,OffsetVector{Any,Vector{Any}}}
@test eltype(x) == Float64
@test x == [1, 3.14]
@testset "respect non-missing concrete type" begin
struct DateTimeTZ
instant::Int64
tz::String
end
struct Timestamp{TZ}
x::Int64
end
ArrowTypes.ArrowType(::Type{DateTimeTZ}) = Timestamp
ArrowTypes.toarrow(x::DateTimeTZ) = Timestamp{Symbol(x.tz)}(x.instant)
ArrowTypes.default(::Type{DateTimeTZ}) = DateTimeTZ(0, "UTC")
T = Union{DateTimeTZ,Missing}
@test !ArrowTypes.concrete_or_concreteunion(ArrowTypes.ArrowType(T))
@test eltype(ArrowTypes.ToArrow(T[missing])) == Union{Timestamp{:UTC},Missing}
# Works since `ArrowTypes.default(Any) === nothing` and
# `ArrowTypes.toarrow(nothing) === missing`. Defining `toarrow(::Nothing) = nothing`
# would break this test by returning `Union{Nothing,Missing}`.
@test eltype(ArrowTypes.ToArrow(Any[missing])) == Missing
end
@testset "ignore non-missing abstract type" begin
x = ArrowTypes.ToArrow(Union{Missing,Array{Int}}[missing])
@test x isa ArrowTypes.ToArrow{Missing,Vector{Union{Missing,Array{Int64}}}}
@test eltype(x) == Missing
@test isequal(x, [missing])
end
end
end
================================================
FILE: src/FlatBuffers/FlatBuffers.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
module FlatBuffers
import Base: ==
const UOffsetT = UInt32
const SOffsetT = Int32
const VOffsetT = UInt16
const VtableMetadataFields = 2
# Integer storage type backing an enum. The instance method defers to the type
# method so that enums declared with a wider base type (e.g. `@enum X::Int16`)
# are serialized with their full width instead of being narrowed to `UInt8`
# (which also throws for negative enum values).
basetype(x::Enum) = basetype(typeof(x))
basetype(::Type{T}) where {T<:Enum{S}} where {S} = S
# Read a `Bool` stored at zero-based byte offset `pos`; only the byte 0x01
# counts as `true`.
function readbuffer(t::AbstractVector{UInt8}, pos::Integer, ::Type{Bool})
    byte = @inbounds t[pos + 1]
    return byte === 0x01
end

# Load a `T` from the bytes starting at zero-based offset `pos`. The buffer is
# kept rooted for the duration of the raw pointer load.
function readbuffer(t::AbstractVector{UInt8}, pos::Integer, ::Type{T}) where {T}
    return GC.@preserve t unsafe_load(convert(Ptr{T}, pointer(t, pos + 1)))
end
include("builder.jl")
include("table.jl")
# Print a table/struct as its type name followed by a NamedTuple of all of its
# properties (or `()` when it exposes none).
function Base.show(io::IO, x::TableOrStruct)
    print(io, "$(typeof(x))")
    names = propertynames(x)
    if isempty(names)
        print(io, "()")
    else
        vals = Tuple(getproperty(x, name) for name in names)
        show(io, NamedTuple{names}(vals))
    end
end
end # module
================================================
FILE: src/FlatBuffers/builder.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Required length of a file identifier; `finishwithfileidentifier` raises an
# error for identifiers of any other length.
const fileIdentifierLength = 4
# Note: besides the numeric types, the union below also admits `Bool` and
# `Enum`; the builder writes an `Enum` through its integer base type.
"""
Scalar
A Union of the Julia types `T <: Number` that are allowed in FlatBuffers schema
"""
const Scalar =
Union{Bool,Int8,Int16,Int32,Int64,UInt8,UInt16,UInt32,UInt64,Float32,Float64,Enum}
"""
Builder is a state machine for creating FlatBuffer objects.
Use a Builder to construct object(s) starting from leaf nodes.
A Builder constructs byte buffers in a last-first manner for simplicity and
performance.
"""
mutable struct Builder
# Backing byte buffer; data is placed from the end toward the front.
bytes::Vector{UInt8}
# Largest `size` ever passed to `prep!`; used to align the finished buffer.
minalign::Int
# Field offsets of the object currently under construction (see `slot!`).
vtable::Vector{UOffsetT}
# Value of `offset(b)` captured by `startobject!`.
objectend::UOffsetT
# Offsets of vtables already written, consulted for deduplication.
vtables::Vector{UOffsetT}
# Zero-based index into `bytes` of the first written byte; decreases on writes.
head::UOffsetT
# True while an object, vector or string is being constructed.
nested::Bool
# Set by `finish!`; checked by `finishedbytes`.
finished::Bool
# Cache of offsets of strings created through `createsharedstring!`.
sharedstrings::Dict{String,UOffsetT}
end
# Direct field access to the builder's byte buffer.
function bytes(b::Builder)
    return getfield(b, :bytes)
end

# Construct a builder over `size` zero bytes. `head` starts at the end of that
# buffer, so nothing counts as written yet.
function Builder(size=0)
    return Builder(
        zeros(UInt8, size),       # bytes
        1,                        # minalign
        UOffsetT[],               # vtable
        UOffsetT(0),              # objectend
        UOffsetT[],               # vtables
        UOffsetT(size),           # head
        false,                    # nested
        false,                    # finished
        Dict{String,UOffsetT}(),  # sharedstrings
    )
end
# Return the builder to its freshly-constructed (empty) state so it can be
# reused: drops all written bytes, the in-progress vtable, the vtable
# deduplication list and the shared-string cache, and clears the state flags.
function reset!(b::Builder)
    empty!(b.bytes)
    empty!(b.vtable)
    empty!(b.vtables) # was misspelled `emtpy!`, which threw an UndefVarError
    empty!(b.sharedstrings)
    b.minalign = 1
    b.nested = false
    b.finished = false
    # The buffer is now empty, so the write head sits at index 0.
    b.head = 0
    return
end
# Single-byte values are stored directly at zero-based offset `o`.
function Base.write(sink::Builder, o, x::Union{Bool,UInt8})
    byte = UInt8(x)
    sink.bytes[o + 1] = byte
    return byte
end

# Generic writer: emit `x` one byte at a time, least-significant byte first,
# starting at zero-based offset `off`.
function Base.write(sink::Builder, off, x::T) where {T}
    for k in 0:(sizeof(T) - 1)
        sink.bytes[off + 1 + k] = (x >> (k * 8)) % UInt8
    end
end

# Floats are written through their same-width unsigned bit pattern.
function Base.write(b::Builder, o, x::Float32)
    return write(b, o, reinterpret(UInt32, x))
end
function Base.write(b::Builder, o, x::Float64)
    return write(b, o, reinterpret(UInt64, x))
end

# Enums are converted to the integer type reported by `basetype` first.
function Base.write(b::Builder, o, x::Enum)
    return write(b, o, basetype(x)(x))
end
"""
`finishedbytes` returns a pointer to the written data in the byte buffer.
Panics if the builder is not in a finished state (which is caused by calling
`finish!()`).
"""
function finishedbytes(b::Builder)
    # Throws an ArgumentError unless `finish!` has been called.
    assertfinished(b)
    # The written data occupies everything from `head` to the end of the buffer;
    # a view is returned, so no bytes are copied.
    firstwritten = b.head + 1
    return view(b.bytes, firstwritten:length(b.bytes))
end
# Begin a new object with `numfields` vtable slots. Fails if another
# object/vector/string is still being built.
function startobject!(b::Builder, numfields)
    assertnotnested(b)
    b.nested = true
    # Fresh vtable: one zeroed slot per field.
    fill!(resize!(b.vtable, numfields), 0)
    # Remember where the object's data ends (current offset from buffer end).
    b.objectend = offset(b)
    return
end
"""
WriteVtable serializes the vtable for the current object, if applicable.
Before writing out the vtable, this checks pre-existing vtables for equality
to this one. If an equal vtable is found, point the object to the existing
vtable and return.
Because vtable values are sensitive to alignment of object data, not all
logically-equal vtables will be deduplicated.
A vtable has the following format:
<VOffsetT: size of the vtable in bytes, including this value>
<VOffsetT: size of the object in bytes, including the vtable offset>
<VOffsetT: offset for a field> * N, where N is the number of fields in
the schema for this type. Includes deprecated fields.
Thus, a vtable is made of 2 + N elements, each SizeVOffsetT bytes wide.
An object has the following format:
<SOffsetT: offset to this object's vtable (may be negative)>
<byte: data>+
"""
function writevtable!(b::Builder)
# Returns the offset (measured from the end of the buffer) of the object
# whose vtable was written or reused.
# Prepend a zero scalar to the object. Later in this function we'll
# write an offset here that points to the object's vtable:
prepend!(b, SOffsetT(0))
objectOffset = offset(b)
existingVtable = UOffsetT(0)
# Trim vtable of trailing zeroes.
# NOTE(review): when every slot is zero, `findlast` returns `nothing` and the
# vtable keeps its full length instead of being trimmed to empty.
i = findlast(!iszero, b.vtable)
if i !== nothing
resize!(b.vtable, i)
end
# Search backwards through existing vtables, because similar vtables
# are likely to have been recently appended. See
# BenchmarkVtableDeduplication for a case in which this heuristic
# saves about 30% of the time used in writing objects with duplicate
# tables.
for i = length(b.vtables):-1:1
# Find the other vtable, which is associated with `i`:
vt2Offset = b.vtables[i]
# Stored offsets count from the end of the buffer; convert to a zero-based
# position from the front.
vt2Start = length(b.bytes) - vt2Offset
vt2Len = readbuffer(b.bytes, vt2Start, VOffsetT)
metadata = VtableMetadataFields * sizeof(VOffsetT)
vt2End = vt2Start + vt2Len
vt2 = view(b.bytes, (vt2Start + metadata + 1):vt2End) #TODO: might need a +1 on the start of range here
# Compare the other vtable to the one under consideration.
# If they are equal, store the offset and break:
if vtableEqual(b.vtable, objectOffset, vt2)
existingVtable = vt2Offset
break
end
end
if existingVtable == 0
# Did not find a vtable, so write this one to the buffer.
# Write out the current vtable in reverse , because
# serialization occurs in last-first order:
for i = length(b.vtable):-1:1
off::UOffsetT = 0
if b.vtable[i] != 0
# Forward reference to field;
# use 32bit number to assert no overflow:
off = objectOffset - b.vtable[i]
end
prepend!(b, VOffsetT(off))
end
# The two metadata fields are written last.
# First, store the object bytesize:
objectSize = objectOffset - b.objectend
prepend!(b, VOffsetT(objectSize))
# Second, store the vtable bytesize:
vbytes = (length(b.vtable) + VtableMetadataFields) * sizeof(VOffsetT)
prepend!(b, VOffsetT(vbytes))
# Next, write the offset to the new vtable in the
# already-allocated SOffsetT at the beginning of this object:
objectStart = SOffsetT(length(b.bytes) - objectOffset)
write(b, objectStart, SOffsetT(offset(b) - objectOffset))
# Finally, store this vtable in memory for future
# deduplication:
push!(b.vtables, offset(b))
else
# Found a duplicate vtable.
objectStart = SOffsetT(length(b.bytes) - objectOffset)
b.head = objectStart
# Write the offset to the found vtable in the
# already-allocated SOffsetT at the beginning of this object:
write(b, b.head, SOffsetT(existingVtable) - SOffsetT(objectOffset))
end
# The in-progress vtable is consumed; the next `startobject!` resizes it again.
empty!(b.vtable)
return objectOffset
end
"""
`endobject` writes data necessary to finish object construction.
"""
function endobject!(b::Builder)
    # Only valid while an object is open.
    assertnested(b)
    objectoffset = writevtable!(b)
    b.nested = false
    # Callers receive the offset of the finished object.
    return objectoffset
end
# Number of bytes written so far, i.e. the distance from `head` to the end of
# the buffer.
function offset(b::Builder)
    written = length(b.bytes) - b.head
    return UOffsetT(written)
end

# Place `n` zero bytes in front of the data written so far.
function pad!(b::Builder, n)
    for _ in 1:n
        place!(b, 0x00)
    end
    return nothing
end
"""
`prep!` prepares to write an element of `size` after `additionalbytes`
have been written, e.g. if you write a string, you need to align such
the int length field is aligned to sizeof(Int32), and the string data follows it
directly.
If all you need to do is align, `additionalbytes` will be 0.
"""
function prep!(b::Builder, size, additionalbytes)
# Track the biggest thing we've ever aligned to.
if size > b.minalign
b.minalign = size
end
# Find the amount of alignment needed such that `size` is properly
# aligned after `additionalBytes`:
# (`xor(-1, n) + 1` is the two's-complement negation of `n`, so this is
# `(-n) & (size - 1)`; assumes `size` is a power of two -- TODO confirm.)
alignsize = xor(Int(-1), (length(b.bytes) - b.head) + additionalbytes) + 1
alignsize &= (size - 1)
# Reallocate the buffer if needed:
totalsize = alignsize + size + additionalbytes
if b.head <= totalsize
len = length(b.bytes)
# Growing at the front shifts the already-written data toward the end, so
# `head` must advance by the number of bytes that were added.
prepend!(b.bytes, zeros(UInt8, totalsize))
b.head += length(b.bytes) - len
end
pad!(b, alignsize)
return
end
# Align for a value of type `T`, then place `x` in front of the written data.
function Base.prepend!(b::Builder, x::T) where {T}
    nbytes = sizeof(T)
    prep!(b, nbytes, 0)
    place!(b, x)
    return nothing
end
# Prepend a relative offset that points at the already-written location `off`.
function prependoffset!(b::Builder, off)
    # Align first so the offset computed below refers to the final position.
    prep!(b, sizeof(Int32), 0)
    current = offset(b)
    # A target beyond what has been written so far cannot be referenced.
    off <= current || throw(ArgumentError("unreachable: $off <= $(current)"))
    # The stored value is relative to the slot itself, hence the extra
    # `sizeof(SOffsetT)` for the bytes about to be placed.
    place!(b, SOffsetT(current - off + sizeof(SOffsetT)))
    return
end
# Store offset `x` in vtable slot `o`, unless it equals the default `d`, in
# which case nothing is written.
function prependoffsetslot!(b::Builder, o::Int, x::T, d) where {T}
    x != T(d) || return
    prependoffset!(b, x)
    slot!(b, o)
    return
end
"""
`startvector` initializes bookkeeping for writing a new vector.
A vector has the following format:
<UOffsetT: number of elements in this vector>
<T: data>+, where T is the type of elements of this vector.
"""
function startvector!(b::Builder, elemSize, numElems, alignment)
    assertnotnested(b)
    b.nested = true
    payload = elemSize * numElems
    # Align once for the UInt32 length prefix and once for the element type,
    # both relative to the payload that will be written first.
    prep!(b, sizeof(UInt32), payload)
    prep!(b, alignment, payload)
    return offset(b)
end
"""
`endvector` writes data necessary to finish vector construction.
"""
function endvector!(b::Builder, vectorNumElems)
    assertnested(b)
    # The element count goes in front of the already-written elements.
    count = UOffsetT(vectorNumElems)
    place!(b, count)
    b.nested = false
    return offset(b)
end
# Return the offset of `s`, writing the string only the first time it is seen;
# later requests for an equal string reuse the cached offset.
function createsharedstring!(b::Builder, s::AbstractString)
    return get!(() -> createstring!(b, s), b.sharedstrings, s)
end
"""
`createstring!` writes a null-terminated string as a vector.
"""
function createstring!(b::Builder, s::Union{AbstractString,AbstractVector{UInt8}})
    assertnotnested(b)
    b.nested = true
    # `codeunits` is only defined for strings; a byte vector is already the raw
    # data and is used as-is (previously this path raised a MethodError).
    data = s isa AbstractString ? codeunits(s) : s
    n = length(data)
    # Reserve room for the payload plus the trailing null byte, aligned so the
    # UInt32 length prefix lands on a 4-byte boundary.
    prep!(b, sizeof(UInt32), n + 1)
    place!(b, UInt8(0))
    # Copy the payload in one go directly in front of the terminator.
    b.head -= n
    copyto!(b.bytes, b.head + 1, data, 1, n)
    # `endvector!` writes the length prefix and returns the resulting offset.
    return endvector!(b, n)
end
# Byte vectors share the string layout (length prefix, data, trailing zero).
createbytevector(b::Builder, v) = createstring!(b, v)
# Guard for operations that are only legal while an object/vector is open.
# Hitting this means data belonging inside an object is being written outside
# of one; write non-inline data (like vectors) before creating objects.
function assertnested(b::Builder)
    b.nested || throw(ArgumentError("Incorrect creation order: must be inside object."))
    return
end
# Guard for operations that must not run while another object/vector/string is
# under construction. Tables, vectors and strings have to be created before
# their parent table is started: storing them in-line could make vtable
# offsets no longer fit and leads to vtable duplication.
function assertnotnested(b::Builder)
    b.nested &&
        throw(ArgumentError("Incorrect creation order: object must not be nested."))
    return
end
# Guard for accessing the finished buffer: `finish!` must have been called with
# the root table first. Code that really needs an unfinished buffer should read
# the `bytes` field directly.
function assertfinished(b::Builder)
    b.finished ||
        throw(ArgumentError("Incorrect use of FinishedBytes(): must call 'Finish' first."))
    return nothing
end
"""
`prependslot!` prepends a `T` onto the object at vtable slot `o`.
If value `x` equals default `d`, then the slot will be set to zero and no
other data will be written.
"""
function prependslot!(b::Builder, o::Int, x::T, d, sh=false) where {T<:Scalar}
    # `sh` is part of the signature but is not read by this method.
    # A value equal to the default is omitted entirely (slot stays zero).
    x != T(d) || return
    prepend!(b, x)
    slot!(b, o)
    return
end
"""
`prependstructslot!` prepends a struct onto the object at vtable slot `o`.
Structs are stored inline, so nothing additional is being added.
In generated code, `d` is always 0.
"""
function prependstructslot!(b::Builder, voffset, x, d)
    # Nothing to record when the struct offset equals the default.
    x != d || return
    assertnested(b)
    # An inline struct must be the most recently written data.
    x != offset(b) && throw(ArgumentError("inline data write outside of object"))
    slot!(b, voffset)
    return
end
"""
`slot!` sets the vtable key `voffset` to the current location in the buffer.
"""
function slot!(b::Builder, slotnum)
    # `slotnum` is zero-based; the vtable vector is one-based.
    here = offset(b)
    b.vtable[slotnum + 1] = here
    return here
end
# Finalize the buffer so that it points at `rootTable` and additionally embed
# the file identifier `fid` (which must have `fileIdentifierLength` elements).
function finishwithfileidentifier(b::Builder, rootTable, fid)
    length(fid) == fileIdentifierLength || error("incorrect file identifier length")
    # Align for the root offset plus the identifier that precede the data.
    prep!(b, b.minalign, sizeof(Int32) + fileIdentifierLength)
    # The buffer grows toward the front, so the identifier is placed last
    # element first.
    for i in reverse(1:fileIdentifierLength)
        place!(b, fid[i])
    end
    return finish!(b, rootTable)
end
"""
`finish!` finalizes a buffer, pointing to the given `rootTable`.
"""
function finish!(b::Builder, rootTable)
    assertnotnested(b)
    # Align the whole buffer to the largest alignment seen, accounting for the
    # root offset written next.
    prep!(b, b.minalign, sizeof(UOffsetT))
    root = UOffsetT(rootTable)
    prependoffset!(b, root)
    b.finished = true
    return nothing
end
"vtableEqual compares an unwritten vtable to a written vtable."
function vtableEqual(a::Vector{UOffsetT}, objectStart, b::AbstractVector{UInt8})
    width = sizeof(VOffsetT)
    # Different entry counts can never match.
    length(a) * width == length(b) || return false
    for (k, fieldoff) in enumerate(a)
        first = (k - 1) * width + 1
        stored = read(IOBuffer(view(b, first:length(b))), VOffsetT)
        # Both sides mark this field as "default value": nothing to compare.
        stored == 0 && fieldoff == 0 && continue
        # Written entries are relative to the object start; convert the
        # unwritten one to the same form before comparing.
        stored == objectStart - fieldoff || return false
    end
    return true
end
"""
`place!` prepends a `T` to the Builder, without checking for space.
"""
function place!(b::Builder, x::T) where {T}
    # Move the head back by the value's size, then write at the new head.
    # No capacity check is made here; `prep!` is responsible for that.
    b.head -= sizeof(T)
    write(b, b.head, x)
    return nothing
end
================================================
FILE: src/FlatBuffers/table.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Table
The object containing the flatbuffer and positional information specific to the table.
The `vtable` containing the offsets for specific members precedes `pos`.
The actual values in the table follow `pos` offset and size of the vtable.
- `bytes::Vector{UInt8}`: the flatbuffer itself
- `pos::Integer`: the base position in `bytes` of the table
"""
abstract type Table end
# Supertype for FlatBuffers struct wrappers.
abstract type Struct end
# Anything that carries a `bytes` buffer and a `pos` position.
const TableOrStruct = Union{Table,Struct}
# Accessors use `getfield` directly, so they work no matter how `getproperty`
# behaves for a concrete subtype.
bytes(x::TableOrStruct) = getfield(x, :bytes)
pos(x::TableOrStruct) = getfield(x, :pos)
# Two tables/structs of the same type are equal when every property of `a`
# compares equal to the same property of `b`.
function ==(a::T, b::T) where {T<:TableOrStruct}
    return all(p -> getproperty(a, p) == getproperty(b, p), propertynames(a))
end
# Build a table/struct from a builder: copies the written bytes and reads the
# Int32 stored at the head as the position.
function (::Type{T})(b::Builder) where {T<:TableOrStruct}
    written = b.bytes[(b.head + 1):end]
    rootpos = get(b, b.head, Int32)
    return T(written, rootpos)
end

# Follow the UOffsetT stored at `offset` to find the root table.
function getrootas(::Type{T}, bytes::Vector{UInt8}, offset) where {T<:Table}
    rootpos = offset + readbuffer(bytes, offset, UOffsetT)
    return init(T, bytes, rootpos)
end

# Wrap `bytes` at position `pos` as a `T`.
function init(::Type{T}, bytes::Vector{UInt8}, pos::Integer) where {T<:TableOrStruct}
    return T(bytes, pos)
end
const TableOrBuilder = Union{Table,Struct,Builder}

# Read a `T` at zero-based position `pos` of the object's byte buffer.
function Base.get(t::TableOrBuilder, pos, ::Type{T}) where {T}
    return readbuffer(bytes(t), pos, T)
end

# Enums are read through their integer base type and then converted back.
function Base.get(t::TableOrBuilder, pos, ::Type{T}) where {T<:Enum}
    raw = get(t, pos, basetype(T))
    return T(raw)
end
"""
`offset` provides access into the Table's vtable.
Deprecated fields are ignored by checking against the vtable's length.
"""
function offset(t::Table, vtableoffset)
    # The table starts with a signed offset back to its vtable.
    vtable = pos(t) - get(t, pos(t), SOffsetT)
    # The first vtable entry is the vtable's own size in bytes; slots at or
    # beyond it are not present and read as zero.
    vtablelen = get(t, vtable, VOffsetT)
    if vtableoffset < vtablelen
        return get(t, vtable + vtableoffset, VOffsetT)
    end
    return VOffsetT(0)
end
"`indirect` retrieves the relative offset stored at `offset`."
function indirect(t::Table, off)
    # The UOffsetT stored at `off` is relative to `off` itself.
    relative = get(t, off, UOffsetT)
    return off + relative
end
# Read the value of type `T` located `o` bytes after the table position.
# `Nothing` fields carry no data.
getvalue(t, o, ::Type{Nothing}) = nothing
getvalue(t, o, ::Type{T}) where {T<:Scalar} = get(t, pos(t) + o, T)
# Enums are read as their integer base type and converted back. This used to
# call `enumtype`, which is not defined in this module; `basetype` is the
# helper that maps an enum type to its storage type.
getvalue(t, o, ::Type{T}) where {T<:Enum} = T(get(t, pos(t) + o, basetype(T)))
# Decode the string whose relative offset is stored at `off`: follow the
# offset, read the UOffsetT length prefix, and copy that many bytes into a
# new `String`.
function Base.String(t::Table, off)
    off += get(t, off, UOffsetT)
    start = off + sizeof(UOffsetT)
    len = get(t, off, UOffsetT)
    buf = bytes(t)
    # Keep the buffer rooted while the raw pointer is in use, as `readbuffer`
    # does for its own pointer loads.
    return GC.@preserve buf unsafe_string(pointer(buf, start + 1), len)
end
# Return a view of the byte vector whose relative offset is stored at `off`.
# The view covers exactly the `len` payload bytes that follow the UOffsetT
# length prefix.
function bytevector(t::Table, off)
    off += get(t, off, UOffsetT)
    start = off + sizeof(UOffsetT)
    len = get(t, off, UOffsetT)
    # One-based inclusive range of `len` elements; the previous upper bound
    # `start + len + 1` included one byte past the vector.
    return view(bytes(t), (start + 1):(start + len))
end
"""
`vectorlen` retrieves the length of the vector whose offset is stored at
`off` in this object.
"""
function vectorlen(t::Table, off)
    # Absolute position of the field holding the vector's relative offset.
    fieldpos = off + pos(t)
    # Follow it; the vector begins with its element count.
    vecpos = fieldpos + get(t, fieldpos, UOffsetT)
    return Int(get(t, vecpos, UOffsetT))
end
"""
`vector` retrieves the start of data of the vector whose offset is stored
at `off` in this object.
"""
function vector(t::Table, off)
    fieldpos = off + pos(t)
    vecpos = fieldpos + get(t, fieldpos, UOffsetT)
    # Skip the length prefix to reach the first element.
    return vecpos + sizeof(UOffsetT)
end
# Vector view over a FlatBuffers vector field. `T` is the element type exposed
# to callers, `S` the representation of one raw entry, `TT` the parent table type.
struct Array{T,S,TT} <: AbstractVector{T}
# Parent table; holding it keeps the underlying bytes referenced.
_tab::TT
# Zero-based position of the first element within the parent's bytes.
pos::Int64
# Raw entries wrapped in place over the parent's bytes (no copy).
data::Vector{S}
end
function Array{T}(t::Table, off) where {T}
a = vector(t, off)
# Raw entry type: tables are stored as UOffsetT references, structs as
# `structsizeof(T)` inline bytes, anything else as the value itself.
S = T <: Table ? UOffsetT : T <: Struct ? NTuple{structsizeof(T),UInt8} : T
ptr = convert(Ptr{S}, pointer(bytes(t), a + 1))
# NOTE(review): `unsafe_wrap` aliases the table's buffer; this assumes the data
# at `a` is suitably aligned for `S` -- confirm.
data = unsafe_wrap(Base.Array, ptr, vectorlen(t, off))
return Array{T,S,typeof(t)}(t, a, data)
end
# Declared without methods here; `Array{T}` and `getindex` call it for
# `T <: Struct`, so methods must be supplied elsewhere.
function structsizeof end
function Base.IndexStyle(::Type{<:Array})
    return Base.IndexLinear()
end

function Base.size(x::Array)
    return size(x.data)
end

# Element access: plain values come straight from the wrapped data; structs
# are materialized in place; tables are reached through their stored offset.
Base.@propagate_inbounds function Base.getindex(A::Array{T,S}, i::Integer) where {T,S}
    T === S && return A.data[i]
    if T <: Struct
        return init(T, bytes(A._tab), A.pos + (i - 1) * structsizeof(T))
    end
    # Remaining case: `T` is a table, stored as a 4-byte relative offset.
    return init(T, bytes(A._tab), indirect(A._tab, A.pos + (i - 1) * 4))
end
# In-place assignment is only possible when elements are stored as plain
# values; struct/table elements cannot be overwritten through this wrapper.
Base.@propagate_inbounds function Base.setindex!(A::Array{T,S}, v, i::Integer) where {T,S}
    T === S || error("setindex! not supported for reference/table types")
    return setindex!(A.data, v, i)
end
# Resolve the position of the union value whose relative offset is stored
# `off` bytes after the table position.
function union(t::Table, off)
    fieldpos = off + pos(t)
    return fieldpos + get(t, fieldpos, UOffsetT)
end

# Same resolution, but re-targets the existing table `t2` at the union value.
function union!(t::Table, t2::Table, off)
    fieldpos = off + pos(t)
    t2.pos = fieldpos + get(t, fieldpos, UOffsetT)
    t2.bytes = bytes(t)
    return nothing
end
"""
GetVOffsetTSlot retrieves the VOffsetT that the given vtable location
points to. If the vtable value is zero, the default value `d`
will be returned.
"""
function getoffsetslot(t::Table, slot, d)
    off = offset(t, slot)
    # A zero vtable entry means the field is absent.
    return off == 0 ? d : off
end
"""
`getslot` retrieves the `T` that the given vtable location
points to. If the vtable value is zero, the default value `d`
will be returned.
"""
function getslot(t::Table, slot, d::T) where {T}
    off = offset(t, slot)
    # Absent field: fall back to the caller's default.
    off == 0 && return d
    # Present field: its value lives `off` bytes after the table position.
    return get(t, pos(t) + off, T)
end
================================================
FILE: src/append.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Arrow.append(io::IO, tbl)
Arrow.append(file::String, tbl)
tbl |> Arrow.append(file)
Append any [Tables.jl](https://github.com/JuliaData/Tables.jl)-compatible `tbl`
to an existing arrow formatted file or IO. The existing arrow data must be in
IPC stream format. Note that appending to the "feather formatted file" is _not_
allowed, as this file format doesn't support appending. That means files written
like `Arrow.write(filename::String, tbl)` _cannot_ be appended to; instead, you
should write like `Arrow.write(filename::String, tbl; file=false)`.
When an IO object is provided to be written on to, it must support seeking. For
example, a file opened in `r+` mode or an `IOBuffer` that is readable, writable
and seekable can be appended to, but not a network stream.
Multiple record batches will be written based on the number of
`Tables.partitions(tbl)` that are provided; by default, this is just
one for a given table, but some table sources support automatic
partitioning. Note you can turn multiple table objects into partitions
by doing `Tables.partitioner([tbl1, tbl2, ...])`, but note that
each table must have the exact same `Tables.Schema`.
By default, `Arrow.append` will use multiple threads to write multiple
record batches simultaneously (e.g. if julia is started with `julia -t 8`
or the `JULIA_NUM_THREADS` environment variable is set).
Supported keyword arguments to `Arrow.append` include:
* `alignment::Int=8`: specify the number of bytes to align buffers to when written in messages; strongly recommended to only use alignment values of 8 or 64 for modern memory cache line optimization
* `colmetadata=nothing`: the metadata that should be written as the table's columns' `custom_metadata` fields; must either be `nothing` or an `AbstractDict` of `column_name::Symbol => column_metadata` where `column_metadata` is an iterable of `<:AbstractString` pairs.
* `dictencode::Bool=false`: whether all columns should use dictionary encoding when being written; to dict encode specific columns, wrap the column/array in `Arrow.DictEncode(col)`
* `dictencodenested::Bool=false`: whether nested data type columns should also dict encode nested arrays/buffers; other language implementations [may not support this](https://arrow.apache.org/docs/status.html)
* `denseunions::Bool=true`: whether Julia `Vector{<:Union}` arrays should be written using the dense union layout; passing `false` will result in the sparse union layout
* `largelists::Bool=false`: causes list column types to be written with Int64 offset arrays; mainly for testing purposes; by default, Int64 offsets will be used only if needed
* `maxdepth::Int=$DEFAULT_MAX_DEPTH`: deepest allowed nested serialization level; this is provided by default to prevent accidental infinite recursion with mutually recursive data structures
* `metadata=Arrow.getmetadata(tbl)`: the metadata that should be written as the table's schema's `custom_metadata` field; must either be `nothing` or an iterable of `<:AbstractString` pairs.
* `ntasks::Int`: number of concurrent threaded tasks to allow while writing input partitions out as arrow record batches; default is no limit; to disable multithreaded writing, pass `ntasks=1`
* `convert::Bool`: whether certain arrow primitive types in the schema of `file` should be converted to Julia defaults for matching them to the schema of `tbl`; by default, `convert=true`.
* `file::Bool`: applicable when an `IO` is provided, whether it is a file; by default `file=false`.
"""
function append end
# Curried form used for piping: `tbl |> Arrow.append(file)`.
function append(io_or_file; kw...)
    return tbl -> append(io_or_file, tbl; kw...)
end

# Path form: open an existing file for read/update, or create it, then
# delegate to the IO method. Returns the path.
function append(file::String, tbl; kwargs...)
    mode = isfile(file) ? "r+" : "w+"
    open(io -> append(io, tbl; file=true, kwargs...), file, mode)
    return file
end
# IO entry point: validates arguments, then either writes a brand-new stream
# (when nothing follows the current position) or appends record batches to the
# existing one. Returns `io`.
# NOTE(review): the `file` keyword is accepted but never read in this method.
function append(
io::IO,
tbl;
metadata=getmetadata(tbl),
colmetadata=nothing,
largelists::Bool=false,
denseunions::Bool=true,
dictencode::Bool=false,
dictencodenested::Bool=false,
alignment::Int=8,
maxdepth::Int=DEFAULT_MAX_DEPTH,
ntasks=Inf,
convert::Bool=true,
file::Bool=false,
)
if ntasks < 1
throw(
ArgumentError(
"ntasks keyword argument must be > 0; pass `ntasks=1` to disable multithreaded writing",
),
)
end
# Measure how many bytes lie between the current position and the end.
startpos = position(io)
seekend(io)
len = position(io) - startpos
seek(io, startpos) # leave the stream position unchanged
if len == 0 # empty file, not initialized, we can just write to it
kwargs = Dict{Symbol,Any}(
:largelists => largelists,
:denseunions => denseunions,
:dictencode => dictencode,
:dictencodenested => dictencodenested,
:alignment => alignment,
:maxdepth => maxdepth,
:metadata => metadata,
:colmetadata => colmetadata,
)
# The default `ntasks=Inf` is not an Integer and is therefore not forwarded.
if isa(ntasks, Integer)
kwargs[:ntasks] = ntasks
end
write(io, tbl; kwargs...)
else
# Existing data: read its schema and compression, and refuse anything that
# is not in stream format.
isstream, arrow_schema, compress = stream_properties(io; convert=convert)
if !isstream
throw(ArgumentError("append is supported only to files in arrow stream format"))
end
if compress isa Symbol && compress !== :lz4 && compress !== :zstd
throw(
ArgumentError(
"unsupported compress keyword argument value: $compress. Valid values include `:lz4` or `:zstd`",
),
)
end
# Hand over to the positional method that does the actual appending.
append(
io,
tbl,
arrow_schema,
compress,
largelists,
denseunions,
dictencode,
dictencodenested,
alignment,
maxdepth,
ntasks,
metadata,
colmetadata,
)
end
return io
end
# Positional worker: appends every partition of `source` to the stream in `io`
# whose schema is `arrow_schema`. Partitions are processed by
# `process_partition` tasks that feed a channel; a single consumer task writes
# the resulting messages to `io`. Returns `io`.
function append(
io::IO,
source,
arrow_schema,
compress,
largelists,
denseunions,
dictencode,
dictencodenested,
alignment,
maxdepth,
ntasks,
meta,
colmeta,
)
seekend(io)
skip(io, -8) # overwrite last 8 bytes of last empty message footer
sch = Ref{Tables.Schema}(arrow_schema)
sync = OrderedSynchronizer()
# Channel capacity is `ntasks`.
msgs = Channel{Message}(ntasks)
dictencodings = Dict{Int64,Any}() # Lockable{DictEncoding}
# build messages
blocks = (Block[], Block[])
# start message writing from channel
threaded = ntasks > 1
# Consumer task: spawned with `@wkspawn` when threaded, `@async` otherwise.
tsk =
threaded ? (@wkspawn for msg in msgs
Base.write(io, msg, blocks, sch, alignment)
end) : (@async for msg in msgs
Base.write(io, msg, blocks, sch, alignment)
end)
# Shared error state handed to every `process_partition` call.
anyerror = Threads.Atomic{Bool}(false)
errorref = Ref{Any}()
@sync for (i, tbl) in enumerate(Tables.partitions(source))
# Stop scheduling new partitions once the error flag has been set.
if anyerror[]
@error "error writing arrow data on partition = $(errorref[][3])" exception =
(errorref[][1], errorref[][2])
error("fatal error writing arrow data")
end
@debug "processing table partition i = $i"
tbl_cols = Tables.columns(tbl)
tbl_schema = Tables.schema(tbl_cols)
# Every partition must match the schema already present in the stream.
if !is_equivalent_schema(arrow_schema, tbl_schema)
throw(ArgumentError("Table schema does not match existing arrow file schema"))
end
if threaded
@wkspawn process_partition(
tbl_cols,
dictencodings,
largelists,
compress,
denseunions,
dictencode,
dictencodenested,
maxdepth,
sync,
msgs,
alignment,
i,
sch,
errorref,
anyerror,
meta,
colmeta,
)
else
@async process_partition(
tbl_cols,
dictencodings,
largelists,
compress,
denseunions,
dictencode,
dictencodenested,
maxdepth,
sync,
msgs,
alignment,
i,
sch,
errorref,
anyerror,
meta,
colmeta,
)
end
end
# Re-check after all partition tasks have completed (`@sync` waits for them).
if anyerror[]
@error "error writing arrow data on partition = $(errorref[][3])" exception =
(errorref[][1], errorref[][2])
error("fatal error writing arrow data")
end
# close our message-writing channel, no further put!-ing is allowed
close(msgs)
# now wait for our message-writing task to finish writing
wait(tsk)
# NOTE(review): presumably this empty message is the end-of-stream marker that
# replaces the one overwritten at the top of this function -- confirm.
Base.write(
io,
Message(UInt8[], nothing, 0, true, false, Meta.Schema),
blocks,
sch,
alignment,
)
return io
end
# Inspect the arrow data starting at the current position of `io` and return
# `(isstream, schema, compression)`. Data that is longer than 24 bytes and both
# starts and ends with the file-format magic bytes is treated as the file
# format, for which `(false, nothing, nothing)` is returned. The position of
# `io` is restored before returning.
function stream_properties(io::IO; convert::Bool=true)
    origin = position(io)
    scratch = similar(FILE_FORMAT_MAGIC_BYTES)
    has_leading_magic = read!(io, scratch) == FILE_FORMAT_MAGIC_BYTES
    seekend(io)
    nbytes = position(io) - origin
    skip(io, -length(FILE_FORMAT_MAGIC_BYTES))
    has_trailing_magic = read!(io, scratch) == FILE_FORMAT_MAGIC_BYTES
    seek(io, origin) # leave the stream position unchanged
    if nbytes > 24 && has_leading_magic && has_trailing_magic
        return false, nothing, nothing
    end
    stream = Stream(io, convert=convert)
    for _ in stream
        # no need to scan further once we get compression information
        stream.compression[] === nothing || break
    end
    seek(io, origin) # leave the stream position unchanged
    return true, Tables.Schema(stream.names, stream.types), stream.compression[]
end
# Decide whether two table schemas are close enough for appending: the column
# names must be identical and every pair of column types must either be equal,
# be vectors with the same element type, or be struct types that match field
# by field (missingness is ignored for the latter two checks).
function is_equivalent_schema(sch1::Tables.Schema, sch2::Tables.Schema)
    sch1.names == sch2.names || return false
    for (t1, t2) in zip(sch1.types, sch2.types)
        t1 == t2 && continue
        tt1 = Base.nonmissingtype(t1)
        tt2 = Base.nonmissingtype(t2)
        if tt1 <: AbstractVector && tt2 <: AbstractVector && eltype(tt1) == eltype(tt2)
            continue
        elseif isstructtype(tt1) && isstructtype(tt2)
            # The outcome of the field-wise comparison was previously discarded,
            # which made any two struct types count as equivalent.
            is_equivalent_type_by_field(tt1, tt2) || return false
        else
            return false
        end
    end
    return true
end
# Structural comparison of two types: they are equivalent when they have the
# same number of fields, the same field names in the same order, and each pair
# of field types is either equal or (for struct types) itself equivalent by
# this same rule.
function is_equivalent_type_by_field(T1, T2)
    n = fieldcount(T1)
    n == fieldcount(T2) || return false
    for i = 1:n
        fieldname(T1, i) == fieldname(T2, i) || return false
        F1 = fieldtype(T1, i)
        F2 = fieldtype(T2, i)
        F1 == F2 && continue
        if isstructtype(F1) && isstructtype(F2)
            # Recurse into the differing *field* types. The previous code
            # recursed on `(T1, T2)` again, which never terminated, and
            # ignored a negative result.
            is_equivalent_type_by_field(F1, F2) || return false
        else
            return false
        end
    end
    return true
end
================================================
FILE: src/arraytypes/arraytypes.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Arrow.ArrowVector
An abstract type that subtypes `AbstractVector`. Each specific arrow array type
subtypes `ArrowVector`. See [`BoolVector`](@ref), [`Primitive`](@ref), [`List`](@ref),
[`Map`](@ref), [`FixedSizeList`](@ref), [`Struct`](@ref), [`DenseUnion`](@ref),
[`SparseUnion`](@ref), and [`DictEncoded`](@ref) for more details.
"""
abstract type ArrowVector{T} <: AbstractVector{T} end
# All arrow vectors use linear indexing.
function Base.IndexStyle(::Type{A}) where {A<:ArrowVector}
    return Base.IndexLinear()
end

# `similar` on an arrow vector type yields a plain, uninitialized `Vector`.
function Base.similar(::Type{A}, dims::Dims) where {T,A<:ArrowVector{T}}
    return Vector{T}(undef, dims)
end

# Default accessors: read the `validity` and `metadata` fields; the null count
# is taken from the validity bitmap's `nc` field.
function validitybitmap(x::ArrowVector)
    return x.validity
end

function nullcount(x::ArrowVector)
    return validitybitmap(x).nc
end

function getmetadata(x::ArrowVector)
    return x.metadata
end

# Element removal is rejected for every arrow vector type.
function Base.deleteat!(x::T, inds) where {T<:ArrowVector}
    msg = "`$T` does not support `deleteat!`; arrow data is by nature immutable"
    throw(ArgumentError(msg))
end
# Convert a top-level column `x` to an arrow vector and optionally compress it.
# `compression` may be `nothing`, a compressor object, or `:lz4` / `:zstd`.
function toarrowvector(
x,
i=1,
de=Dict{Int64,Any}(),
ded=DictEncoding[],
meta=getmetadata(x);
compression::Union{Nothing,Symbol,LZ4FrameCompressor,ZstdCompressor}=nothing,
kw...,
)
@debug "converting top-level column to arrow format: col = $(typeof(x)), compression = $compression, kw = $(values(kw))"
@debug x
# The two zeros are the `nl` and `fi` arguments of `arrowvector`.
A = arrowvector(x, i, 0, 0, de, ded, meta; compression=compression, kw...)
# A compressor object is used directly; for a symbol, a compressor is obtained
# from `lz4_frame_compressor()` / `zstd_compressor()` and its lock is held
# while compressing. Any other value leaves `A` uncompressed.
if compression isa LZ4FrameCompressor
A = compress(Meta.CompressionType.LZ4_FRAME, compression, A)
elseif compression isa ZstdCompressor
A = compress(Meta.CompressionType.ZSTD, compression, A)
elseif compression isa Symbol && compression == :lz4
comp = lz4_frame_compressor()
A = Base.@lock comp begin
compress(Meta.CompressionType.LZ4_FRAME, comp[], A)
end
elseif compression isa Symbol && compression == :zstd
comp = zstd_compressor()
A = Base.@lock comp begin
compress(Meta.CompressionType.ZSTD, comp[], A)
end
end
@debug "converted top-level column to arrow format: $(typeof(A))"
@debug A
return A
end
# Generic conversion step: enforces the nesting limit, wraps `x` for dictionary
# encoding or in `ToArrow` as appropriate, attaches extension-type metadata, and
# dispatches on the wrapped element type.
function arrowvector(
x,
i,
nl,
fi,
de,
ded,
meta;
dictencoding::Bool=false,
dictencode::Bool=false,
maxdepth::Int=DEFAULT_MAX_DEPTH,
kw...,
)
if nl > maxdepth
error(
"reached nested serialization level ($nl) deeper than provided max depth argument ($(maxdepth)); to increase allowed nesting level, pass `maxdepth=X`",
)
end
# Element type before any wrapping; used below for the extension-name lookup.
T = maybemissing(eltype(x))
# Wrap for dictionary encoding when requested, or when `DataAPI.refarray(x)`
# returns something other than `x` itself (presumably a ref-backed/pooled
# array -- confirm).
if !(x isa DictEncode) && !dictencoding && (dictencode || DataAPI.refarray(x) !== x)
x = DictEncode(x, dictencodeid(i, nl, fi))
elseif x isa DictEncoded
# NOTE(review): `maxdepth` is not forwarded on this path, unlike the final
# call below.
return arrowvector(
DictEncodeType,
x,
i,
nl,
fi,
de,
ded,
meta;
dictencode=dictencode,
kw...,
)
elseif !(x isa DictEncode)
x = ToArrow(x)
end
# Element type after wrapping; this is what the next method dispatches on.
S = maybemissing(eltype(x))
if ArrowTypes.hasarrowname(T)
meta = _arrowtypemeta(
_normalizemeta(meta),
String(ArrowTypes.arrowname(T)),
String(ArrowTypes.arrowmetadata(T)),
)
end
return arrowvector(
S,
x,
i,
nl,
fi,
de,
ded,
meta;
dictencode=dictencode,
maxdepth=maxdepth,
kw...,
)
end
# Table-level metadata: `nothing` passes through; anything else becomes an
# immutable dict with `String` keys and values.
function _normalizemeta(::Nothing)
    return nothing
end

function _normalizemeta(meta)
    pairs = (String(k) => String(v) for (k, v) in meta)
    return toidict(pairs)
end

# Column-level metadata: `nothing` passes through; otherwise each column name
# becomes a `Symbol` key mapped to its own String => String immutable dict.
function _normalizecolmeta(::Nothing)
    return nothing
end

function _normalizecolmeta(colmeta)
    percolumn = (
        Symbol(k) => toidict(String(v1) => String(v2) for (v1, v2) in v) for
        (k, v) in colmeta
    )
    return toidict(percolumn)
end
# Build metadata carrying the arrow extension name `n` and extension metadata `m`.
# Without existing metadata, the result holds only the two extension entries.
function _arrowtypemeta(::Nothing, n, m)
    extension = ("ARROW:extension:name" => n, "ARROW:extension:metadata" => m)
    return toidict(extension)
end
# With existing metadata, the two extension entries are added to (or overwrite
# entries in) a mutable copy, which is then converted back with `toidict`.
function _arrowtypemeta(meta, n, m)
    merged = Dict(meta)
    merged["ARROW:extension:name"] = n
    merged["ARROW:extension:metadata"] = m
    return toidict(merged)
end
# now we check for ArrowType conversions and dispatch on ArrowKind
function arrowvector(::Type{S}, x, i, nl, fi, de, ded, meta; kw...) where {S}
    normalized = _normalizemeta(meta)
    kind = ArrowKind(S)
    return arrowvector(kind, x, i, nl, fi, de, ded, normalized; kw...)
end
# Arrow vector that stores no value buffer: it only holds a `MissingVector`
# (which carries the length) and optional metadata.
struct NullVector{T} <: ArrowVector{T}
data::MissingVector # length-carrying placeholder; elements are read through `ArrowTypes.fromarrow(T, ...)`
metadata::Union{Nothing,Base.ImmutableDict{String,String}} # optional metadata, or `nothing`
end
# A NullVector is as long as its placeholder `MissingVector`.
function Base.size(v::NullVector)
    return (length(v.data),)
end
# Indexing reads the placeholder element and converts it with `ArrowTypes.fromarrow`.
function Base.getindex(v::NullVector{T}, i::Int) where {T}
    return ArrowTypes.fromarrow(T, v.data[i])
end
# NullKind columns become a `NullVector` of the same length as `x`; only the
# length of `x` and the (optional) metadata are retained.
function arrowvector(::NullKind, x, i, nl, fi, de, ded, meta; kw...)
    placeholder = MissingVector(length(x))
    colmeta = meta === nothing ? nothing : toidict(meta)
    return NullVector{eltype(x)}(placeholder, colmeta)
end
# A NullVector has no buffers to compress: the wrapper carries an empty buffer
# list, and both the length and the null count equal `length(v)`.
function compress(Z::Meta.CompressionType.T, comp, v::NullVector)
    n = length(v)
    return Compressed{Z,NullVector}(v, CompressedBuffer[], n, n, Compressed[])
end
# Register the field node for a NullVector. The null count equals the length,
# no buffers are added, and `bufferoffset` is returned unchanged.
function makenodesbuffers!(
    col::NullVector,
    fieldnodes,
    fieldbuffers,
    bufferoffset,
    alignment,
)
    n = length(col)
    push!(fieldnodes, FieldNode(n, n))
    @debug "made field node: nodeidx = $(length(fieldnodes)), col = $(typeof(col)), len = $(fieldnodes[end].length), nc = $(fieldnodes[end].null_count)"
    return bufferoffset
end
# A NullVector has no buffers, so nothing is written.
writebuffer(io, col::NullVector, alignment) = nothing
"""
Arrow.ValidityBitmap
A bit-packed array type where each bit corresponds to an element in an
[`ArrowVector`](@ref), indicating whether that element is "valid" (bit == 1),
or not (bit == 0). Used to indicate element missingness (whether it's null).
If the null count of an array is zero, the `ValidityBitmap` will be "empty"
and all elements are treated as "valid"/non-null.
"""
struct ValidityBitmap <: ArrowVector{Bool}
bytes::Vector{UInt8} # arrow memory blob holding the bitmap (the constructor in this file leaves it empty when there are no nulls)
pos::Int # starting byte of validity bitmap (index into `bytes`)
ℓ::Int # number of _elements_ (not bytes!) in bitmap (because bitpacking)
nc::Int # null count; when 0, indexing reports every element as valid
end
# Size is the element count (not the byte count) of the bitmap.
function Base.size(p::ValidityBitmap)
    return (p.ℓ,)
end
# Number of null (invalid) elements recorded for the bitmap.
function nullcount(x::ValidityBitmap)
    return x.nc
end
# Build a validity bitmap by scanning `x` for `missing` values.
#
# * If the element type of `x` cannot hold `missing`, no scan is done and an
#   empty bitmap with null count 0 is returned.
# * Otherwise bits are packed 8 per byte, starting from all-ones (valid) and
#   clearing the bit of every `missing` element.
# * If the scan finds no `missing`, the packed bytes are discarded and an empty
#   bitmap (length 0, null count 0) is returned.
function ValidityBitmap(x)
    T = eltype(x)
    if !(T >: Missing)
        return ValidityBitmap(UInt8[], 1, length(x), 0)
    end
    len = length(x)
    blen = cld(len, 8)
    bytes = Vector{UInt8}(undef, blen)
    nc = 0
    b = 0xff # current byte; all bits start as "valid"
    j = k = 1 # j: bit position within the byte (1-8), k: byte index
    for y in x
        if y === missing
            nc += 1
            b = setbit(b, false, j)
        end
        j += 1
        if j == 9
            # byte is full: store it and start the next one
            @inbounds bytes[k] = b
            b = 0xff
            j = 1
            k += 1
        end
    end
    if j > 1
        # store the trailing, partially filled byte
        bytes[k] = b
    end
    return ValidityBitmap(nc == 0 ? UInt8[] : bytes, 1, nc == 0 ? 0 : len, nc)
end
@propagate_inbounds function Base.getindex(p::ValidityBitmap, i::Integer)
    # no boundscheck here: the parent array is expected to have done it.
    # A null count of 0 means either the parent has no nulls or the parent is
    # empty; in both cases every element is reported as valid.
    if p.nc == 0
        return true
    end
    # zero-based byte offset and zero-based bit offset of element `i`
    byteoffset, bitoffset = divrem(i - 1, 8)
    @inbounds byte = p.bytes[p.pos + byteoffset]
    # test the individual bit (bit positions are 1-based for `getbit`)
    return getbit(byte, bitoffset + 1)
end
# Set the validity bit of element `i` to `v` (converted to `Bool`).
# Marking an element invalid in a zero-length bitmap throws a `BoundsError`.
@propagate_inbounds function Base.setindex!(p::ValidityBitmap, v, i::Integer)
    flag = convert(Bool, v)
    if p.ℓ == 0 && !flag
        throw(BoundsError(p, i))
    end
    byteidx, bitidx = fldmod1(i, 8)
    slot = p.pos + byteidx - 1
    @inbounds p.bytes[slot] = setbit(p.bytes[slot], flag, bitidx)
    return v
end
# Write the validity bitmap of `col` to `io`, followed by zero padding up to
# `alignment`. Returns the number of bytes written; nothing is written (and 0
# is returned) when the null count is 0.
function writebitmap(io, col::ArrowVector, alignment)
    v = col.validity
    @debug "writing validity bitmap: nc = $(v.nc), n = $(cld(v.ℓ, 8))"
    if v.nc == 0
        return 0
    end
    lastbyte = v.pos + cld(v.ℓ, 8) - 1
    n = Base.write(io, view(v.bytes, (v.pos):lastbyte))
    return n + writezeros(io, paddinglength(n, alignment))
end
include("compressed.jl")
include("primitive.jl")
include("bool.jl")
include("list.jl")
include("fixedsizelist.jl")
include("map.jl")
include("struct.jl")
include("unions.jl")
include("dictencoding.jl")
include("views.jl")
================================================
FILE: src/arraytypes/bool.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Arrow.BoolVector
A bit-packed array type, similar to [`ValidityBitmap`](@ref), but which
holds boolean values, `true` or `false`.
"""
struct BoolVector{T} <: ArrowVector{T}
arrow::Vector{UInt8} # need to hold a reference to arrow memory blob
pos::Int # index in `arrow` of the first byte of bit-packed values
validity::ValidityBitmap # element validity; consulted on read only when `T` admits `Missing`
ℓ::Int64 # number of elements (not bytes)
metadata::Union{Nothing,Base.ImmutableDict{String,String}} # optional metadata, or `nothing`
end
# Size is the element count (not the byte count).
function Base.size(p::BoolVector)
    return (p.ℓ,)
end
# Read element `i`: `missing` when the element type admits it and the validity
# bit is unset; otherwise the unpacked bit, converted with `ArrowTypes.fromarrow`.
@propagate_inbounds function Base.getindex(p::BoolVector{T}, i::Integer) where {T}
    @boundscheck checkbounds(p, i)
    if T >: Missing && !(@inbounds p.validity[i])
        return missing
    end
    byteidx, bitidx = fldmod1(i, 8)
    @inbounds byte = p.arrow[p.pos + byteidx - 1]
    # extract the individual bit from its byte
    return ArrowTypes.fromarrow(T, getbit(byte, bitidx))
end
# Store `v` (converted to `Bool`) into the bit for element `i`.
# Only the value bit is written; the validity bitmap is not touched here.
@propagate_inbounds function Base.setindex!(p::BoolVector, v, i::Integer)
    @boundscheck checkbounds(p, i)
    flag = convert(Bool, v)
    byteidx, bitidx = fldmod1(i, 8)
    slot = p.pos + byteidx - 1
    @inbounds p.arrow[slot] = setbit(p.arrow[slot], flag, bitidx)
    return v
end
# An input that is already a `BoolVector` is returned as-is.
function arrowvector(::BoolKind, x::BoolVector, i, nl, fi, de, ded, meta; kw...)
    return x
end
# Bit-pack a boolean column into a `BoolVector`. Bits start as 1 and are cleared
# only for elements that are exactly `false` (so `missing` elements keep a 1 bit;
# their missingness is recorded in the validity bitmap instead).
function arrowvector(::BoolKind, x, i, nl, fi, de, ded, meta; kw...)
    validity = ValidityBitmap(x)
    n = length(x)
    packed = Vector{UInt8}(undef, cld(n, 8))
    current = 0xff
    bitpos = 1
    byteidx = 1
    for elem in x
        if elem === false
            current = setbit(current, false, bitpos)
        end
        if bitpos == 8
            # byte complete: store it and begin a fresh one
            @inbounds packed[byteidx] = current
            current = 0xff
            bitpos = 1
            byteidx += 1
        else
            bitpos += 1
        end
    end
    if bitpos > 1
        # trailing, partially filled byte
        packed[byteidx] = current
    end
    return BoolVector{eltype(x)}(packed, 1, validity, n, meta)
end
# Compress a BoolVector: one compressed buffer for the validity bitmap and one
# for the bit-packed value bytes, in that order.
function compress(Z::Meta.CompressionType.T, comp, p::P) where {P<:BoolVector}
    n = length(p)
    nulls = nullcount(p)
    packedvalidity = compress(Z, comp, p.validity)
    lastbyte = p.pos + cld(p.ℓ, 8) - 1
    packeddata = compress(Z, comp, view(p.arrow, (p.pos):lastbyte))
    return Compressed{Z,P}(p, [packedvalidity, packeddata], n, nulls, Compressed[])
end
# Register the field node and the two buffers (validity bitmap, bit-packed
# values) of a BoolVector, and return the buffer offset following them.
function makenodesbuffers!(
    col::BoolVector,
    fieldnodes,
    fieldbuffers,
    bufferoffset,
    alignment,
)
    n = length(col)
    nulls = nullcount(col)
    push!(fieldnodes, FieldNode(n, nulls))
    @debug "made field node: nodeidx = $(length(fieldnodes)), col = $(typeof(col)), len = $(fieldnodes[end].length), nc = $(fieldnodes[end].null_count)"
    # validity bitmap: zero-length when there are no nulls
    validitybytes = nulls == 0 ? 0 : bitpackedbytes(n, alignment)
    push!(fieldbuffers, Buffer(bufferoffset, validitybytes))
    @debug "made field buffer: bufferidx = $(length(fieldbuffers)), offset = $(fieldbuffers[end].offset), len = $(fieldbuffers[end].length), padded = $(padding(fieldbuffers[end].length, alignment))"
    # the bit-packed value buffer follows the validity buffer
    dataoffset = bufferoffset + validitybytes
    databytes = bitpackedbytes(n, alignment)
    push!(fieldbuffers, Buffer(dataoffset, databytes))
    @debug "made field buffer: bufferidx = $(length(fieldbuffers)), offset = $(fieldbuffers[end].offset), len = $(fieldbuffers[end].length), padded = $(padding(fieldbuffers[end].length, alignment))"
    return dataoffset + databytes
end
# Write a BoolVector: validity bitmap first, then the bit-packed value bytes
# followed by zero padding. Returns the byte count of the value bytes plus padding.
function writebuffer(io, col::BoolVector, alignment)
    @debug "writebuffer: col = $(typeof(col))"
    @debug col
    writebitmap(io, col, alignment)
    lastbyte = col.pos + cld(col.ℓ, 8) - 1
    written = Base.write(io, view(col.arrow, (col.pos):lastbyte))
    padded = writezeros(io, paddinglength(written, alignment))
    return written + padded
end
================================================
FILE: src/arraytypes/compressed.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# A single compressed buffer: the compressed bytes together with the byte
# length of the data before compression.
struct CompressedBuffer
data::Vector{UInt8} # compressed bytes; empty buffers are written as zero bytes by `writearray`
uncompressedlength::Int64 # byte length of the original, uncompressed data
end
"""
Arrow.Compressed
Represents the compressed version of an [`ArrowVector`](@ref).
Holds a reference to the original column. May have `Compressed`
children for nested array types.
"""
struct Compressed{Z,A}
# `Z` is the compression type (see `compressiontype`); `A` is the type of the wrapped column
data::A # the original (uncompressed) column
buffers::Vector{CompressedBuffer} # compressed buffers belonging to this column itself
len::Int64 # element count, used for the field node
nullcount::Int64 # null count, used for the field node
children::Vector{Compressed} # compressed children, for nested array types
end
# Element count recorded at compression time.
function Base.length(c::Compressed)
    return c.len
end
# Element type is that of the wrapped column type `A`.
function Base.eltype(::Type{C}) where {Z,A,C<:Compressed{Z,A}}
    return eltype(A)
end
# Metadata is delegated to the wrapped column.
function getmetadata(x::Compressed)
    return getmetadata(x.data)
end
# The compression type is the first type parameter.
function compressiontype(c::Compressed{Z}) where {Z}
    return Z
end
# Compress the raw memory of `x` with `comp`. The array is viewed as bytes
# without copying (hence `GC.@preserve`), and the uncompressed byte length is
# recorded alongside the compressed bytes.
function compress(Z::Meta.CompressionType.T, comp, x::Array)
    GC.@preserve x begin
        nbytes = sizeof(x)
        rawbytes = unsafe_wrap(Array, convert(Ptr{UInt8}, pointer(x)), nbytes)
        compressed = transcode(comp, rawbytes)
        return CompressedBuffer(compressed, length(rawbytes))
    end
end
# Fallback: materialize `x` as an `Array` and compress that.
function compress(Z::Meta.CompressionType.T, comp, x)
    return compress(Z, comp, convert(Array, x))
end
# Validity bitmaps with no nulls compress to an empty buffer; otherwise only
# the bytes that cover the bitmap's elements are compressed.
function compress(Z::Meta.CompressionType.T, comp, v::ValidityBitmap)
    if v.nc == 0
        return CompressedBuffer(UInt8[], 0)
    end
    lastbyte = v.pos + cld(v.ℓ, 8) - 1
    return compress(Z, comp, view(v.bytes, (v.pos):lastbyte))
end
# Register the field node and buffers of a compressed column, then recurse
# into its children. Returns the buffer offset after everything registered.
function makenodesbuffers!(
    col::Compressed,
    fieldnodes,
    fieldbuffers,
    bufferoffset,
    alignment,
)
    push!(fieldnodes, FieldNode(col.len, col.nullcount))
    @debug "made field node: nodeidx = $(length(fieldnodes)), col = $(typeof(col)), len = $(fieldnodes[end].length), nc = $(fieldnodes[end].null_count)"
    for buf in col.buffers
        payload = length(buf.data)
        # non-empty buffers are written with an 8-byte uncompressed-length prefix
        blen = payload == 0 ? 0 : 8 + payload
        push!(fieldbuffers, Buffer(bufferoffset, blen))
        @debug "made field buffer: bufferidx = $(length(fieldbuffers)), offset = $(fieldbuffers[end].offset), len = $(fieldbuffers[end].length), padded = $(padding(fieldbuffers[end].length, alignment))"
        bufferoffset += padding(blen, alignment)
    end
    for kid in col.children
        bufferoffset =
            makenodesbuffers!(kid, fieldnodes, fieldbuffers, bufferoffset, alignment)
    end
    return bufferoffset
end
# Write a compressed buffer as its uncompressed length followed by the
# compressed bytes. Empty buffers write nothing. Returns the bytes written.
function writearray(io, b::CompressedBuffer)
    isempty(b.data) && return 0
    prefixbytes = Base.write(io, b.uncompressedlength)
    @debug "writing compressed buffer: uncompressedlength = $(b.uncompressedlength), n = $(length(b.data))"
    @debug b.data
    return prefixbytes + Base.write(io, b.data)
end
# Write every compressed buffer of `col` (each padded to `alignment`), then
# recurse into the compressed children. Returns `nothing`.
function writebuffer(io, col::Compressed, alignment)
    @debug "writebuffer: col = $(typeof(col))"
    @debug col
    for buf in col.buffers
        written = writearray(io, buf)
        writezeros(io, paddinglength(written, alignment))
    end
    for kid in col.children
        writebuffer(io, kid, alignment)
    end
    return nothing
end
================================================
FILE: src/arraytypes/dictencoding.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Arrow.DictEncoding
Represents the "pool" of possible values for a [`DictEncoded`](@ref)
array type. Whether the order of values is significant can be checked
by looking at the `isOrdered` boolean field.
The `S` type parameter, while not tied directly to any field, is the
signed integer "index type" of the parent DictEncoded. We keep track
of this in the DictEncoding in order to validate the length of the pool
doesn't exceed the index type limit. The general workflow of writing arrow
data means the initial schema will typically be based off the data in the
first record batch, and subsequent record batches need to match the same
schema exactly. For example, if a non-first record batch dict encoded column
were to cause a DictEncoding pool to overflow on unique values, a fatal error
should be thrown.
"""
mutable struct DictEncoding{T,S,A} <: ArrowVector{T}
id::Int64 # dictionary id; the writers in this file use it as the key into the `de` table
data::A # the pool of values; elements are read through `ArrowTypes.fromarrow(T, ...)`
isOrdered::Bool # whether the order of the pool values is significant
metadata::Union{Nothing,Base.ImmutableDict{String,String}} # optional metadata, or `nothing`
end
# The index type `S` of a DictEncoding, from either the type or an instance.
function indextype(::Type{DictEncoding{T,S,A}}) where {T,S,A}
    return S
end
function indextype(::T) where {T<:DictEncoding}
    return indextype(T)
end
# Size is that of the underlying pool.
function Base.size(d::DictEncoding)
    return size(d.data)
end
# Read pool entry `i`, converted with `ArrowTypes.fromarrow`.
@propagate_inbounds function Base.getindex(d::DictEncoding{T}, i::Integer) where {T}
    @boundscheck checkbounds(d, i)
    @inbounds value = ArrowTypes.fromarrow(T, d.data[i])
    return value
end
# Marker element type: a column whose eltype is `DictEncodeType{T}` is to be
# dict encoded when written to the arrow format.
struct DictEncodeType{T} end
# Recover the wrapped element type `T`.
function getT(::Type{DictEncodeType{T}}) where {T}
    return T
end
"""
Arrow.DictEncode(::AbstractVector, id::Integer=-1)
Signals that a column/array should be dictionary encoded when serialized
to the arrow streaming/file format. An optional `id` number may be provided
to signal that multiple columns should use the same pool when being
dictionary encoded.
"""
struct DictEncode{T,A} <: AbstractVector{DictEncodeType{T}}
    id::Int64 # dictionary id; -1 is the default when none is given
    data::A # the wrapped column
end
# Wrap `x`, recording its element type; `id` defaults to -1.
function DictEncode(x::A, id=-1) where {A}
    return DictEncode{eltype(A),A}(id, x)
end
# Array interface: everything is delegated to the wrapped data.
Base.IndexStyle(::Type{<:DictEncode}) = Base.IndexLinear()
function Base.size(x::DictEncode)
    return (length(x.data),)
end
function Base.iterate(x::DictEncode, st...)
    return iterate(x.data, st...)
end
function Base.getindex(x::DictEncode, i::Int)
    return x.data[i]
end
# Columns of `DictEncodeType` elements dispatch to the dict-encoded kind.
ArrowTypes.ArrowKind(::Type{<:DictEncodeType}) = DictEncodedKind()
# Copying re-wraps the same data and id (the wrapped data itself is not copied).
function Base.copy(x::DictEncode)
    return DictEncode(x.data, x.id)
end
"""
Arrow.DictEncoded
A dictionary encoded array type (similar to a `PooledArray`). Behaves just
like a normal array in most respects; internally, possible values are stored
in the `encoding::DictEncoding` field, while the `indices::Vector{<:Integer}`
field holds the "codes" of each element for indexing into the encoding pool.
Any column/array can be dict encoded when serializing to the arrow format
either by passing the `dictencode=true` keyword argument to [`Arrow.write`](@ref)
(which causes _all_ columns to be dict encoded), or wrapping individual columns/
arrays in [`Arrow.DictEncode(x)`](@ref).
"""
struct DictEncoded{T,S,A} <: ArrowVector{T}
arrow::Vector{UInt8} # need to hold a reference to arrow memory blob
validity::ValidityBitmap # element validity; invalid elements read as `missing`
indices::Vector{S} # zero-based codes into the encoding pool (`getindex` adds 1)
encoding::DictEncoding{T,S,A} # the pool of possible values
metadata::Union{Nothing,Base.ImmutableDict{String,String}} # optional metadata, or `nothing`
end
# Convenience constructor: infer the type parameters from the index vector
# and the encoding.
function DictEncoded(
    b::Vector{UInt8},
    v::ValidityBitmap,
    inds::Vector{S},
    encoding::DictEncoding{T,S,A},
    meta,
) where {S,T,A}
    return DictEncoded{T,S,A}(b, v, inds, encoding, meta)
end
# Size is that of the index vector.
function Base.size(d::DictEncoded)
    return size(d.indices)
end
# `true` for DictEncoded columns and their compressed wrappers; `false` otherwise.
isdictencoded(::DictEncoded) = true
isdictencoded(::Any) = false
isdictencoded(::Compressed{Z,A}) where {Z,A<:DictEncoded} = true
# Smallest signed integer type (Int8, Int16, Int32, Int64) whose `typemax` is
# at least `n`; falls back to Int64 when none is large enough.
function signedtype(n::Integer)
    for T in (Int8, Int16, Int32)
        n ≤ typemax(T) && return T
    end
    return Int64
end
# Signed counterpart of an unsigned integer type of the same width.
function signedtype(::Type{UInt8})
    return Int8
end
function signedtype(::Type{UInt16})
    return Int16
end
function signedtype(::Type{UInt32})
    return Int32
end
function signedtype(::Type{UInt64})
    return Int64
end
# Signed types map to themselves.
function signedtype(::Type{T}) where {T<:Signed}
    return T
end
# Index (code) type of a DictEncoded column, or of its compressed wrapper.
function indtype(::DictEncoded{T,S}) where {T,S}
    return S
end
function indtype(c::Compressed{Z,A}) where {Z,A<:DictEncoded}
    return indtype(c.data)
end
# Pack (nesting level, field id, column index) into one Int64 dictionary id:
# the nesting level is shifted left 48 bits, the field id 32 bits, and the
# column index occupies the low bits.
function dictencodeid(colidx, nestedlevel, fieldid)
    levelbits = Int64(nestedlevel) << 48
    fieldbits = Int64(fieldid) << 32
    return levelbits | fieldbits | Int64(colidx)
end
# Dictionary id of a DictEncoded column, or of its compressed wrapper.
function getid(d::DictEncoded)
    return d.encoding.id
end
function getid(c::Compressed{Z,A}) where {Z,A<:DictEncoded}
    return c.data.encoding.id
end
# Register an already dict-encoded column `x` with the encoding table `de`.
#
# * If no encoding is registered under `x`'s dictionary id, `x.encoding` is
#   stored (wrapped in a `Lockable`).
# * Otherwise, values present in `x.encoding` but absent from the registered
#   encoding are converted with `arrowvector`, pushed to `ded` as a delta
#   `DictEncoding`, and appended to the registered pool.
#
# `x` itself is returned unchanged in both cases. An error is raised when the
# combined pool would exceed `typemax` of the registered encoding's index type.
function arrowvector(
::DictEncodedKind,
x::DictEncoded,
i,
nl,
fi,
de,
ded,
meta;
dictencode::Bool=false,
dictencodenested::Bool=false,
kw...,
)
id = x.encoding.id
# XXX This is a race condition if two workers hit this block at the same time, then they'll create
# distinct locks
if !haskey(de, id)
de[id] = Lockable(x.encoding)
else
encodinglockable = de[id]
Base.@lock encodinglockable begin
encoding = encodinglockable.value
# in this case, we just need to check if any values in our local pool need to be delta dictionary serialized
deltas = setdiff(x.encoding, encoding)
if !isempty(deltas)
ET = indextype(encoding)
if length(deltas) + length(encoding) > typemax(ET)
error(
"fatal error serializing dict encoded column with ref index type of $ET; subsequent record batch unique values resulted in $(length(deltas) + length(encoding)) unique values, which exceeds possible index values in $ET",
)
end
# convert the new values; `dictencoding=true` marks them as a dictionary pool
data = arrowvector(
deltas,
i,
nl,
fi,
de,
ded,
nothing;
dictencode=dictencodenested,
dictencodenested=dictencodenested,
dictencoding=true,
kw...,
)
# record the delta under the same dictionary id
push!(
ded,
DictEncoding{eltype(data),ET,typeof(data)}(
id,
data,
false,
getmetadata(data),
),
)
# grow the registered pool: append in place if it is already chained,
# otherwise replace the registered encoding with a chained pool
if typeof(encoding.data) <: ChainedVector
append!(encoding.data, data)
else
data2 = ChainedVector([encoding.data, data])
encoding = DictEncoding{eltype(data2),ET,typeof(data2)}(
id,
data2,
false,
getmetadata(encoding),
)
de[id] = Lockable(encoding)
end
end
end
end
return x
end
# Dict-encode a column wrapped in `DictEncode` and return a `DictEncoded`.
#
# The dictionary id is `x.id`, or one derived with `dictencodeid(i, nl, fi)`
# when `x.id == -1`.
#
# * First use of an id: build the pool and zero-based indices (pooling the data
#   here unless it already exposes a ref array and ref pool through `DataAPI`),
#   convert the pool with `arrowvector`, and register the new encoding in `de`.
# * Later uses: compute indices against the registered pool; values not yet in
#   the pool are collected as deltas, pushed to `ded` as a delta `DictEncoding`,
#   and appended to the registered pool.
#
# An error is raised when the combined pool would exceed `typemax` of the
# encoding's index type. Metadata of the encoding is merged into `meta`.
function arrowvector(
::DictEncodedKind,
x,
i,
nl,
fi,
de,
ded,
meta;
dictencode::Bool=false,
dictencodenested::Bool=false,
kw...,
)
@assert x isa DictEncode
id = x.id == -1 ? dictencodeid(i, nl, fi) : x.id
# unwrap: from here on `x` is the underlying data
x = x.data
len = length(x)
validity = ValidityBitmap(x)
# XXX This is a race condition if two workers hit this block at the same time, then they'll create
# distinct locks
if !haskey(de, id)
# dict encoding doesn't exist yet, so create for 1st time
if DataAPI.refarray(x) === x || DataAPI.refpool(x) === nothing
# need to encode ourselves
x = PooledArray(x; signed=true, compress=true)
inds = refa = DataAPI.refarray(x)
pool = DataAPI.refpool(x)
else
# input already exposes refs and a pool: copy the refs into a signed index vector
pool = DataAPI.refpool(x)
refa = DataAPI.refarray(x)
inds = copyto!(similar(Vector{signedtype(length(pool))}, length(refa)), refa)
end
# adjust to "offset" instead of index
inds .-= firstindex(refa)
# convert the pool itself; `dictencoding=true` marks it as a dictionary pool
data = arrowvector(
pool,
i,
nl,
fi,
de,
ded,
nothing;
dictencode=dictencodenested,
dictencodenested=dictencodenested,
dictencoding=true,
kw...,
)
encoding = DictEncoding{eltype(data),eltype(inds),typeof(data)}(
id,
data,
false,
getmetadata(data),
)
de[id] = Lockable(encoding)
else
# encoding already exists
# compute inds based on it
# if value doesn't exist in encoding, push! it
# also add to deltas updates
encodinglockable = de[id]
Base.@lock encodinglockable begin
encoding = encodinglockable.value
len = length(x)
ET = indextype(encoding)
# value => zero-based code lookup built from the registered pool
pool = Dict{Union{eltype(encoding),eltype(x)},ET}(
a => (b - 1) for (b, a) in enumerate(encoding)
)
deltas = eltype(x)[]
inds = Vector{ET}(undef, len)
for (j, val) in enumerate(x)
# for an unseen value the closure runs before the entry is stored, so
# `length(pool)` is the next free zero-based code
@inbounds inds[j] = get!(pool, val) do
push!(deltas, val)
return length(pool)
end
end
if !isempty(deltas)
if length(deltas) + length(encoding) > typemax(ET)
error(
"fatal error serializing dict encoded column with ref index type of $ET; subsequent record batch unique values resulted in $(length(deltas) + length(encoding)) unique values, which exceeds possible index values in $ET",
)
end
data = arrowvector(
deltas,
i,
nl,
fi,
de,
ded,
nothing;
dictencode=dictencodenested,
dictencodenested=dictencodenested,
dictencoding=true,
kw...,
)
# record the delta under the same dictionary id
push!(
ded,
DictEncoding{eltype(data),ET,typeof(data)}(
id,
data,
false,
getmetadata(data),
),
)
# grow the registered pool: append in place if it is already chained,
# otherwise replace the registered encoding with a chained pool
if typeof(encoding.data) <: ChainedVector
append!(encoding.data, data)
else
data2 = ChainedVector([encoding.data, data])
encoding = DictEncoding{eltype(data2),ET,typeof(data2)}(
id,
data2,
false,
getmetadata(encoding),
)
de[id] = Lockable(encoding)
end
end
end
end
# combine column metadata with the encoding's metadata (encoding entries win on key clashes)
if meta !== nothing && getmetadata(encoding) !== nothing
meta = toidict(merge!(Dict(meta), Dict(getmetadata(encoding))))
elseif getmetadata(encoding) !== nothing
meta = getmetadata(encoding)
end
return DictEncoded(UInt8[], validity, inds, encoding, meta)
end
# Read element `i`: `missing` when its validity bit is unset, otherwise the
# pool value addressed by the (zero-based) stored code.
@propagate_inbounds function Base.getindex(d::DictEncoded, i::Integer)
    @boundscheck checkbounds(d, i)
    if !(@inbounds d.validity[i])
        return missing
    end
    @inbounds code = d.indices[i]
    @inbounds value = d.encoding[code + 1]
    return value
end
# Assign `v` to element `i`.
#
# * `missing` clears the element's validity bit.
# * Any other value is looked up in the encoding pool; if absent it is pushed
#   onto the pool. The element's code is then set to the value's zero-based
#   position in the pool.
#
# Returns `v`.
# NOTE(review): assigning a non-missing value does not set the validity bit, so
# an element that was previously `missing` presumably still reads as `missing`
# afterwards — confirm whether that is intended.
@propagate_inbounds function Base.setindex!(d::DictEncoded{T}, v, i::Integer) where {T}
    @boundscheck checkbounds(d, i)
    if v === missing
        @inbounds d.validity[i] = false
    else
        pool = d.encoding.data
        # predicate goes first: search the pool for an entry equal to `v`
        ix = findfirst(isequal(v), pool)
        if ix === nothing
            push!(pool, v)
            @inbounds d.indices[i] = length(pool) - 1
        else
            @inbounds d.indices[i] = ix - 1
        end
    end
    return v
end
# Copy a DictEncoded column into a `PooledArray`.
#
# The pool is copied, the zero-based codes are shifted to one-based refs, and
# the inverse lookup (value => ref) is rebuilt from the copied pool.
# NOTE(review): the validity bitmap is not consulted here, so elements flagged
# invalid are presumably materialized from whatever code they carry — confirm.
function Base.copy(x::DictEncoded{T,S}) where {T,S}
    pool = copy(x.encoding.data)
    # fresh vector of one-based refs; `x.indices` is left untouched
    refs = x.indices .+ one(S)
    return PooledArray(
        PooledArrays.RefArray(refs),
        Dict{T,S}(val => i for (i, val) in enumerate(pool)),
        pool,
    )
end
# Compress a DictEncoded column: one compressed buffer for the validity bitmap
# and one for the index vector, in that order.
function compress(Z::Meta.CompressionType.T, comp, x::A) where {A<:DictEncoded}
    n = length(x)
    nulls = nullcount(x)
    packedvalidity = compress(Z, comp, x.validity)
    packedindices = compress(Z, comp, x.indices)
    return Compressed{Z,A}(x, [packedvalidity, packedindices], n, nulls, Compressed[])
end
# Levels of a DictEncoded column: its pool with any `missing` entries removed.
# `DataAPI.refpool` returns a copy, so deleting from it leaves `x` untouched.
function DataAPI.levels(x::DictEncoded)
    pool = DataAPI.refpool(x) # may contain missing values
    if !(Missing <: eltype(pool))
        return pool
    end
    missingmask = ismissing.(pool)
    trimmed = deleteat!(pool, missingmask)
    return convert(AbstractArray{nonmissingtype(eltype(pool))}, trimmed)
end
# Register the field node and the two buffers (validity bitmap, indices) of a
# DictEncoded column, and return the buffer offset following them.
function makenodesbuffers!(
    col::DictEncoded{T,S},
    fieldnodes,
    fieldbuffers,
    bufferoffset,
    alignment,
) where {T,S}
    n = length(col)
    nulls = nullcount(col)
    push!(fieldnodes, FieldNode(n, nulls))
    @debug "made field node: nodeidx = $(length(fieldnodes)), col = $(typeof(col)), len = $(fieldnodes[end].length), nc = $(fieldnodes[end].null_count)"
    # validity bitmap: zero-length when there are no nulls
    validitybytes = nulls == 0 ? 0 : bitpackedbytes(n, alignment)
    push!(fieldbuffers, Buffer(bufferoffset, validitybytes))
    @debug "made field buffer: bufferidx = $(length(fieldbuffers)), offset = $(fieldbuffers[end].offset), len = $(fieldbuffers[end].length), padded = $(padding(fieldbuffers[end].length, alignment))"
    bufferoffset += validitybytes
    # indices: one value of the index type `S` per element
    indexbytes = sizeof(S) * n
    push!(fieldbuffers, Buffer(bufferoffset, indexbytes))
    @debug "made field buffer: bufferidx = $(length(fieldbuffers)), offset = $(fieldbuffers[end].offset), len = $(fieldbuffers[end].length), padded = $(padding(fieldbuffers[end].length, alignment))"
    return bufferoffset + padding(indexbytes, alignment)
end
# One-based refs: a new vector holding the stored zero-based codes plus one.
function DataAPI.refarray(x::DictEncoded{T,S}) where {T,S}
    return x.indices .+ one(S)
end
# The ref pool is a copy of the encoding's pool.
function DataAPI.refpool(x::DictEncoded)
    return copy(x.encoding.data)
end
# Write a DictEncoded column: the validity bitmap, then the index vector
# followed by zero padding up to `alignment`. Returns `nothing`.
function writebuffer(io, col::DictEncoded, alignment)
    @debug "writebuffer: col = $(typeof(col))"
    @debug col
    writebitmap(io, col, alignment)
    # indices follow the bitmap
    n = writearray(io, col.indices)
    @debug "writing array: col = $(typeof(col.indices)), n = $n, padded = $(padding(n, alignment))"
    padbytes = paddinglength(n, alignment)
    writezeros(io, padbytes)
    return nothing
end
================================================
FILE: src/arraytypes/fixedsizelist.jl
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for
gitextract_uitozoi5/
├── .JuliaFormatter.toml
├── .asf.yaml
├── .github/
│ ├── dependabot.yml
│ └── workflows/
│ ├── CompatHelper.yml
│ ├── TagBot.yml
│ ├── ci.yml
│ └── ci_nightly.yml
├── .gitignore
├── LICENSE
├── NOTICE
├── Project.toml
├── README.md
├── codecov.yaml
├── dev/
│ └── release/
│ ├── .dir-locals.el
│ ├── .gitignore
│ ├── README.md
│ ├── check_rat_report.py
│ ├── rat_exclude_files.txt
│ ├── release.sh
│ ├── release_rc.sh
│ ├── run_rat.sh
│ └── verify_rc.sh
├── docs/
│ ├── .gitignore
│ ├── Project.toml
│ ├── make.jl
│ └── src/
│ ├── index.md
│ ├── manual.md
│ └── reference.md
├── src/
│ ├── Arrow.jl
│ ├── ArrowTypes/
│ │ ├── LICENSE.md
│ │ ├── Project.toml
│ │ ├── src/
│ │ │ └── ArrowTypes.jl
│ │ └── test/
│ │ ├── Project.toml
│ │ ├── runtests.jl
│ │ └── tests.jl
│ ├── FlatBuffers/
│ │ ├── FlatBuffers.jl
│ │ ├── builder.jl
│ │ └── table.jl
│ ├── append.jl
│ ├── arraytypes/
│ │ ├── arraytypes.jl
│ │ ├── bool.jl
│ │ ├── compressed.jl
│ │ ├── dictencoding.jl
│ │ ├── fixedsizelist.jl
│ │ ├── list.jl
│ │ ├── map.jl
│ │ ├── primitive.jl
│ │ ├── struct.jl
│ │ ├── unions.jl
│ │ └── views.jl
│ ├── eltypes.jl
│ ├── metadata/
│ │ ├── File.jl
│ │ ├── Flatbuf.jl
│ │ ├── Message.jl
│ │ └── Schema.jl
│ ├── show.jl
│ ├── table.jl
│ ├── utils.jl
│ └── write.jl
└── test/
├── Project.toml
├── arrowjson/
│ ├── datetime.json
│ ├── decimal.json
│ ├── dictionary.json
│ ├── dictionary_unsigned.json
│ ├── map.json
│ ├── nested.json
│ ├── primitive-empty.json
│ ├── primitive.json
│ └── primitive_no_batches.json
├── arrowjson.jl
├── dates.jl
├── integrationtest.jl
├── java_compress_len_neg_one.arrow
├── java_compressed_zero_length.arrow
├── old_zdt.arrow
├── pyarrow_roundtrip.jl
├── reject_reason_trimmed.arrow
├── runtests.jl
├── testappend.jl
└── testtables.jl
Condensed preview — 80 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (1,451K chars).
[
{
"path": ".JuliaFormatter.toml",
"chars": 943,
"preview": "\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE "
},
{
"path": ".asf.yaml",
"chars": 1558,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": ".github/dependabot.yml",
"chars": 904,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": ".github/workflows/CompatHelper.yml",
"chars": 2191,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": ".github/workflows/TagBot.yml",
"chars": 1171,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": ".github/workflows/ci.yml",
"chars": 6865,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": ".github/workflows/ci_nightly.yml",
"chars": 3650,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": ".gitignore",
"chars": 882,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "LICENSE",
"chars": 11358,
"preview": "\n Apache License\n Version 2.0, January 2004\n "
},
{
"path": "NOTICE",
"chars": 173,
"preview": "Apache Arrow Julia\nCopyright 2016-2025 The Apache Software Foundation\n\nThis product includes software developed at\nThe A"
},
{
"path": "Project.toml",
"chars": 2039,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "README.md",
"chars": 3703,
"preview": "<!---\n Licensed to the Apache Software Foundation (ASF) under one\n or more contributor license agreements. See the NO"
},
{
"path": "codecov.yaml",
"chars": 869,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "dev/release/.dir-locals.el",
"chars": 895,
"preview": ";;; Licensed to the Apache Software Foundation (ASF) under one\n;;; or more contributor license agreements. See the NOTI"
},
{
"path": "dev/release/.gitignore",
"chars": 838,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "dev/release/README.md",
"chars": 3817,
"preview": "<!---\n Licensed to the Apache Software Foundation (ASF) under one\n or more contributor license agreements. See the NO"
},
{
"path": "dev/release/check_rat_report.py",
"chars": 1831,
"preview": "#!/usr/bin/env python3\n#\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agre"
},
{
"path": "dev/release/rat_exclude_files.txt",
"chars": 900,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "dev/release/release.sh",
"chars": 2279,
"preview": "#!/bin/bash\n#\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. Se"
},
{
"path": "dev/release/release_rc.sh",
"chars": 4398,
"preview": "#!/bin/bash\n#\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. Se"
},
{
"path": "dev/release/run_rat.sh",
"chars": 1695,
"preview": "#!/bin/bash\n#\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. Se"
},
{
"path": "dev/release/verify_rc.sh",
"chars": 5412,
"preview": "#!/bin/bash\n#\n# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. Se"
},
{
"path": "docs/.gitignore",
"chars": 799,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "docs/Project.toml",
"chars": 919,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "docs/make.jl",
"chars": 1312,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "docs/src/index.md",
"chars": 908,
"preview": "```@raw html\n<!---\n Licensed to the Apache Software Foundation (ASF) under one\n or more contributor license agreements"
},
{
"path": "docs/src/manual.md",
"chars": 26781,
"preview": "```@raw html\n<!---\n Licensed to the Apache Software Foundation (ASF) under one\n or more contributor license agreements"
},
{
"path": "docs/src/reference.md",
"chars": 1086,
"preview": "```@raw html\n<!---\n Licensed to the Apache Software Foundation (ASF) under one\n or more contributor license agreements"
},
{
"path": "src/Arrow.jl",
"chars": 4982,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/ArrowTypes/LICENSE.md",
"chars": 779,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/ArrowTypes/Project.toml",
"chars": 1044,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/ArrowTypes/src/ArrowTypes.jl",
"chars": 21026,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/ArrowTypes/test/Project.toml",
"chars": 989,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/ArrowTypes/test/runtests.jl",
"chars": 860,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/ArrowTypes/test/tests.jl",
"chars": 10413,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/FlatBuffers/FlatBuffers.jl",
"chars": 1704,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/FlatBuffers/builder.jl",
"chars": 13846,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/FlatBuffers/table.jl",
"chars": 5483,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/append.jl",
"chars": 11620,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/arraytypes/arraytypes.jl",
"chars": 8710,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/arraytypes/bool.jl",
"chars": 4176,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/arraytypes/compressed.jl",
"chars": 3484,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/arraytypes/dictencoding.jl",
"chars": 14875,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/arraytypes/fixedsizelist.jl",
"chars": 6616,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/arraytypes/list.jl",
"chars": 8303,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/arraytypes/map.jl",
"chars": 5692,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/arraytypes/primitive.jl",
"chars": 4278,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/arraytypes/struct.jl",
"chars": 5419,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/arraytypes/unions.jl",
"chars": 12830,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/arraytypes/views.jl",
"chars": 2640,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/eltypes.jl",
"chars": 21619,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/metadata/File.jl",
"chars": 3701,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/metadata/Flatbuf.jl",
"chars": 906,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/metadata/Message.jl",
"chars": 8299,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/metadata/Schema.jl",
"chars": 24725,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/show.jl",
"chars": 2547,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/table.jl",
"chars": 38613,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/utils.jl",
"chars": 4756,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "src/write.jl",
"chars": 27193,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "test/Project.toml",
"chars": 2008,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "test/arrowjson/datetime.json",
"chars": 18268,
"preview": "{\n \"schema\": {\n \"fields\": [\n {\n \"name\": \"f0\",\n \"type\": {\n \"name\": \"date\",\n \"uni"
},
{
"path": "test/arrowjson/decimal.json",
"chars": 771261,
"preview": "{\n \"schema\": {\n \"fields\": [\n {\n \"name\": \"f0\",\n \"type\": {\n \"name\": \"decimal\",\n \""
},
{
"path": "test/arrowjson/dictionary.json",
"chars": 7694,
"preview": "{\n \"schema\": {\n \"fields\": [\n {\n \"name\": \"dict0\",\n \"type\": {\n \"name\": \"utf8\"\n },\n "
},
{
"path": "test/arrowjson/dictionary_unsigned.json",
"chars": 5474,
"preview": "{\n \"schema\": {\n \"fields\": [\n {\n \"name\": \"f0\",\n \"type\": {\n \"name\": \"utf8\"\n },\n "
},
{
"path": "test/arrowjson/map.json",
"chars": 6596,
"preview": "{\n \"schema\": {\n \"fields\": [\n {\n \"name\": \"map_nullable\",\n \"type\": {\n \"name\": \"map\",\n "
},
{
"path": "test/arrowjson/nested.json",
"chars": 11143,
"preview": "{\n \"schema\": {\n \"fields\": [\n {\n \"name\": \"list_nullable\",\n \"type\": {\n \"name\": \"list\"\n "
},
{
"path": "test/arrowjson/primitive-empty.json",
"chars": 18409,
"preview": "{\n \"schema\": {\n \"fields\": [\n {\n \"name\": \"bool_nullable\",\n \"type\": {\n \"name\": \"bool\"\n "
},
{
"path": "test/arrowjson/primitive.json",
"chars": 44256,
"preview": "{\n \"schema\": {\n \"fields\": [\n {\n \"name\": \"bool_nullable\",\n \"type\": {\n \"name\": \"bool\"\n "
},
{
"path": "test/arrowjson/primitive_no_batches.json",
"chars": 5981,
"preview": "{\n \"schema\": {\n \"fields\": [\n {\n \"name\": \"bool_nullable\",\n \"type\": {\n \"name\": \"bool\"\n "
},
{
"path": "test/arrowjson.jl",
"chars": 23945,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "test/dates.jl",
"chars": 2489,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "test/integrationtest.jl",
"chars": 1670,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "test/pyarrow_roundtrip.jl",
"chars": 2805,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "test/runtests.jl",
"chars": 39689,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "test/testappend.jl",
"chars": 5971,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
},
{
"path": "test/testtables.jl",
"chars": 11664,
"preview": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements. See the NOTICE f"
}
]
// ... and 4 more files (download for full content)
About this extraction
This page contains the full source code of the apache/arrow-julia GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 80 files (1.3 MB), approximately 363.1k tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.