Showing preview only (841K chars total). Download the full file or copy to clipboard to get everything.
Repository: adelbertc/frameless
Branch: master
Commit: 6826375be4c0
Files: 206
Total size: 773.8 KB
Directory structure:
gitextract_u5s1eutc/
├── .github/
│ ├── release-drafter.yml
│ └── workflows/
│ ├── ci.yml
│ ├── clean.yml
│ └── release-drafter.yml
├── .gitignore
├── .scalafmt.conf
├── LICENSE
├── README.md
├── build.sbt
├── cats/
│ └── src/
│ ├── main/
│ │ └── scala/
│ │ └── frameless/
│ │ └── cats/
│ │ ├── FramelessSyntax.scala
│ │ ├── SparkDelayInstances.scala
│ │ ├── SparkTask.scala
│ │ ├── implicits.scala
│ │ └── package.scala
│ └── test/
│ ├── resources/
│ │ ├── log4j.properties
│ │ └── log4j2.properties
│ └── scala/
│ └── frameless/
│ └── cats/
│ ├── FramelessSyntaxTests.scala
│ └── test.scala
├── core/
│ └── src/
│ └── main/
│ └── scala/
│ └── frameless/
│ ├── CatalystAverageable.scala
│ ├── CatalystBitShift.scala
│ ├── CatalystBitwise.scala
│ ├── CatalystCast.scala
│ ├── CatalystCollection.scala
│ ├── CatalystDivisible.scala
│ ├── CatalystIsin.scala
│ ├── CatalystNaN.scala
│ ├── CatalystNotNullable.scala
│ ├── CatalystNumeric.scala
│ ├── CatalystNumericWithJavaBigDecimal.scala
│ ├── CatalystOrdered.scala
│ ├── CatalystPivotable.scala
│ ├── CatalystRound.scala
│ ├── CatalystSummable.scala
│ ├── CatalystVariance.scala
│ ├── Injection.scala
│ ├── SQLDate.scala
│ └── SQLTimestamp.scala
├── dataset/
│ └── src/
│ ├── main/
│ │ ├── scala/
│ │ │ ├── frameless/
│ │ │ │ ├── FramelessSyntax.scala
│ │ │ │ ├── InjectionEnum.scala
│ │ │ │ ├── IsValueClass.scala
│ │ │ │ ├── Job.scala
│ │ │ │ ├── RecordEncoder.scala
│ │ │ │ ├── SparkDelay.scala
│ │ │ │ ├── TypedColumn.scala
│ │ │ │ ├── TypedColumnMacroImpl.scala
│ │ │ │ ├── TypedDataset.scala
│ │ │ │ ├── TypedDatasetForwarded.scala
│ │ │ │ ├── TypedEncoder.scala
│ │ │ │ ├── TypedExpressionEncoder.scala
│ │ │ │ ├── With.scala
│ │ │ │ ├── functions/
│ │ │ │ │ ├── AggregateFunctions.scala
│ │ │ │ │ ├── Lit.scala
│ │ │ │ │ ├── NonAggregateFunctions.scala
│ │ │ │ │ ├── Udf.scala
│ │ │ │ │ ├── UnaryFunctions.scala
│ │ │ │ │ └── package.scala
│ │ │ │ ├── ops/
│ │ │ │ │ ├── AggregateTypes.scala
│ │ │ │ │ ├── As.scala
│ │ │ │ │ ├── ColumnTypes.scala
│ │ │ │ │ ├── GroupByOps.scala
│ │ │ │ │ ├── RelationalGroupsOps.scala
│ │ │ │ │ ├── Repeat.scala
│ │ │ │ │ └── SmartProject.scala
│ │ │ │ └── syntax/
│ │ │ │ └── package.scala
│ │ │ └── org/
│ │ │ └── apache/
│ │ │ └── spark/
│ │ │ └── sql/
│ │ │ ├── FramelessInternals.scala
│ │ │ └── reflection/
│ │ │ └── package.scala
│ │ ├── spark-3/
│ │ │ └── frameless/
│ │ │ └── MapGroups.scala
│ │ └── spark-3.4+/
│ │ └── frameless/
│ │ └── MapGroups.scala
│ └── test/
│ ├── resources/
│ │ ├── log4j.properties
│ │ └── log4j2.properties
│ ├── scala/
│ │ ├── frameless/
│ │ │ ├── AsTests.scala
│ │ │ ├── BitwiseTests.scala
│ │ │ ├── CastTests.scala
│ │ │ ├── ColTests.scala
│ │ │ ├── CollectTests.scala
│ │ │ ├── ColumnTests.scala
│ │ │ ├── ColumnViaLambdaTests.scala
│ │ │ ├── CreateTests.scala
│ │ │ ├── DropTest.scala
│ │ │ ├── DropTupledTest.scala
│ │ │ ├── EncoderTests.scala
│ │ │ ├── ExplodeTests.scala
│ │ │ ├── FilterTests.scala
│ │ │ ├── FlattenTests.scala
│ │ │ ├── GroupByTests.scala
│ │ │ ├── InjectionTests.scala
│ │ │ ├── IsValueClassTests.scala
│ │ │ ├── JobTests.scala
│ │ │ ├── JoinTests.scala
│ │ │ ├── LitTests.scala
│ │ │ ├── NumericTests.scala
│ │ │ ├── OrderByTests.scala
│ │ │ ├── RecordEncoderTests.scala
│ │ │ ├── SchemaTests.scala
│ │ │ ├── SelectTests.scala
│ │ │ ├── SelfJoinTests.scala
│ │ │ ├── TypedDatasetSuite.scala
│ │ │ ├── UdtEncodedClass.scala
│ │ │ ├── WithColumnTest.scala
│ │ │ ├── WithColumnTupledTest.scala
│ │ │ ├── XN.scala
│ │ │ ├── forward/
│ │ │ │ ├── CheckpointTests.scala
│ │ │ │ ├── ColumnsTests.scala
│ │ │ │ ├── CountTests.scala
│ │ │ │ ├── DistinctTests.scala
│ │ │ │ ├── ExceptTests.scala
│ │ │ │ ├── FirstTests.scala
│ │ │ │ ├── ForeachTests.scala
│ │ │ │ ├── HeadTests.scala
│ │ │ │ ├── InputFilesTests.scala
│ │ │ │ ├── IntersectTests.scala
│ │ │ │ ├── IsLocalTests.scala
│ │ │ │ ├── IsStreamingTests.scala
│ │ │ │ ├── LimitTests.scala
│ │ │ │ ├── QueryExecutionTests.scala
│ │ │ │ ├── RandomSplitTests.scala
│ │ │ │ ├── SQLContextTests.scala
│ │ │ │ ├── SparkSessionTests.scala
│ │ │ │ ├── StorageLevelTests.scala
│ │ │ │ ├── TakeTests.scala
│ │ │ │ ├── ToJSONTests.scala
│ │ │ │ ├── ToLocalIteratorTests.scala
│ │ │ │ ├── UnionTests.scala
│ │ │ │ ├── WriteStreamTests.scala
│ │ │ │ └── WriteTests.scala
│ │ │ ├── functions/
│ │ │ │ ├── AggregateFunctionsTests.scala
│ │ │ │ ├── DateTimeStringBehaviourUtils.scala
│ │ │ │ ├── DoubleBehaviourUtils.scala
│ │ │ │ ├── NonAggregateFunctionsTests.scala
│ │ │ │ ├── UdfTests.scala
│ │ │ │ └── UnaryFunctionsTest.scala
│ │ │ ├── ops/
│ │ │ │ ├── ColumnTypesTest.scala
│ │ │ │ ├── CubeTests.scala
│ │ │ │ ├── PivotTest.scala
│ │ │ │ ├── RepeatTest.scala
│ │ │ │ ├── RollupTests.scala
│ │ │ │ ├── SmartProjectTest.scala
│ │ │ │ └── deserialized/
│ │ │ │ ├── FilterTests.scala
│ │ │ │ ├── FlatMapTests.scala
│ │ │ │ ├── MapPartitionsTests.scala
│ │ │ │ ├── MapTests.scala
│ │ │ │ └── ReduceTests.scala
│ │ │ ├── package.scala
│ │ │ ├── sql/
│ │ │ │ ├── package.scala
│ │ │ │ └── rules/
│ │ │ │ └── SQLRulesSuite.scala
│ │ │ └── syntax/
│ │ │ └── FramelessSyntaxTests.scala
│ │ └── org/
│ │ └── apache/
│ │ └── hadoop/
│ │ └── fs/
│ │ └── local/
│ │ └── StreamingFS.scala
│ ├── spark-3.2/
│ │ └── frameless/
│ │ └── sql/
│ │ └── rules/
│ │ └── FramelessLitPushDownTests.scala
│ └── spark-3.3+/
│ └── frameless/
│ └── sql/
│ └── rules/
│ └── FramelessLitPushDownTests.scala
├── docs/
│ ├── Cats.md
│ ├── FeatureOverview.md
│ ├── Injection.md
│ ├── Job.md
│ ├── TypedDataFrame.md
│ ├── TypedDatasetVsSparkDataset.md
│ ├── TypedEncoder.md
│ ├── TypedML.md
│ ├── WorkingWithCsvParquetJson.md
│ ├── directory.conf
│ ├── iris.data
│ └── iris.parquet
├── github.sbt
├── ml/
│ └── src/
│ ├── main/
│ │ └── scala/
│ │ ├── frameless/
│ │ │ └── ml/
│ │ │ ├── TypedEstimator.scala
│ │ │ ├── TypedTransformer.scala
│ │ │ ├── classification/
│ │ │ │ └── TypedRandomForestClassifier.scala
│ │ │ ├── clustering/
│ │ │ │ ├── TypedBisectingKMeans.scala
│ │ │ │ └── TypedKMeans.scala
│ │ │ ├── feature/
│ │ │ │ ├── TypedIndexToString.scala
│ │ │ │ ├── TypedStringIndexer.scala
│ │ │ │ └── TypedVectorAssembler.scala
│ │ │ ├── internals/
│ │ │ │ ├── LinearInputsChecker.scala
│ │ │ │ ├── SelectorByValue.scala
│ │ │ │ ├── TreesInputsChecker.scala
│ │ │ │ ├── UnaryInputsChecker.scala
│ │ │ │ └── VectorInputsChecker.scala
│ │ │ ├── package.scala
│ │ │ ├── params/
│ │ │ │ ├── kmeans/
│ │ │ │ │ └── KMeansInitMode.scala
│ │ │ │ ├── linears/
│ │ │ │ │ ├── LossStrategy.scala
│ │ │ │ │ └── Solver.scala
│ │ │ │ └── trees/
│ │ │ │ └── FeatureSubsetStrategy.scala
│ │ │ └── regression/
│ │ │ ├── TypedLinearRegression.scala
│ │ │ └── TypedRandomForestRegressor.scala
│ │ └── org/
│ │ └── apache/
│ │ └── spark/
│ │ └── ml/
│ │ └── FramelessInternals.scala
│ └── test/
│ └── scala/
│ └── frameless/
│ └── ml/
│ ├── FramelessMlSuite.scala
│ ├── Generators.scala
│ ├── TypedEncoderInstancesTests.scala
│ ├── classification/
│ │ ├── ClassificationIntegrationTests.scala
│ │ └── TypedRandomForestClassifierTests.scala
│ ├── clustering/
│ │ ├── BisectingKMeansTests.scala
│ │ ├── ClusteringIntegrationTests.scala
│ │ └── KMeansTests.scala
│ ├── feature/
│ │ ├── TypedIndexToStringTests.scala
│ │ ├── TypedStringIndexerTests.scala
│ │ └── TypedVectorAssemblerTests.scala
│ └── regression/
│ ├── RegressionIntegrationTests.scala
│ ├── TypedLinearRegressionTests.scala
│ └── TypedRandomForestRegressorTests.scala
├── project/
│ ├── Common.scala
│ ├── build.properties
│ └── plugins.sbt
├── refined/
│ └── src/
│ ├── main/
│ │ └── scala/
│ │ └── frameless/
│ │ └── refined/
│ │ ├── RefinedFieldEncoders.scala
│ │ └── package.scala
│ └── test/
│ └── scala/
│ └── frameless/
│ └── RefinedFieldEncoderTests.scala
└── scripts/
├── docs-build.sh
├── docs-publish.sh
└── travis-publish.sh
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/release-drafter.yml
================================================
# Release Drafter configuration: describes how the draft release notes are
# assembled from pull requests.

# The draft release's name and tag are both the next minor version, prefixed with "v".
name-template: 'v$NEXT_MINOR_VERSION'
tag-template: 'v$NEXT_MINOR_VERSION'
# Changelog sections; a pull request is listed under the section whose labels it carries.
categories:
- title: 'Added'
labels:
- 'feature'
- title: 'Changed'
labels:
- 'enhancement'
- 'dependency-update'
- title: 'Fixed'
labels:
- 'fix'
- 'bug'
# Only pull requests carrying at least one of these labels appear in the notes.
include-labels:
- 'feature'
- 'enhancement'
- 'dependency-update'
- 'fix'
- 'bug'
# Pull requests carrying any of these labels are left out of the notes.
exclude-labels:
- 'skip-changelog'
- 'documentation'
- 'build/process improvement'
# One changelog line per pull request: title, linked PR number, author handle.
# NOTE(review): the link is hard-coded to typelevel/frameless - confirm this is intended for forks.
change-template: '- $TITLE [#$NUMBER](https://github.com/typelevel/frameless/pull/$NUMBER) (@$AUTHOR)'
# Body of the release notes; $CHANGES is replaced by the generated changelog.
template: |
$CHANGES
================================================
FILE: .github/workflows/ci.yml
================================================
# This file was automatically generated by sbt-github-actions using the
# githubWorkflowGenerate task. You should add and commit this file to
# your git repository. It goes without saying that you shouldn't edit
# this file by hand! Instead, if you wish to make changes, you should
# change your sbt build configuration to revise the workflow description
# to meet your needs, then regenerate this file.
name: Continuous Integration
on:
pull_request:
branches: ['**', '!update/**', '!pr/**']
push:
branches: ['**', '!update/**', '!pr/**']
tags: [v*]
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
SBT_OPTS: '-Xms1g -Xmx4g'
SPARK_LOCAL_IP: localhost
concurrency:
group: ${{ github.workflow }} @ ${{ github.ref }}
cancel-in-progress: true
jobs:
build:
name: Test
strategy:
matrix:
os: [ubuntu-22.04]
scala: [2.13, 2.12]
java: [temurin@8]
project: [root-spark33, root-spark34, root-spark35]
exclude:
- scala: 2.13
project: root-spark33
- scala: 2.13
project: root-spark34
runs-on: ${{ matrix.os }}
timeout-minutes: 60
steps:
- name: Checkout current branch (full)
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup sbt
uses: sbt/setup-sbt@v1
- name: Setup Java (temurin@8)
id: setup-java-temurin-8
if: matrix.java == 'temurin@8'
uses: actions/setup-java@v5
with:
distribution: temurin
java-version: 8
cache: sbt
- name: sbt update
if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false'
run: sbt +update
- name: Check that workflows are up to date
run: sbt githubWorkflowCheck
- name: Check formatting
if: matrix.java == 'temurin@8' && matrix.os == 'ubuntu-22.04'
run: sbt '++ ${{ matrix.scala }}' 'project ${{ matrix.project }}' scalafmtCheckAll 'project /' scalafmtSbtCheck
- name: Test & Compute Coverage
run: sbt '++ ${{ matrix.scala }}' 'project ${{ matrix.project }}' coverage test test/coverageReport
- name: Check binary compatibility
if: matrix.java == 'temurin@8' && matrix.os == 'ubuntu-22.04'
run: sbt '++ ${{ matrix.scala }}' 'project ${{ matrix.project }}' mimaReportBinaryIssues
- name: Generate API documentation
if: matrix.java == 'temurin@8' && matrix.os == 'ubuntu-22.04'
run: sbt '++ ${{ matrix.scala }}' 'project ${{ matrix.project }}' doc
- uses: codecov/codecov-action@v3
with:
flags: ${{ matrix.scala }}-${{ matrix.project }}
publish:
name: Publish Artifacts
needs: [build]
if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/master')
strategy:
matrix:
os: [ubuntu-22.04]
java: [temurin@8]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout current branch (full)
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup sbt
uses: sbt/setup-sbt@v1
- name: Setup Java (temurin@8)
id: setup-java-temurin-8
if: matrix.java == 'temurin@8'
uses: actions/setup-java@v5
with:
distribution: temurin
java-version: 8
cache: sbt
- name: sbt update
if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false'
run: sbt +update
- name: Import signing key
if: env.PGP_SECRET != '' && env.PGP_PASSPHRASE == ''
env:
PGP_SECRET: ${{ secrets.PGP_SECRET }}
PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }}
run: echo $PGP_SECRET | base64 -d -i - | gpg --import
- name: Import signing key and strip passphrase
if: env.PGP_SECRET != '' && env.PGP_PASSPHRASE != ''
env:
PGP_SECRET: ${{ secrets.PGP_SECRET }}
PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }}
run: |
echo "$PGP_SECRET" | base64 -d -i - > /tmp/signing-key.gpg
echo "$PGP_PASSPHRASE" | gpg --pinentry-mode loopback --passphrase-fd 0 --import /tmp/signing-key.gpg
(echo "$PGP_PASSPHRASE"; echo; echo) | gpg --command-fd 0 --pinentry-mode loopback --change-passphrase $(gpg --list-secret-keys --with-colons 2> /dev/null | grep '^sec:' | cut --delimiter ':' --fields 5 | tail -n 1)
- name: Publish
env:
SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }}
SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }}
SONATYPE_CREDENTIAL_HOST: ${{ secrets.SONATYPE_CREDENTIAL_HOST }}
run: sbt tlCiRelease
dependency-submission:
name: Submit Dependencies
if: github.event.repository.fork == false && github.event_name != 'pull_request'
strategy:
matrix:
os: [ubuntu-22.04]
java: [temurin@8]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout current branch (full)
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup sbt
uses: sbt/setup-sbt@v1
- name: Setup Java (temurin@8)
id: setup-java-temurin-8
if: matrix.java == 'temurin@8'
uses: actions/setup-java@v5
with:
distribution: temurin
java-version: 8
cache: sbt
- name: sbt update
if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false'
run: sbt +update
- name: Submit Dependencies
uses: scalacenter/sbt-dependency-submission@v2
with:
modules-ignore: root-spark33_2.13 root-spark33_2.12 docs_2.13 docs_2.12 root-spark34_2.13 root-spark34_2.12 root-spark35_2.13 root-spark35_2.12
configs-ignore: test scala-tool scala-doc-tool test-internal
site:
name: Generate Site
strategy:
matrix:
os: [ubuntu-22.04]
java: [temurin@11]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout current branch (full)
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup sbt
uses: sbt/setup-sbt@v1
- name: Setup Java (temurin@8)
id: setup-java-temurin-8
if: matrix.java == 'temurin@8'
uses: actions/setup-java@v5
with:
distribution: temurin
java-version: 8
cache: sbt
- name: sbt update
if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false'
run: sbt +update
- name: Setup Java (temurin@11)
id: setup-java-temurin-11
if: matrix.java == 'temurin@11'
uses: actions/setup-java@v5
with:
distribution: temurin
java-version: 11
cache: sbt
- name: sbt update
if: matrix.java == 'temurin@11' && steps.setup-java-temurin-11.outputs.cache-hit == 'false'
run: sbt +update
- name: Generate site
run: sbt docs/tlSite
- name: Publish site
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/master'
uses: peaceiris/actions-gh-pages@v4.0.0
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: mdocs/target/docs/site
keep_files: true
================================================
FILE: .github/workflows/clean.yml
================================================
# This file was automatically generated by sbt-github-actions using the
# githubWorkflowGenerate task. You should add and commit this file to
# your git repository. It goes without saying that you shouldn't edit
# this file by hand! Instead, if you wish to make changes, you should
# change your sbt build configuration to revise the workflow description
# to meet your needs, then regenerate this file.
name: Clean
on: push
jobs:
delete-artifacts:
name: Delete Artifacts
runs-on: ubuntu-latest
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- name: Delete artifacts
run: |
# Customize those three lines with your repository and credentials:
REPO=${GITHUB_API_URL}/repos/${{ github.repository }}
# A shortcut to call GitHub API.
ghapi() { curl --silent --location --user _:$GITHUB_TOKEN "$@"; }
# A temporary file which receives HTTP response headers.
TMPFILE=/tmp/tmp.$$
# An associative array, key: artifact name, value: number of artifacts of that name.
declare -A ARTCOUNT
# Process all artifacts on this repository, loop on returned "pages".
URL=$REPO/actions/artifacts
while [[ -n "$URL" ]]; do
# Get current page, get response headers in a temporary file.
JSON=$(ghapi --dump-header $TMPFILE "$URL")
# Get URL of next page. Will be empty if we are at the last page.
URL=$(grep '^Link:' "$TMPFILE" | tr ',' '\n' | grep 'rel="next"' | head -1 | sed -e 's/.*<//' -e 's/>.*//')
rm -f $TMPFILE
# Number of artifacts on this page:
COUNT=$(( $(jq <<<$JSON -r '.artifacts | length') ))
# Loop on all artifacts on this page.
for ((i=0; $i < $COUNT; i++)); do
# Get name of artifact and count instances of this name.
name=$(jq <<<$JSON -r ".artifacts[$i].name?")
ARTCOUNT[$name]=$(( $(( ${ARTCOUNT[$name]} )) + 1))
id=$(jq <<<$JSON -r ".artifacts[$i].id?")
size=$(( $(jq <<<$JSON -r ".artifacts[$i].size_in_bytes?") ))
printf "Deleting '%s' #%d, %'d bytes\n" $name ${ARTCOUNT[$name]} $size
ghapi -X DELETE $REPO/actions/artifacts/$id
done
done
================================================
FILE: .github/workflows/release-drafter.yml
================================================
# Workflow that runs the Release Drafter action.
# Triggers: pushes to master, and pull requests being opened, reopened or synchronized.
name: Release Drafter
on:
push:
branches:
- master
pull_request:
types: [opened, reopened, synchronize]
jobs:
update_release_draft:
runs-on: ubuntu-latest
steps:
# Action pinned to v5.15.0; it receives the workflow's GITHUB_TOKEN through the environment.
- uses: release-drafter/release-drafter@v5.15.0
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
================================================
FILE: .gitignore
================================================
*.class
*.log
# sbt specific
.bsp/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/
# Scala-IDE specific
.scala_dependencies
.cache
.classpath
.project
.worksheet/
bin/
.settings/
.ensime
.ensime_cache/
# IntelliJ specific
.idea
# OS X
.DS_Store
node_modules
# VSCode
.history
.metals
.vscode
.bloop
metals.sbt
================================================
FILE: .scalafmt.conf
================================================
# scalafmt settings for this repository.

# Formatter version and the Scala dialect used when parsing sources.
version = 3.8.6
runner.dialect = scala213
# Newline placement rules.
newlines.beforeMultilineDef = keep
newlines.topLevelStatements = [before]
newlines.beforeCurlyLambdaParams = multilineWithCaseOnly
newlines.afterCurlyLambdaParams = squash
newlines.implicitParamListModifierForce = [after]
newlines.avoidForSimpleOverflow = [tooLong]
newlines.avoidInResultType = true
newlines.sometimesBeforeColonInMethodReturnType = false
newlines.beforeTypeBounds = keep
# Vertical layout of definition-site parameter lists.
verticalMultiline.atDefnSite = true
verticalMultiline.arityThreshold = 10
# Spacing and select-chain handling.
spaces.inImportCurlyBraces = true
includeCurlyBraceInSelectChains = false
includeNoParensInSelectChains = false
optIn.breakChainOnFirstMethodDot = false
# Scaladoc comment formatting.
docstrings.style = Asterisk
docstrings.wrap = no
# Casing applied to numeric literal suffixes.
literals.long=Upper
literals.float=Upper
literals.double=Upper
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# Frameless
[CI build status](https://github.com/typelevel/frameless/actions/workflows/ci.yml)
[Code coverage](https://codecov.io/gh/typelevel/frameless)
[Discord](https://discord.gg/ZDZsxWcBJt)
[Maven Central](https://search.maven.org/search?q=g:org.typelevel%20and%20frameless)
[Sonatype snapshots](https://s01.oss.sonatype.org/content/repositories/snapshots/org/typelevel/frameless-core_2.12/)
Frameless is a Scala library for working with [Spark](http://spark.apache.org/) using more expressive types.
It consists of the following modules:
* `frameless-dataset` for a more strongly typed `Dataset`/`DataFrame` API
* `frameless-ml` for a more strongly typed Spark ML API based on `frameless-dataset`
* `frameless-cats` for using Spark's `RDD` API with [cats](https://github.com/typelevel/cats)
Note that while Frameless is still getting off the ground, it is very possible that breaking changes will be
made for at least the next few versions.
The Frameless project and contributors support the
[Typelevel](http://typelevel.org/) [Code of Conduct](http://typelevel.org/code-of-conduct.html) and want all its
associated channels (e.g. GitHub, Discord) to be a safe and friendly environment for contributing and learning.
## Versions and dependencies
The compatible versions of [Spark](http://spark.apache.org/) and
[cats](https://github.com/typelevel/cats) are as follows:
| Frameless | Spark | Cats | Cats-Effect | Scala |
|-----------|-----------------------------|----------|-------------|-------------|
| 0.16.0 | 3.5.0 / 3.4.0 / 3.3.0 | 2.x | 3.x | 2.12 / 2.13 |
| 0.15.0 | 3.4.0 / 3.3.0 / 3.2.2 | 2.x | 3.x | 2.12 / 2.13 |
| 0.14.1 | 3.4.0 / 3.3.0 / 3.2.2 | 2.x | 3.x | 2.12 / 2.13 |
| 0.14.0 | 3.3.0 / 3.2.2 / 3.1.3 | 2.x | 3.x | 2.12 / 2.13 |
| 0.13.0 | 3.3.0 / 3.2.2 / 3.1.3 | 2.x | 3.x | 2.12 / 2.13 |
| 0.12.0 | 3.2.1 / 3.1.3 / 3.0.3 | 2.x | 3.x | 2.12 / 2.13 |
| 0.11.1 | 3.2.0 / 3.1.2 / 3.0.1 | 2.x | 2.x | 2.12 / 2.13 |
| 0.11.0* | 3.2.0 / 3.1.2 / 3.0.1 | 2.x | 2.x | 2.12 / 2.13 |
| 0.10.1 | 3.1.0 | 2.x | 2.x | 2.12 |
| 0.9.0 | 3.0.0 | 1.x | 1.x | 2.12 |
| 0.8.0 | 2.4.0 | 1.x | 1.x | 2.11 / 2.12 |
| 0.7.0 | 2.3.1 | 1.x | 1.x | 2.11 |
| 0.6.1 | 2.3.0 | 1.x | 0.8 | 2.11 |
| 0.5.2 | 2.2.1 | 1.x | 0.8 | 2.11 |
| 0.4.1 | 2.2.0 | 1.x | 0.8 | 2.11 |
| 0.4.0 | 2.2.0 | 1.0.0-IF | 0.4 | 2.11 |
_\* 0.11.0 has broken Spark 3.1.2 and 3.0.1 artifacts published._
Starting with 0.11, we introduced Spark cross-published artifacts:
* By default, frameless artifacts depend on the most recent Spark version
* Suffix `-spark{major}{minor}` is added to artifacts that are released for the previous Spark version(s)
Artifact names examples:
* `frameless-dataset` (the latest Spark dependency)
* `frameless-dataset-spark33` (Spark 3.3.x dependency)
* `frameless-dataset-spark32` (Spark 3.2.x dependency)
Versions 0.5.x and 0.6.x have identical features. The first is compatible with Spark 2.2.1 and the second with 2.3.0.
The **only** dependency of the `frameless-dataset` module is on [shapeless](https://github.com/milessabin/shapeless) 2.3.2.
Therefore, depending on `frameless-dataset` adds minimal overhead to your Spark application's jar.
Only the `frameless-cats` module depends on cats and cats-effect, so if you prefer to work just with `Datasets` and not with `RDD`s,
you may choose not to depend on `frameless-cats`.
Frameless intentionally **does not** have a compile dependency on Spark.
This essentially allows you to use any version of Frameless with any version of Spark.
The aforementioned table simply provides the versions of Spark we officially compile
and test Frameless with, but other versions will probably work as well.
### Breaking changes in 0.9
* Spark 3 introduces a new ExpressionEncoder approach: the schema for single-value DataFrames is now ["value"](https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala#L270), not "_1".
## Why?
Frameless introduces a new Spark API, called `TypedDataset`.
The benefits of using `TypedDataset` compared to the standard Spark `Dataset` API are as follows:
* Typesafe column referencing (e.g., no more runtime errors when accessing non-existing columns)
* Customizable, typesafe encoders (e.g., if a type does not have an encoder, it should not compile)
* Enhanced type signatures for built-in functions (e.g., if you apply an arithmetic operation on a non-numeric column, you
get a compilation error)
* Typesafe casting and projections
Click [here](http://typelevel.org/frameless/TypedDatasetVsSparkDataset.html) for a
detailed comparison of `TypedDataset` with Spark's `Dataset` API.
## Documentation
* [TypedDataset: Feature Overview](http://typelevel.org/frameless/FeatureOverview.html)
* [Typed Spark ML](http://typelevel.org/frameless/TypedML.html)
* [Comparing TypedDatasets with Spark's Datasets](http://typelevel.org/frameless/TypedDatasetVsSparkDataset.html)
* [Typed Encoders in Frameless](http://typelevel.org/frameless/TypedEncoder.html)
* [Injection: Creating Custom Encoders](http://typelevel.org/frameless/Injection.html)
* [Job\[A\]](http://typelevel.org/frameless/Job.html)
* [Using Cats with RDDs](http://typelevel.org/frameless/Cats.html)
* [Proof of Concept: TypedDataFrame](http://typelevel.org/frameless/TypedDataFrame.html)
## Quick Start
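Since the 0.9.x release, Frameless is no longer compiled against Scala 2.11: releases 0.9.x and 0.10.x support only Scala 2.12.x, while 0.11 and later support both Scala 2.12 and 2.13 (see the table above).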
To use Frameless in your project add the following in your `build.sbt` file as needed:
```scala
val framelessVersion = "<latest version>"
resolvers ++= Seq(
// for snapshot artifacts only
"s01-oss-sonatype" at "https://s01.oss.sonatype.org/content/repositories/snapshots"
)
libraryDependencies ++= List(
"org.typelevel" %% "frameless-dataset" % framelessVersion,
"org.typelevel" %% "frameless-ml" % framelessVersion,
"org.typelevel" %% "frameless-cats" % framelessVersion
)
```
An easy way to bootstrap a Frameless sbt project:
* if you have [Giter8][g8] installed then simply:
```bash
g8 imarios/frameless.g8
```
- with sbt >= 0.13.13:
```bash
sbt new imarios/frameless.g8
```
Typing `sbt console` inside your project will bring up a shell with Frameless
and all its dependencies loaded (including Spark).
## Need help?
Feel free to message us on our [Discord](https://discord.gg/ZDZsxWcBJt)
channel for any issues/questions.
## Development
We require at least _one_ sign-off (thumbs-up, +1, or similar) to merge pull requests. The current maintainers
(people who can merge pull requests) are:
* [adelbertc](https://github.com/adelbertc)
* [imarios](https://github.com/imarios)
* [kanterov](https://github.com/kanterov)
* [non](https://github.com/non)
* [OlivierBlanvillain](https://github.com/OlivierBlanvillain/)
### Testing
Frameless contains several property tests. To avoid `OutOfMemoryError`s, we
tune the default generator sizes. The following environment variables may
be set to adjust the size of generated collections in the `TypedDataset` suite:
| Property | Default |
|-----------------------------|--------:|
| FRAMELESS_GEN_MIN_SIZE | 0 |
| FRAMELESS_GEN_SIZE_RANGE | 20 |
## License
Code is provided under the Apache 2.0 license available at <http://opensource.org/licenses/Apache-2.0>,
as well as in the LICENSE file. This is the same license that Spark uses.
[g8]: http://www.foundweekends.org/giter8/
================================================
FILE: build.sbt
================================================
// Spark versions: `sparkVersion` is used by the default projects, while the
// `-spark34` / `-spark33` project variants below use the two older versions.
val sparkVersion = "3.5.8"
val spark34Version = "3.4.4"
val spark33Version = "3.3.4"
// Versions of the library and test dependencies referenced by the settings below.
val catsCoreVersion = "2.13.0"
val catsEffectVersion = "3.7.0"
val catsMtlVersion = "1.6.0"
val scalatest = "3.2.20"
val scalatestplus = "3.1.0.0-RC2"
val shapeless = "2.3.13"
val scalacheck = "1.19.0"
val scalacheckEffect = "2.1.0"
val refinedVersion = "0.11.3"
val nakedFSVersion = "0.1.0"
// Scala versions the build is cross-compiled against.
val Scala212 = "2.12.20"
val Scala213 = "2.13.18"
ThisBuild / tlBaseVersion := "0.16"
ThisBuild / crossScalaVersions := Seq(Scala213, Scala212)
// Scala 2.12 is the default Scala version for the build.
ThisBuild / scalaVersion := Scala212
ThisBuild / coverageScalacPluginVersion := "2.3.0"
// Top-level aggregate project located at the repository root. It is not published and
// has cross-building disabled (crossScalaVersions := Nil); it only aggregates the
// per-Spark-version root projects and the docs project.
lazy val root = project
.in(file("."))
.enablePlugins(NoPublishPlugin)
.settings(crossScalaVersions := Nil)
.aggregate(
`root-spark35`,
`root-spark34`,
`root-spark33`,
docs
)
// Unpublished aggregate of the default modules (the ones built against `sparkVersion`).
lazy val `root-spark35` = project
.in(file(".spark35"))
.enablePlugins(NoPublishPlugin)
.aggregate(core, cats, dataset, refined, ml)
// Unpublished aggregate of `core` plus the `-spark34` variants of every other module.
lazy val `root-spark34` = project
.in(file(".spark34"))
.enablePlugins(NoPublishPlugin)
.aggregate(
core,
`cats-spark34`,
`dataset-spark34`,
`refined-spark34`,
`ml-spark34`
)
// Unpublished aggregate of `core` plus the `-spark33` variants of every other module.
lazy val `root-spark33` = project
.in(file(".spark33"))
.enablePlugins(NoPublishPlugin)
.aggregate(
core,
`cats-spark33`,
`dataset-spark33`,
`refined-spark33`,
`ml-spark33`
)
// `frameless-core`: uses only the common Frameless settings and is shared by all
// Spark-version variants (every root aggregate above includes it).
lazy val core =
project.settings(name := "frameless-core").settings(framelessSettings)
// `frameless-cats` for the default Spark version; depends on `dataset` in the
// compile, test and provided configurations.
lazy val cats = project
.settings(name := "frameless-cats")
.settings(catsSettings)
.dependsOn(dataset % "test->test;compile->compile;provided->provided")
// Spark 3.4 variant of `cats`: reuses the source directory of `cats`, adds
// `spark34Settings` and depends on `dataset-spark34` instead of `dataset`.
lazy val `cats-spark34` = project
.settings(name := "frameless-cats-spark34")
.settings(sourceDirectory := (cats / sourceDirectory).value)
.settings(catsSettings)
.settings(spark34Settings)
.dependsOn(
`dataset-spark34` % "test->test;compile->compile;provided->provided"
)
// Spark 3.3 variant of `cats`: reuses the source directory of `cats`, adds
// `spark33Settings` and depends on `dataset-spark33` instead of `dataset`.
lazy val `cats-spark33` = project
.settings(name := "frameless-cats-spark33")
.settings(sourceDirectory := (cats / sourceDirectory).value)
.settings(catsSettings)
.settings(spark33Settings)
.dependsOn(
`dataset-spark33` % "test->test;compile->compile;provided->provided"
)
// `frameless-dataset` for the default Spark version. Besides the regular source
// directories it compiles `src/main/spark-3.4+` and tests `src/test/spark-3.3+`.
lazy val dataset = project
.settings(name := "frameless-dataset")
.settings(
Compile / unmanagedSourceDirectories += baseDirectory.value / "src" / "main" / "spark-3.4+"
)
.settings(
Test / unmanagedSourceDirectories += baseDirectory.value / "src" / "test" / "spark-3.3+"
)
.settings(datasetSettings)
.settings(sparkDependencies(sparkVersion))
.dependsOn(core % "test->test;compile->compile")
// Spark 3.4 variant of `dataset`: reuses the sources of `dataset` (including its
// `spark-3.4+` main and `spark-3.3+` test directories) but resolves Spark at
// `spark34Version`.
lazy val `dataset-spark34` = project
.settings(name := "frameless-dataset-spark34")
.settings(sourceDirectory := (dataset / sourceDirectory).value)
.settings(
Compile / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "main" / "spark-3.4+"
)
.settings(
Test / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "test" / "spark-3.3+"
)
.settings(datasetSettings)
.settings(sparkDependencies(spark34Version))
.settings(spark34Settings)
.dependsOn(core % "test->test;compile->compile")
// Spark 3.3 variant of `dataset`: reuses the sources of `dataset`, but compiles the
// `src/main/spark-3` directory instead of `spark-3.4+` and resolves Spark at
// `spark33Version`. The `spark-3.3+` test directory is still included.
lazy val `dataset-spark33` = project
.settings(name := "frameless-dataset-spark33")
.settings(sourceDirectory := (dataset / sourceDirectory).value)
.settings(
Compile / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "main" / "spark-3"
)
.settings(
Test / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "test" / "spark-3.3+"
)
.settings(datasetSettings)
.settings(sparkDependencies(spark33Version))
.settings(spark33Settings)
.dependsOn(core % "test->test;compile->compile")
// `frameless-refined` for the default Spark version; depends on `dataset` in the
// compile, test and provided configurations.
lazy val refined = project
.settings(name := "frameless-refined")
.settings(refinedSettings)
.dependsOn(dataset % "test->test;compile->compile;provided->provided")
// Spark 3.4 variant of `refined`: reuses the source directory of `refined`, adds
// `spark34Settings` and depends on `dataset-spark34`.
lazy val `refined-spark34` = project
.settings(name := "frameless-refined-spark34")
.settings(sourceDirectory := (refined / sourceDirectory).value)
.settings(refinedSettings)
.settings(spark34Settings)
.dependsOn(
`dataset-spark34` % "test->test;compile->compile;provided->provided"
)
// Spark 3.3 variant of `refined`: reuses the source directory of `refined`, adds
// `spark33Settings` and depends on `dataset-spark33`.
lazy val `refined-spark33` = project
.settings(name := "frameless-refined-spark33")
.settings(sourceDirectory := (refined / sourceDirectory).value)
.settings(refinedSettings)
.settings(spark33Settings)
.dependsOn(
`dataset-spark33` % "test->test;compile->compile;provided->provided"
)
// `frameless-ml` for the default Spark version: adds spark-mllib at `sparkVersion`
// and depends on both `core` and `dataset`.
lazy val ml = project
.settings(name := "frameless-ml")
.settings(mlSettings)
.settings(sparkMlDependencies(sparkVersion))
.dependsOn(
core % "test->test;compile->compile",
dataset % "test->test;compile->compile;provided->provided"
)
// Spark 3.4 variant of `ml`: reuses the source directory of `ml`, resolves
// spark-mllib at `spark34Version` and depends on `dataset-spark34`.
lazy val `ml-spark34` = project
.settings(name := "frameless-ml-spark34")
.settings(sourceDirectory := (ml / sourceDirectory).value)
.settings(mlSettings)
.settings(sparkMlDependencies(spark34Version))
.settings(spark34Settings)
.dependsOn(
core % "test->test;compile->compile",
`dataset-spark34` % "test->test;compile->compile;provided->provided"
)
// Spark 3.3 variant of `ml`: reuses the source directory of `ml`, resolves
// spark-mllib at `spark33Version` and depends on `dataset-spark33`.
lazy val `ml-spark33` = project
.settings(name := "frameless-ml-spark33")
.settings(sourceDirectory := (ml / sourceDirectory).value)
.settings(mlSettings)
.settings(sparkMlDependencies(spark33Version))
.settings(spark33Settings)
.dependsOn(
core % "test->test;compile->compile",
`dataset-spark33` % "test->test;compile->compile;provided->provided"
)
// Documentation site project (sources under `mdocs`), built with TypelevelSitePlugin.
// Spark is added in the Compile scope here (the library modules use the default
// Provided scope), and fatal warnings / unused-import warnings are switched off.
lazy val docs = project
.in(file("mdocs"))
.settings(framelessSettings)
// Must come after framelessSettings, which is where these flags get added.
.settings(scalacOptions --= Seq("-Xfatal-warnings", "-Ywarn-unused-import"))
.enablePlugins(TypelevelSitePlugin)
.settings(sparkDependencies(sparkVersion, Compile))
.settings(sparkMlDependencies(sparkVersion, Compile))
.settings(
addCompilerPlugin(
"org.typelevel" % "kind-projector" % "0.13.4" cross CrossVersion.full
),
scalacOptions += "-Ydelambdafy:inline",
libraryDependencies += "org.typelevel" %% "mouse" % "1.3.2"
)
.dependsOn(dataset, cats, ml)
/**
 * Settings that add the `spark-core` and `spark-sql` modules to `libraryDependencies`.
 *
 * @param sparkVersion Spark version to resolve
 * @param scope        configuration the modules are added in (defaults to `Provided`)
 */
def sparkDependencies(
    sparkVersion: String,
    scope: Configuration = Provided
) = {
  val sparkModules = Seq("spark-core", "spark-sql")
  Seq(
    libraryDependencies ++= sparkModules.map { artifact =>
      "org.apache.spark" %% artifact % sparkVersion % scope
    }
  )
}
/**
 * Settings that add the `spark-mllib` module to `libraryDependencies`.
 *
 * @param sparkVersion Spark version to resolve
 * @param scope        configuration the module is added in (defaults to `Provided`)
 */
def sparkMlDependencies(sparkVersion: String, scope: Configuration = Provided) = {
  val mllib = "org.apache.spark" %% "spark-mllib" % sparkVersion % scope
  Seq(libraryDependencies += mllib)
}
// Settings for the `cats` projects: the common Frameless settings, the kind-projector
// compiler plugin and the cats / cats-effect / cats-mtl / alleycats dependencies
// (scalacheck-effect is test-only).
lazy val catsSettings = framelessSettings ++ Seq(
addCompilerPlugin(
"org.typelevel" % "kind-projector" % "0.13.4" cross CrossVersion.full
),
libraryDependencies ++= Seq(
"org.typelevel" %% "cats-core" % catsCoreVersion,
"org.typelevel" %% "cats-effect" % catsEffectVersion,
"org.typelevel" %% "cats-mtl" % catsMtlVersion,
// alleycats-core is versioned together with cats-core.
"org.typelevel" %% "alleycats-core" % catsCoreVersion,
"org.typelevel" %% "scalacheck-effect" % scalacheckEffect % Test
)
)
// Settings for the `dataset` projects: the common Frameless settings, the REPL setup,
// MiMa binary-compatibility exclusions, a coverage exclusion and a test-only
// local-filesystem dependency.
lazy val datasetSettings =
framelessSettings ++ framelessTypedDatasetREPL ++ Seq(
mimaBinaryIssueFilters ++= {
import com.typesafe.tools.mima.core._
// Shorthand builders for the three kinds of MiMa problem filters used below.
val imt = ProblemFilters.exclude[IncompatibleMethTypeProblem](_)
val mc = ProblemFilters.exclude[MissingClassProblem](_)
val dmm = ProblemFilters.exclude[DirectMissingMethodProblem](_)
// TODO: Remove after version bump
Seq(
imt("frameless.TypedEncoder.mapEncoder"),
imt("frameless.TypedEncoder.arrayEncoder"),
imt("frameless.RecordEncoderFields.deriveRecordCons"),
imt("frameless.RecordEncoderFields.deriveRecordLast"),
mc("frameless.functions.FramelessLit"),
// In an f-interpolated string `$$` renders a single literal `$`.
mc(f"frameless.functions.FramelessLit$$"),
dmm("frameless.functions.package.litAggr"),
dmm("org.apache.spark.sql.FramelessInternals.column")
)
},
coverageExcludedPackages := "org.apache.spark.sql.reflection",
// NOTE(review): Hadoop's artifact is usually named `hadoop-common`; confirm that
// excluding `hadoop-commons` has the intended effect.
libraryDependencies += "com.globalmentor" % "hadoop-bare-naked-local-fs" % nakedFSVersion % Test exclude (
"org.apache.hadoop",
"hadoop-commons"
)
)
// Settings for the `refined` projects: the common Frameless settings, the REPL setup
// and the refined library itself.
lazy val refinedSettings = {
  val refinedDependency =
    libraryDependencies += "eu.timepit" %% "refined" % refinedVersion

  framelessSettings ++ framelessTypedDatasetREPL ++ Seq(refinedDependency)
}
// Settings for the `ml` projects: the common Frameless settings plus the REPL setup.
lazy val mlSettings = framelessSettings ++ framelessTypedDatasetREPL
// Compiler flags used for Scala 2.12; they also serve as the base list from which the
// 2.13 flags are derived (see scalac213Options).
lazy val scalac212Options = Seq(
"-Xlint:-missing-interpolator,-unused,_",
"-target:jvm-1.8",
"-deprecation",
// "-encoding" takes the next element ("UTF-8") as its argument.
"-encoding",
"UTF-8",
"-feature",
"-unchecked",
"-Xfatal-warnings",
"-Yno-adapted-args",
"-Ywarn-dead-code",
"-Ywarn-numeric-widen",
"-Ywarn-unused-import",
"-Ywarn-value-discard",
"-language:existentials",
"-language:implicitConversions",
"-language:higherKinds",
"-Xfuture",
"-Ypartial-unification"
)
// Compiler flags used for Scala 2.13: the 2.12 list minus the flags below, plus one
// extra lint setting.
lazy val scalac213Options = {
  val droppedOn213 = Set(
    "-Yno-adapted-args",
    "-Ywarn-unused-import",
    "-Xfuture",
    // Not fatal on 2.13 because of deprecation warnings such as: type TraversableOnce
    // in package scala is deprecated, symbol literal is deprecated; use Symbol("a") instead
    "-Xfatal-warnings",
    "-Ypartial-unification"
  )

  // https://github.com/scala/bug/issues/12072
  val addedOn213 = Seq("-Xlint:-byname-implicit")

  scalac212Options.filterNot(droppedOn213) ++ addedOn213
}
// Picks the compiler flag list matching the Scala version being built: the 2.13 list
// for Scala 2.13, the 2.12 list for everything else.
lazy val scalacOptionSettings = Def.setting {
  CrossVersion.partialVersion(scalaVersion.value) match {
    case Some((2, 13)) => scalac213Options
    case _             => scalac212Options
  }
}
// Settings shared by every Frameless module: compiler flags, the shapeless and test
// dependencies, forked test JVM options, MiMa tweaks and the console settings.
lazy val framelessSettings = Seq(
scalacOptions ++= scalacOptionSettings.value,
Test / testOptions += Tests.Argument(TestFrameworks.ScalaTest, "-oDF"),
libraryDependencies ++= Seq(
"com.chuusai" %% "shapeless" % shapeless,
"org.scalatest" %% "scalatest" % scalatest % Test,
"org.scalatestplus" %% "scalatestplus-scalacheck" % scalatestplus % Test,
"org.scalacheck" %% "scalacheck" % scalacheck % Test
),
// JVM options for the forked test JVM (see `Test / fork := true` below).
Test / javaOptions ++= {
val baseOptions = Seq("-Xmx1G", "-ea")
// On Java 17 and later, open the listed java.base packages to unnamed modules.
val java17Options =
if (sys.props("java.specification.version").toDouble >= 17.0) {
Seq(
"--add-opens=java.base/java.lang=ALL-UNNAMED",
"--add-opens=java.base/java.lang.invoke=ALL-UNNAMED",
"--add-opens=java.base/java.lang.reflect=ALL-UNNAMED",
"--add-opens=java.base/java.io=ALL-UNNAMED",
"--add-opens=java.base/java.net=ALL-UNNAMED",
"--add-opens=java.base/java.nio=ALL-UNNAMED",
"--add-opens=java.base/java.util=ALL-UNNAMED",
"--add-opens=java.base/java.util.concurrent=ALL-UNNAMED",
"--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED",
"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",
"--add-opens=java.base/sun.nio.cs=ALL-UNNAMED",
"--add-opens=java.base/sun.security.action=ALL-UNNAMED",
"--add-opens=java.base/sun.util.calendar=ALL-UNNAMED"
)
} else Seq.empty
baseOptions ++ java17Options
},
// Tests run in a forked JVM, one suite at a time.
Test / fork := true,
Test / parallelExecution := false,
mimaPreviousArtifacts ~= {
_.filterNot(_.revision == "0.11.0") // didn't release properly
},
/**
* The old Scala XML is pulled from Scala 2.12.x.
*
* [error] (update) found version conflict(s) in library dependencies; some are suspected to be binary incompatible:
* [error]
* [error] * org.scala-lang.modules:scala-xml_2.12:2.3.0 (early-semver) is selected over 1.0.6
* [error] +- org.scoverage:scalac-scoverage-reporter_2.12:2.0.7 (depends on 2.4.0)
* [error] +- org.scala-lang:scala-compiler:2.12.16 (depends on 1.0.6)
*/
// Declaring the scheme as `Always` is what silences the conflict shown above.
libraryDependencySchemes += "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always
) ++ consoleSettings
// Extra settings for the `-spark34` project variants: the version in which they were
// introduced and the artifact MiMa compares them against.
lazy val spark34Settings = Seq[Setting[_]](
  tlVersionIntroduced := Map("2.12" -> "0.14.1", "2.13" -> "0.14.1"),
  mimaPreviousArtifacts := {
    // Module name with its last dash-separated segment removed.
    val baseModuleName = moduleName.value.split("-").dropRight(1).mkString("-")
    Set((organization.value %% baseModuleName) % "0.14.1")
  }
)
// Extra settings for the `-spark33` project variants: the version in which they were
// introduced and the artifact MiMa compares them against.
lazy val spark33Settings = Seq[Setting[_]](
  tlVersionIntroduced := Map("2.12" -> "0.13.0", "2.13" -> "0.13.0"),
  mimaPreviousArtifacts := {
    // Module name with its last dash-separated segment removed.
    val baseModuleName = moduleName.value.split("-").dropRight(1).mkString("-")
    Set((organization.value %% baseModuleName) % "0.14.0")
  }
)
// Console settings: drop the unused-import warning flag from the compile console's
// compiler options and make the test console use the same options.
lazy val consoleSettings = Seq(
  Compile / console / scalacOptions ~= { options =>
    options.filterNot(_ == "-Ywarn-unused-import")
  },
  Test / console / scalacOptions := (Compile / console / scalacOptions).value
)
// `sbt console` setup for the dataset-based modules: enables REPL colors, starts a
// local SparkSession with common imports, and stops the session when the console exits.
lazy val framelessTypedDatasetREPL = Seq(
initialize ~= { _ => // Color REPL
// Colors stay enabled unless the `sbt.log.noformat` system property is "true".
val ansi = System.getProperty("sbt.log.noformat", "false") != "true"
if (ansi) System.setProperty("scala.color", "true")
},
// Commands run when the console starts.
console / initialCommands :=
"""
|import org.apache.spark.{SparkConf, SparkContext}
|import org.apache.spark.sql.SparkSession
|import frameless.functions.aggregate._
|import frameless.syntax._
|
|val conf = new SparkConf().setMaster("local[*]").setAppName("frameless repl").set("spark.ui.enabled", "false")
|implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate()
|
|import spark.implicits._
|
|spark.sparkContext.setLogLevel("WARN")
|
|import frameless.TypedDataset
""".stripMargin,
// Commands run when the console exits.
console / cleanupCommands :=
"""
|spark.stop()
""".stripMargin
)
// Publishing metadata.
ThisBuild / organization := "org.typelevel"
ThisBuild / licenses := List(
"Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0")
)
// (GitHub username, full name) pairs turned into developer entries.
ThisBuild / developers := List(
"OlivierBlanvillain" -> "Olivier Blanvillain",
"adelbertc" -> "Adelbert Chang",
"imarios" -> "Marios Iliofotou",
"kanterov" -> "Gleb Kanterov",
"non" -> "Erik Osheim",
"jeremyrsmith" -> "Jeremy Smith",
"cchantep" -> "Cédric Chantepie",
"pomadchin" -> "Grigory Pomadchin"
).map {
case (username, fullName) =>
tlGitHubDev(username, fullName)
}
ThisBuild / tlCiReleaseBranches := Seq("master")
ThisBuild / tlSitePublishBranch := Some("master")
// CI matrix: one "project" axis entry per Spark root project.
val roots = List("root-spark33", "root-spark34", "root-spark35")
ThisBuild / githubWorkflowBuildMatrixAdditions += "project" -> roots
// `roots.init` is every root except the last one, so the Scala 2.13 jobs are excluded
// for root-spark33 and root-spark34. This relies on root-spark35 staying last in `roots`.
ThisBuild / githubWorkflowBuildMatrixExclusions ++= roots.init.map { project =>
MatrixExclude(Map("scala" -> "2.13", "project" -> project))
}
ThisBuild / githubWorkflowEnv += "SBT_OPTS" -> "-Xms1g -Xmx4g"
================================================
FILE: cats/src/main/scala/frameless/cats/FramelessSyntax.scala
================================================
package frameless
package cats
import _root_.cats.effect.Sync
import _root_.cats.syntax.all._
import _root_.cats.mtl.Ask
import org.apache.spark.sql.SparkSession
/** Extends the base Frameless syntax with helpers for effectful Spark jobs. */
trait FramelessSyntax extends frameless.FramelessSyntax {

  /**
   * Adds Spark local-property helpers to any `F[A]` for which a `Sync[F]` and an
   * `Ask[F, SparkSession]` are available.
   */
  implicit class SparkJobOps[F[_], A](fa: F[A])(implicit S: Sync[F], A: Ask[F, SparkSession]) {
    import S._, A._

    /**
     * Asks for the session, sets the local property `key` to `value` on its
     * SparkContext, then runs `fa` and returns its result.
     */
    def withLocalProperty(key: String, value: String): F[A] =
      ask[SparkSession].flatMap { session =>
        delay(session.sparkContext.setLocalProperty(key, value)).flatMap(_ => fa)
      }

    /** Runs `fa` after setting the `spark.jobGroup.id` local property to `groupId`. */
    def withGroupId(groupId: String): F[A] =
      withLocalProperty("spark.jobGroup.id", groupId)

    /** Runs `fa` after setting the `spark.job.description` local property to `description`. */
    def withDescription(description: String): F[A] =
      withLocalProperty("spark.job.description", description)
  }
}
================================================
FILE: cats/src/main/scala/frameless/cats/SparkDelayInstances.scala
================================================
package frameless
package cats
import _root_.cats.effect.Sync
import org.apache.spark.sql.SparkSession
/** Provides a `SparkDelay` instance for every effect type that has a cats-effect `Sync`. */
trait SparkDelayInstances {

  /** `SparkDelay[F]` that suspends the given computation with `Sync[F].delay`. */
  implicit def framelessCatsSparkDelayForSync[F[_]](implicit S: Sync[F]): SparkDelay[F] =
    new SparkDelay[F] {
      // The implicit session is part of the signature but is not used here.
      def delay[A](a: => A)(implicit spark: SparkSession): F[A] =
        S.delay(a)
    }
}
================================================
FILE: cats/src/main/scala/frameless/cats/SparkTask.scala
================================================
package frameless
package cats
import _root_.cats.Id
import _root_.cats.data.Kleisli
import org.apache.spark.SparkContext
/** Constructors for `SparkTask`, a `Kleisli[Id, SparkContext, A]`. */
object SparkTask {

  /** Wraps a function of the `SparkContext` as a task. */
  def apply[A](f: SparkContext => A): SparkTask[A] =
    Kleisli[Id, SparkContext, A](f)

  /** Task that ignores the `SparkContext`; the by-name `a` is evaluated when the task runs. */
  def pure[A](a: => A): SparkTask[A] =
    apply(_ => a)
}
================================================
FILE: cats/src/main/scala/frameless/cats/implicits.scala
================================================
package frameless
package cats
import _root_.cats._
import _root_.cats.kernel.{CommutativeMonoid, CommutativeSemigroup}
import _root_.cats.syntax.all._
import alleycats.Empty
import scala.reflect.ClassTag
import org.apache.spark.rdd.RDD
/** Syntax for Spark RDDs driven by cats type classes, plus the Frameless cats syntax and instances. */
object implicits extends FramelessSyntax with SparkDelayInstances {

  /** Aggregations over the elements of an `RDD[A]`. */
  implicit class rddOps[A: ClassTag](lhs: RDD[A]) {

    /** Combines all elements with the monoid, starting from `m.empty`. */
    def csum(implicit m: CommutativeMonoid[A]): A =
      lhs.fold(m.empty)((x, y) => x |+| y)

    /** Combines all elements with the semigroup; `None` when there are no elements. */
    def csumOption(implicit m: CommutativeSemigroup[A]): Option[A] =
      lhs.aggregate[Option[A]](None)(
        {
          case (Some(acc), a) => Some(acc |+| a)
          case (None, a)      => Some(a)
        },
        {
          case (Some(x), Some(y)) => Some(y |+| x)
          case (Some(x), None)    => Some(x)
          case (None, r)          => r
        }
      )

    /** Smallest element, or `e.empty` when the RDD is empty. */
    def cmin(implicit o: Order[A], e: Empty[A]): A =
      if (lhs.isEmpty()) e.empty
      else lhs.reduce((x, y) => x min y)

    /** Smallest element, or `None` when the RDD is empty. */
    def cminOption(implicit o: Order[A]): Option[A] = {
      val minSemigroup = new CommutativeSemigroup[A] {
        def combine(l: A, r: A) = l min r
      }
      csumOption(minSemigroup)
    }

    /** Largest element, or `e.empty` when the RDD is empty. */
    def cmax(implicit o: Order[A], e: Empty[A]): A =
      if (lhs.isEmpty()) e.empty
      else lhs.reduce((x, y) => x max y)

    /** Largest element, or `None` when the RDD is empty. */
    def cmaxOption(implicit o: Order[A]): Option[A] = {
      val maxSemigroup = new CommutativeSemigroup[A] {
        def combine(l: A, r: A) = l max r
      }
      csumOption(maxSemigroup)
    }
  }

  /** Per-key aggregations over an `RDD[(K, V)]`. */
  implicit class pairRddOps[K: ClassTag, V: ClassTag](lhs: RDD[(K, V)]) {

    /** Combines the values of each key with the semigroup. */
    def csumByKey(implicit m: CommutativeSemigroup[V]): RDD[(K, V)] =
      lhs.reduceByKey((x, y) => x |+| y)

    /** Keeps the smallest value of each key. */
    def cminByKey(implicit o: Order[V]): RDD[(K, V)] =
      lhs.reduceByKey((x, y) => x min y)

    /** Keeps the largest value of each key. */
    def cmaxByKey(implicit o: Order[V]): RDD[(K, V)] =
      lhs.reduceByKey((x, y) => x max y)
  }
}
/** Semigroup for RDDs whose combine operation is `RDD.union`. */
object union {

  /** Combines two RDDs by taking their union. */
  implicit def unionSemigroup[A]: Semigroup[RDD[A]] =
    new Semigroup[RDD[A]] {
      def combine(lhs: RDD[A], rhs: RDD[A]): RDD[A] =
        lhs.union(rhs)
    }
}
/** Semigroup for pair RDDs based on an inner join. */
object inner {

  /** Joins the two RDDs by key and combines the paired values with `Semigroup[V]`. */
  implicit def pairwiseInnerSemigroup[K: ClassTag, V: ClassTag: Semigroup]: Semigroup[RDD[(K, V)]] =
    new Semigroup[RDD[(K, V)]] {
      def combine(lhs: RDD[(K, V)], rhs: RDD[(K, V)]): RDD[(K, V)] = {
        val joined = lhs.join(rhs)
        joined.mapValues(pair => pair._1 |+| pair._2)
      }
    }
}
/** Semigroup for pair RDDs based on a full outer join. */
object outer {

  /**
   * Full-outer-joins the two RDDs by key. Values present on both sides are combined
   * with the monoid; a value present on one side only is kept as is.
   */
  implicit def pairwiseOuterSemigroup[K: ClassTag, V: ClassTag](implicit m: Monoid[V]): Semigroup[RDD[(K, V)]] =
    new Semigroup[RDD[(K, V)]] {
      def combine(lhs: RDD[(K, V)], rhs: RDD[(K, V)]): RDD[(K, V)] = {
        val joined = lhs.fullOuterJoin(rhs)
        joined.mapValues {
          case (Some(left), Some(right)) => left |+| right
          case (Some(left), None)        => left
          case (None, Some(right))       => right
          // Keeps the match total; falls back to the monoid's empty value.
          case (None, None)              => m.empty
        }
      }
    }
}
================================================
FILE: cats/src/main/scala/frameless/cats/package.scala
================================================
package frameless
import _root_.cats.Id
import _root_.cats.data.Kleisli
import org.apache.spark.SparkContext
package object cats {
/** A computation that reads a `SparkContext` and produces an `A` (a `Kleisli` over `Id`). */
type SparkTask[A] = Kleisli[Id, SparkContext, A]
}
================================================
FILE: cats/src/test/resources/log4j.properties
================================================
log4j.logger.akka.event.slf4j.Slf4jLogger=ERROR
log4j.logger.akka.event.slf4j=ERROR
log4j.logger.akka.remote.EndpointWriter=ERROR
log4j.logger.akka.remote.RemoteActorRefProvider$RemotingTerminator=ERROR
log4j.logger.com.anjuke.dm=ERROR
log4j.logger.io.netty.bootstrap.ServerBootstrap=ERROR
log4j.logger.io.netty.buffer.ByteBufUtil=ERROR
log4j.logger.io.netty.buffer.PooledByteBufAllocator=ERROR
log4j.logger.io.netty.channel.AbstractChannel=ERROR
log4j.logger.io.netty.channel.ChannelInitializer=ERROR
log4j.logger.io.netty.channel.ChannelOutboundBuffer=ERROR
log4j.logger.io.netty.channel.DefaultChannelPipeline=ERROR
log4j.logger.io.netty.channel.MultithreadEventLoopGroup=ERROR
log4j.logger.io.netty.channel.nio.AbstractNioChannel=ERROR
log4j.logger.io.netty.channel.nio.NioEventLoop=ERROR
log4j.logger.io.netty.channel.socket.nio.NioServerSocketChannel=ERROR
log4j.logger.io.netty.util.concurrent.DefaultPromise.rejectedExecution=ERROR
log4j.logger.io.netty.util.concurrent.DefaultPromise=ERROR
log4j.logger.io.netty.util.concurrent.GlobalEventExecutor=ERROR
log4j.logger.io.netty.util.concurrent.SingleThreadEventExecutor=ERROR
log4j.logger.io.netty.util.internal.logging.InternalLoggerFactory=ERROR
log4j.logger.io.netty.util.internal.PlatformDependent0=ERROR
log4j.logger.io.netty.util.internal.PlatformDependent=ERROR
log4j.logger.io.netty.util.internal.SystemPropertyUtil=ERROR
log4j.logger.io.netty.util.internal.ThreadLocalRandom=ERROR
log4j.logger.io.netty.util.NetUtil=ERROR
log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=ERROR
log4j.logger.org.apache.hadoop.conf.Configuration=ERROR
log4j.logger.org.apache.hadoop.fs.FileSystem=ERROR
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=ERROR
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
log4j.logger.org.apache.hadoop.mapred.JobConf=ERROR
log4j.logger.org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedPartitioner=ERROR
log4j.logger.org.apache.hadoop.metrics2.impl.MetricsSystemImpl=ERROR
log4j.logger.org.apache.hadoop.metrics2.lib.Interns=ERROR
log4j.logger.org.apache.hadoop.metrics2.lib.MetricsSourceBuilder=ERROR
log4j.logger.org.apache.hadoop.metrics2.lib.MutableMetricsFactory=ERROR
log4j.logger.org.apache.hadoop.security.authentication.util.KerberosName=ERROR
log4j.logger.org.apache.hadoop.security.Groups=ERROR
log4j.logger.org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback=ERROR
log4j.logger.org.apache.hadoop.security.SecurityUtil=ERROR
log4j.logger.org.apache.hadoop.security.ShellBasedUnixGroupsMapping=ERROR
log4j.logger.org.apache.hadoop.security.UserGroupInformation=ERROR
log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
log4j.logger.org.apache.hadoop.util.ShutdownHookManager=ERROR
log4j.logger.org.apache.spark.broadcast.TorrentBroadcast=ERROR
log4j.logger.org.apache.spark.ContextCleaner=ERROR
log4j.logger.org.apache.spark.executor.Executor=ERROR
log4j.logger.org.apache.spark.HeartbeatReceiver=ERROR
log4j.logger.org.apache.spark.HttpFileServer=ERROR
log4j.logger.org.apache.spark.HttpServer=ERROR
log4j.logger.org.apache.spark.MapOutputTrackerMaster=ERROR
log4j.logger.org.apache.spark.MapOutputTrackerMasterEndpoint=ERROR
log4j.logger.org.apache.spark.metrics.MetricsSystem=ERROR
log4j.logger.org.apache.spark.network.client.TransportClientFactory=ERROR
log4j.logger.org.apache.spark.network.netty.NettyBlockTransferService=ERROR
log4j.logger.org.apache.spark.network.protocol.MessageDecoder=ERROR
log4j.logger.org.apache.spark.network.protocol.MessageEncoder=ERROR
log4j.logger.org.apache.spark.network.server.OneForOneStreamManager=ERROR
log4j.logger.org.apache.spark.network.server.TransportServer=ERROR
log4j.logger.org.apache.spark.network.TransportContext=ERROR
log4j.logger.org.apache.spark.network.util.JavaUtils=ERROR
log4j.logger.org.apache.spark.rdd.CoGroupedRDD=ERROR
log4j.logger.org.apache.spark.rdd.SubtractedRDD=ERROR
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR
log4j.logger.org.apache.spark.rpc.akka.AkkaRpcEnv$$anonfun$actorRef$lzycompute$1$1$$anon$1=ERROR
log4j.logger.org.apache.spark.scheduler.DAGScheduler=ERROR
log4j.logger.org.apache.spark.scheduler.OutputCommitCoordinator$OutputCommitCoordinatorEndpoint=ERROR
log4j.logger.org.apache.spark.scheduler.TaskSchedulerImpl=ERROR
log4j.logger.org.apache.spark.scheduler.TaskSetManager=ERROR
log4j.logger.org.apache.spark.SecurityManager=ERROR
log4j.logger.org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter=ERROR
log4j.logger.org.apache.spark.SparkContext=ERROR
log4j.logger.org.apache.spark.SparkEnv=ERROR
log4j.logger.org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.package$ExpressionCanonicalizer=ERROR
log4j.logger.org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer=ERROR
log4j.logger.org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys=ERROR
log4j.logger.org.apache.spark.sql.execution.aggregate.SortBasedAggregate=ERROR
log4j.logger.org.apache.spark.sql.execution.aggregate.TungstenAggregate=ERROR
log4j.logger.org.apache.spark.sql.execution.Exchange=ERROR
log4j.logger.org.apache.spark.sql.execution.joins.ShuffledHashOuterJoin=ERROR
log4j.logger.org.apache.spark.sql.SQLContext$$anon$1=ERROR
log4j.logger.org.apache.spark.sql.SQLContext$$anon$2=ERROR
log4j.logger.org.apache.spark.SSLOptions=ERROR
log4j.logger.org.apache.spark.storage.BlockManager=ERROR
log4j.logger.org.apache.spark.storage.BlockManagerInfo=ERROR
log4j.logger.org.apache.spark.storage.BlockManagerMaster=ERROR
log4j.logger.org.apache.spark.storage.BlockManagerMasterEndpoint=ERROR
log4j.logger.org.apache.spark.storage.BlockManagerSlaveEndpoint=ERROR
log4j.logger.org.apache.spark.storage.DiskBlockManager=ERROR
log4j.logger.org.apache.spark.storage.MemoryStore=ERROR
log4j.logger.org.apache.spark.storage.ShuffleBlockFetcherIterator=ERROR
log4j.logger.org.apache.spark.ui.SparkUI=ERROR
log4j.logger.org.apache.spark.unsafe.map.BytesToBytesMap=ERROR
log4j.logger.org.apache.spark.unsafe.memory.TaskMemoryManager=ERROR
log4j.logger.org.apache.spark.util.AkkaUtils=ERROR
log4j.logger.org.apache.spark.util.ClosureCleaner=ERROR
log4j.logger.org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter=ERROR
log4j.logger.org.apache.spark.util.Utils=ERROR
log4j.logger.org.apache.spark=ERROR
log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.eclipse.jetty=ERROR
log4j.logger.org.spark-project.jetty.http.AbstractGenerator=ERROR
log4j.logger.org.spark-project.jetty.http.HttpGenerator=ERROR
log4j.logger.org.spark-project.jetty.http.MimeTypes=ERROR
log4j.logger.org.spark-project.jetty.io.AbstractBuffer=ERROR
log4j.logger.org.spark-project.jetty.io.nio=ERROR
log4j.logger.org.spark-project.jetty.server.AbstractConnector=ERROR
log4j.logger.org.spark-project.jetty.server.bio.SocketConnector=ERROR
log4j.logger.org.spark-project.jetty.server.handler.AbstractHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.ContextHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.ContextHandlerCollection=ERROR
log4j.logger.org.spark-project.jetty.server.handler.DefaultHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.ErrorHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.GzipHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.ResourceHandler=ERROR
log4j.logger.org.spark-project.jetty.server.Server=ERROR
log4j.logger.org.spark-project.jetty.server=ERROR
log4j.logger.org.spark-project.jetty.servlet.DefaultServlet=ERROR
log4j.logger.org.spark-project.jetty.servlet.Holder=ERROR
log4j.logger.org.spark-project.jetty.servlet.ServletHandler=ERROR
log4j.logger.org.spark-project.jetty.servlet.ServletHolder=ERROR
log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.spark-project.jetty.util.component.AggregateLifeCycle=ERROR
log4j.logger.org.spark-project.jetty.util.component.Container=ERROR
log4j.logger.org.spark-project.jetty.util.IO=ERROR
log4j.logger.org.spark-project.jetty.util.log=ERROR
log4j.logger.org.spark-project.jetty.util.resource.FileResource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.JarFileResource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.JarResource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.Resource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.URLResource=ERROR
log4j.logger.org.spark-project.jetty.util.StringUtil=ERROR
log4j.logger.org.spark-project.jetty.util.thread.QueuedThreadPool=ERROR
log4j.logger.org.spark-project.jetty.util.thread.Timeout=ERROR
log4j.logger.org.spark-project.jetty=ERROR
log4j.logger.Remoting=ERROR
================================================
FILE: cats/src/test/resources/log4j2.properties
================================================
# Log4j 2 configuration for the tests: everything is logged to the console at
# ERROR level, except Spark and Hadoop which are logged at WARN.

# Set to debug or trace if log4j initialization is failing
status = warn
# Name of the configuration
name = ConsoleAppender
# Console appender configuration
appender.console.type = Console
appender.console.name = consoleLogger
appender.console.layout.type = PatternLayout
# Use lowercase `yyyy` (calendar year): uppercase `YYYY` is the week-based year and
# prints the wrong year for dates around New Year.
appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss} [%t] %-5p %c:%L - %m%n
appender.console.target = SYSTEM_OUT
# Root logger level
rootLogger.level = error
# Root logger referring to console appender
rootLogger.appenderRef.stdout.ref = consoleLogger
logger.spark.name = org.apache.spark
logger.spark.level = warn
logger.hadoop.name = org.apache.hadoop
logger.hadoop.level = warn
================================================
FILE: cats/src/test/scala/frameless/cats/FramelessSyntaxTests.scala
================================================
package frameless
package cats
import _root_.cats.data.ReaderT
import _root_.cats.effect.IO
import _root_.cats.effect.unsafe.implicits.global
import org.apache.spark.sql.SparkSession
import org.scalatest.matchers.should.Matchers
import org.scalacheck.{Test => PTest}
import org.scalacheck.Prop, Prop._
import org.scalacheck.effect.PropF, PropF._
/** Tests for the syntax provided by `frameless.cats.implicits`. */
class FramelessSyntaxTests extends TypedDatasetSuite with Matchers {
// NOTE(review): presumably disables the suite's default SparkDelay so that the
// instance imported from `implicits` is the one used below; confirm against
// TypedDatasetSuite.
override val sparkDelay = null
// Property: typing a Dataset directly and typing its DataFrame via `unsafeTyped`
// yield the same collected rows.
def prop[A, B](data: Vector[X2[A, B]])(
implicit ev: TypedEncoder[X2[A, B]]
): Prop = {
import implicits._
val dataset = TypedDataset.create(data).dataset
val dataframe = dataset.toDF()
val typedDataset = dataset.typed
val typedDatasetFromDataFrame = dataframe.unsafeTyped[X2[A, B]]
typedDataset.collect[IO]().unsafeRunSync().toVector ?= typedDatasetFromDataFrame.collect[IO]().unsafeRunSync().toVector
}
test("dataset typed - toTyped") {
check(forAll(prop[Int, String] _))
}
// Sets local properties through the syntax, then reads them back from the
// SparkContext once the effect has run.
test("properties can be read back") {
import implicits._
import _root_.cats.syntax.all._
forAllF { (k: String, v: String) =>
// Prefix the generated key so it lives under a test-specific name.
val scopedKey = "frameless.tests." + k
1
.pure[ReaderT[IO, SparkSession, *]]
.withLocalProperty(scopedKey, v)
.withGroupId(v)
.withDescription(v)
.run(session)
.map { _ =>
sc.getLocalProperty(scopedKey) shouldBe v
sc.getLocalProperty("spark.jobGroup.id") shouldBe v
sc.getLocalProperty("spark.job.description") shouldBe v
}.void
}.check().unsafeRunSync().status shouldBe PTest.Passed
}
}
================================================
FILE: cats/src/test/scala/frameless/cats/test.scala
================================================
package frameless
package cats
import _root_.cats.Foldable
import _root_.cats.syntax.all._
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext => SC}
import org.scalatest.compatible.Assertion
import org.scalactic.anyvals.PosInt
import org.scalacheck.Arbitrary
import org.scalatestplus.scalacheck.ScalaCheckPropertyChecks
import Arbitrary._
import scala.collection.immutable.SortedMap
import scala.reflect.ClassTag
import org.scalatest.matchers.should.Matchers
import org.scalatest.propspec.AnyPropSpec
/** Mix-in giving tests a local Spark session/context plus a `toRdd` syntax on sequences. */
trait SparkTests {

  /** Application id: the current date's string form followed by a random whole number below 10E4. */
  val appID: String = {
    val datePart = new java.util.Date().toString
    val randomPart = math.floor(math.random() * 10E4).toLong.toString
    datePart + randomPart
  }

  /** Local-mode configuration with the Spark UI switched off. */
  val conf: SparkConf =
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("test")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)

  /** Looked up through `getOrCreate()` on every access. */
  implicit def session: SparkSession =
    SparkSession.builder().config(conf).getOrCreate()

  implicit def sc: SparkContext =
    session.sparkContext

  /** Adds `toRdd` to any `Seq`, building the RDD with the implicit context. */
  implicit class seqToRdd[A: ClassTag](seq: Seq[A])(implicit sc: SC) {
    def toRdd: RDD[A] =
      sc.makeRDD(seq)
  }
}
object Tests {

  /**
   * Cross-checks the RDD semigroup syntax against plain-collection results.
   *
   * Combining two pair RDDs with `|+|` must agree with an explicit join-and-combine, which
   * in turn must agree with the same computation on the source maps. The by-key and
   * whole-RDD sum/min/max helpers are then compared with their collection counterparts.
   *
   * @param mx    left key/value data
   * @param my    right key/value data
   * @param check comparison applied to each (actual, expected) pair
   * @return the result of the last comparison
   */
  def innerPairwise(mx: Map[String, Int], my: Map[String, Int], check: (Any, Any) => Assertion)(implicit sc: SC): Assertion = {
    import frameless.cats.implicits._
    import frameless.cats.inner._

    val leftRdd = sc.parallelize(mx.toSeq)
    val rightRdd = sc.parallelize(my.toSeq)

    val viaSemigroup = (leftRdd |+| rightRdd).collectAsMap()
    val viaJoin = (leftRdd join rightRdd).mapValues { case (x, y) => x |+| y }.collectAsMap()
    val viaCollections = (for { (k, x) <- mx; y <- my.get(k) } yield (k, x + y)).toMap
    check(viaSemigroup, viaJoin)
    check(viaJoin, viaCollections)

    val valuesRdd = sc.parallelize(mx.values.toSeq)
    check(leftRdd.csumByKey.collectAsMap(), mx)
    check(valuesRdd.csum, valuesRdd.collect().sum)

    // min/max are only checked when there is at least one element.
    if (mx.isEmpty) check(1, 1)
    else {
      check(leftRdd.cminByKey.collectAsMap(), mx)
      check(leftRdd.cmaxByKey.collectAsMap(), mx)
      check(valuesRdd.cmin, valuesRdd.collect().min)
      check(valuesRdd.cmax, valuesRdd.collect().max)
    }
  }
}
/** Property tests for the cats-based RDD syntax (`csum`, `cmin`, `cmax` and their by-key/Option variants). */
class Test extends AnyPropSpec with Matchers with ScalaCheckPropertyChecks with SparkTests {
  implicit override val generatorDrivenConfig =
    PropertyCheckConfiguration(minSize = PosInt(10))

  property("spark is working") {
    val collected = sc.parallelize(Seq(1, 2, 3)).collect()
    collected shouldBe Array(1, 2, 3)
  }

  property("inner pairwise monoid") {
    // Make sure we have non-empty map
    forAll { (xh: (String, Int), mx: Map[String, Int], yh: (String, Int), my: Map[String, Int]) =>
      val left = mx + xh
      val right = my + yh
      Tests.innerPairwise(left, right, _ shouldBe _)
    }
  }

  property("rdd simple numeric commutative semigroup") {
    import frameless.cats.implicits._

    forAll { (seq: List[Int]) =>
      // None for an empty input, Some(aggregate) otherwise.
      val whenNonEmpty = Option(seq).filter(_.nonEmpty)
      val expectedSum = whenNonEmpty.map(_.sum)
      val expectedMin = whenNonEmpty.map(_.min)
      val expectedMax = whenNonEmpty.map(_.max)

      val rdd = seq.toRdd

      rdd.cmin shouldBe expectedMin.getOrElse(0)
      rdd.cminOption shouldBe expectedMin
      rdd.cmax shouldBe expectedMax.getOrElse(0)
      rdd.cmaxOption shouldBe expectedMax
      rdd.csum shouldBe expectedSum.getOrElse(0)
      rdd.csumOption shouldBe expectedSum
    }
  }

  property("rdd of SortedMap[Int,Int] commutative monoid") {
    import frameless.cats.implicits._

    forAll { (seq: List[SortedMap[Int, Int]]) =>
      val expected = Foldable[List].fold(seq)
      seq.toRdd.csum shouldBe expected
    }
  }

  property("rdd tuple commutative semigroup example") {
    import frameless.cats.implicits._

    forAll { (seq: List[(Int, Int)]) =>
      val expectedSum = Option(seq).filter(_.nonEmpty).map(pairs => Foldable[List].fold(pairs))
      val rdd = seq.toRdd

      rdd.csum shouldBe expectedSum.getOrElse(0 -> 0)
      rdd.csumOption shouldBe expectedSum
    }
  }

  property("pair rdd numeric commutative semigroup example") {
    import frameless.cats.implicits._

    val rdd = Seq(("a", 2), ("b", 3), ("d", 6), ("b", 2), ("d", 1)).toRdd

    val expectedMin = Seq(("a", 2), ("b", 2), ("d", 1))
    val expectedMax = Seq(("a", 2), ("b", 3), ("d", 6))
    val expectedSum = Seq(("a", 2), ("b", 5), ("d", 7))

    rdd.cminByKey.collect().toSeq should contain theSameElementsAs expectedMin
    rdd.cmaxByKey.collect().toSeq should contain theSameElementsAs expectedMax
    rdd.csumByKey.collect().toSeq should contain theSameElementsAs expectedSum
  }
}
================================================
FILE: core/src/main/scala/frameless/CatalystAverageable.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/**
 * Evidence relating an input column type `In` to the type `Out` produced when averaging it.
 *
 * Averaging leaves these types unchanged:
 *   - BigDecimal -> BigDecimal
 *   - Double     -> Double
 *
 * and turns these into Double:
 *   - Int   -> Double
 *   - Short -> Double
 *   - Long  -> Double
 */
@implicitNotFound("Cannot compute average of type ${In}.")
trait CatalystAverageable[In, Out]

object CatalystAverageable {
  // The evidence has no members, so a single erased instance serves every (In, Out) pair.
  private[this] val shared = new CatalystAverageable[Any, Any] {}

  private[this] def evidence[In, Out]: CatalystAverageable[In, Out] =
    shared.asInstanceOf[CatalystAverageable[In, Out]]

  implicit val framelessAverageableBigDecimal: CatalystAverageable[BigDecimal, BigDecimal] = evidence
  implicit val framelessAverageableDouble: CatalystAverageable[Double, Double] = evidence
  implicit val framelessAverageableLong: CatalystAverageable[Long, Double] = evidence
  implicit val framelessAverageableInt: CatalystAverageable[Int, Double] = evidence
  implicit val framelessAverageableShort: CatalystAverageable[Short, Double] = evidence
}
================================================
FILE: core/src/main/scala/frameless/CatalystBitShift.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/**
 * Evidence relating a column type `In` to the result type `Out` of a bit-shift on it.
 *
 * Spark does not always return Int on shift, hence the explicit output type.
 */
@implicitNotFound("Cannot do bit shift operations on columns of type ${In}.")
trait CatalystBitShift[In, Out]

object CatalystBitShift {
  // The evidence has no members, so a single erased instance serves every (In, Out) pair.
  private[this] val shared = new CatalystBitShift[Any, Any] {}

  private[this] def evidence[In, Out]: CatalystBitShift[In, Out] =
    shared.asInstanceOf[CatalystBitShift[In, Out]]

  // NOTE(review): several value names below do not match the input type they witness
  // (e.g. `framelessBitShiftDouble` is the Byte instance). The names are public, so they
  // are kept as they are; rely on the declared types, not on the names.
  implicit val framelessBitShiftBigDecimal: CatalystBitShift[BigDecimal, Int] = evidence
  implicit val framelessBitShiftDouble: CatalystBitShift[Byte, Int] = evidence
  implicit val framelessBitShiftInt: CatalystBitShift[Short, Int] = evidence
  implicit val framelessBitShiftLong: CatalystBitShift[Int, Int] = evidence
  implicit val framelessBitShiftShort: CatalystBitShift[Long, Long] = evidence
}
================================================
FILE: core/src/main/scala/frameless/CatalystBitwise.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/**
 * Types that Catalyst can bitwise OR, AND, or XOR.
 *
 * Catalyst requires both operands of a bitwise operation between columns to have the
 * same type, so a cast is sometimes needed first.
 */
@implicitNotFound("Cannot do bitwise operations on columns of type ${A}.")
trait CatalystBitwise[A] extends CatalystNumeric[A]

object CatalystBitwise {
  // The evidence has no members, so a single erased instance serves every type.
  private[this] val shared = new CatalystBitwise[Any] {}

  private[this] def evidence[A]: CatalystBitwise[A] = shared.asInstanceOf[CatalystBitwise[A]]

  implicit val framelessbyteBitwise: CatalystBitwise[Byte] = evidence
  implicit val framelessshortBitwise: CatalystBitwise[Short] = evidence
  implicit val framelessintBitwise: CatalystBitwise[Int] = evidence
  implicit val framelesslongBitwise: CatalystBitwise[Long] = evidence
}
================================================
FILE: core/src/main/scala/frameless/CatalystCast.scala
================================================
package frameless
/** Marker type class: an implicit `CatalystCast[A, B]` exists for each supported cast from `A` to `B`. */
trait CatalystCast[A, B]

object CatalystCast {
  // The evidence has no members, so a single erased instance serves every (A, B) pair.
  private[this] val shared = new CatalystCast[Any, Any] {}

  private[this] def evidence[A, B]: CatalystCast[A, B] =
    shared.asInstanceOf[CatalystCast[A, B]]

  // Anything can be cast to String.
  implicit def framelessCastToString[T]: CatalystCast[T, String] = evidence[T, String]

  // Any CatalystNumeric type can be cast to each of the numeric targets below.
  implicit def framelessNumericToLong[A: CatalystNumeric]: CatalystCast[A, Long] = evidence[A, Long]
  implicit def framelessNumericToInt[A: CatalystNumeric]: CatalystCast[A, Int] = evidence[A, Int]
  implicit def framelessNumericToShort[A: CatalystNumeric]: CatalystCast[A, Short] = evidence[A, Short]
  implicit def framelessNumericToByte[A: CatalystNumeric]: CatalystCast[A, Byte] = evidence[A, Byte]
  implicit def framelessNumericToDecimal[A: CatalystNumeric]: CatalystCast[A, BigDecimal] = evidence[A, BigDecimal]
  implicit def framelessNumericToDouble[A: CatalystNumeric]: CatalystCast[A, Double] = evidence[A, Double]

  // Boolean can be cast to any CatalystNumeric type.
  implicit def framelessBooleanToNumeric[A: CatalystNumeric]: CatalystCast[Boolean, A] = evidence[Boolean, A]

  // doesn't make any sense to include:
  // - sqlDateToBoolean: always None
  // - sqlTimestampToBoolean: compares us to 0
  implicit val framelessStringToBoolean: CatalystCast[String, Option[Boolean]] = evidence
  implicit val framelessLongToBoolean: CatalystCast[Long, Boolean] = evidence
  implicit val framelessIntToBoolean: CatalystCast[Int, Boolean] = evidence
  implicit val framelessShortToBoolean: CatalystCast[Short, Boolean] = evidence
  implicit val framelessByteToBoolean: CatalystCast[Byte, Boolean] = evidence
  implicit val framelessBigDecimalToBoolean: CatalystCast[BigDecimal, Boolean] = evidence
  implicit val framelessDoubleToBoolean: CatalystCast[Double, Boolean] = evidence

  // TODO

  // needs verification, does it make sense to include? probably better as a separate function
  // implicit object stringToInt extends CatalystCast[String, Option[Int]]
  // implicit object stringToShort extends CatalystCast[String, Option[Short]]
  // implicit object stringToByte extends CatalystCast[String, Option[Byte]]
  // implicit object stringToDecimal extends CatalystCast[String, Option[BigDecimal]]
  // implicit object stringToLong extends CatalystCast[String, Option[Long]]
  // implicit object stringToSqlDate extends CatalystCast[String, Option[SQLDate]]

  // needs verification:
  //implicit object sqlTimestampToSqlDate extends CatalystCast[SQLTimestamp, SQLDate]

  // needs verification:
  // implicit object sqlTimestampToDecimal extends CatalystCast[SQLTimestamp, BigDecimal]
  // implicit object sqlTimestampToLong extends CatalystCast[SQLTimestamp, Long]

  // needs verification:
  // implicit object stringToSqlTimestamp extends CatalystCast[String, SQLTimestamp]
  // implicit object longToSqlTimestamp extends CatalystCast[Long, SQLTimestamp]
  // implicit object intToSqlTimestamp extends CatalystCast[Int, SQLTimestamp]
  // implicit object doubleToSqlTimestamp extends CatalystCast[Double, SQLTimestamp]
  // implicit object floatToSqlTimestamp extends CatalystCast[Float, SQLTimestamp]
  // implicit object bigDecimalToSqlTimestamp extends CatalystCast[BigDecimal, SQLTimestamp]
  // implicit object sqlDateToSqlTimestamp extends CatalystCast[SQLDate, SQLTimestamp]

  // doesn't make sense to include:
  // - booleanToSqlTimestamp: 1L or 0L
  // - shortToSqlTimestamp: ???
  // - byteToSqlTimestamp: ???

  // doesn't make sense to include:
  // - sqlDateToLong: always None
  // - sqlDateToInt: always None
  // - sqlDateToInt: always None
  // - sqlDateToInt: always None
  // - sqlDateToInt: always None

  // doesn't make sense to include:
  // - sqlTimestampToInt: useful? can be done through `-> Long -> Int`
  // - sqlTimestampToShort: useful? can be done through `-> Long -> Int`
  // - sqlTimestampToShort: useful? can be done through `-> Long -> Int`
}
================================================
FILE: core/src/main/scala/frameless/CatalystCollection.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/** Marks the collection type constructors `C` on whose columns collection operations are allowed. */
@implicitNotFound("Cannot do collection operations on columns of type ${C}.")
trait CatalystCollection[C[_]]

object CatalystCollection {
  // The evidence has no members, so a single erased instance serves every type constructor.
  private[this] val shared = new CatalystCollection[Any] {}

  private[this] def evidence[A[_]]: CatalystCollection[A] =
    shared.asInstanceOf[CatalystCollection[A]]

  implicit val arrayObject: CatalystCollection[Array] = evidence[Array]
  implicit val seqObject: CatalystCollection[Seq] = evidence[Seq]
  implicit val listObject: CatalystCollection[List] = evidence[List]
  implicit val vectorObject: CatalystCollection[Vector] = evidence[Vector]
}
================================================
FILE: core/src/main/scala/frameless/CatalystDivisible.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/**
 * Evidence relating a column type `In` to the result type `Out` of dividing it.
 *
 * Spark divides everything as Double, except BigDecimals, which are divided into
 * another BigDecimal, benefiting from some added precision.
 */
@implicitNotFound("Cannot compute division on type ${In}.")
trait CatalystDivisible[In, Out]

object CatalystDivisible {
  // The evidence has no members, so a single erased instance serves every (In, Out) pair.
  private[this] val shared = new CatalystDivisible[Any, Any] {}

  private[this] def evidence[In, Out]: CatalystDivisible[In, Out] =
    shared.asInstanceOf[CatalystDivisible[In, Out]]

  implicit val framelessDivisibleBigDecimal: CatalystDivisible[BigDecimal, BigDecimal] = evidence
  implicit val framelessDivisibleDouble: CatalystDivisible[Double, Double] = evidence
  implicit val framelessDivisibleInt: CatalystDivisible[Int, Double] = evidence
  implicit val framelessDivisibleLong: CatalystDivisible[Long, Double] = evidence
  implicit val framelessDivisibleByte: CatalystDivisible[Byte, Double] = evidence
  implicit val framelessDivisibleShort: CatalystDivisible[Short, Double] = evidence
}
================================================
FILE: core/src/main/scala/frameless/CatalystIsin.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/** Marks the column types for which an `isin` membership check is available. */
@implicitNotFound("Cannot do isin operation on columns of type ${A}.")
trait CatalystIsin[A]

object CatalystIsin {
  // Whole-number types.
  implicit object framelessByte extends CatalystIsin[Byte]
  implicit object framelessShort extends CatalystIsin[Short]
  implicit object framelessInt extends CatalystIsin[Int]
  implicit object framelessLong extends CatalystIsin[Long]

  // Fractional and decimal types.
  implicit object framelessFloat extends CatalystIsin[Float]
  implicit object framelessDouble extends CatalystIsin[Double]
  implicit object framelessBigDecimal extends CatalystIsin[BigDecimal]

  // Text. (The triple "s" in the name is kept because the name is public.)
  implicit object framelesssString extends CatalystIsin[String]
}
================================================
FILE: core/src/main/scala/frameless/CatalystNaN.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/** Marks the column types that can hold NaN; Spark does the NaN check only for these types. */
@implicitNotFound("Columns of type ${A} cannot be NaN.")
trait CatalystNaN[A]

object CatalystNaN {
  // The evidence has no members, so a single erased instance serves every type.
  private[this] val shared = new CatalystNaN[Any] {}

  private[this] def evidence[A]: CatalystNaN[A] = shared.asInstanceOf[CatalystNaN[A]]

  implicit val framelessFloatNaN: CatalystNaN[Float] = evidence
  implicit val framelessDoubleNaN: CatalystNaN[Double] = evidence
}
================================================
FILE: core/src/main/scala/frameless/CatalystNotNullable.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/** Evidence that columns of type `A` are nullable. The only instance provided here is for `Option[A]`. */
@implicitNotFound("Cannot find evidence that type ${A} is nullable. Currently, only Option[A] is nullable.")
trait CatalystNullable[A]
object CatalystNullable {
/** Every `Option[A]` is nullable, whatever `A` is. A fresh evidence value is created on each call. */
implicit def optionIsNullable[A]: CatalystNullable[Option[A]] = new CatalystNullable[Option[A]] {}
}
/**
 * Evidence that type `A` is NOT nullable, i.e. that no `CatalystNullable[A]` is available.
 *
 * Scala has no direct way to demand the absence of an implicit, so the companion declares
 * two instances with the same result type: one for every `A`, and one that additionally
 * requires `CatalystNullable[A]`. For a nullable `A` both apply, the implicit search becomes
 * ambiguous, and resolution of `NotCatalystNullable[A]` fails as intended.
 */
@implicitNotFound("Cannot find evidence that type ${A} is not nullable.")
trait NotCatalystNullable[A]
object NotCatalystNullable {
// Applies to every type.
implicit def everythingIsNotNullable[A]: NotCatalystNullable[A] = new NotCatalystNullable[A] {}
// Applies only when `A` is nullable; it exists solely to clash with the instance above.
implicit def nullableIsNotNotNullable[A: CatalystNullable]: NotCatalystNullable[A] = new NotCatalystNullable[A] {}
}
================================================
FILE: core/src/main/scala/frameless/CatalystNumeric.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/** Marks the column types that Catalyst can add, subtract and multiply. */
@implicitNotFound("Cannot do numeric operations on columns of type ${A}.")
trait CatalystNumeric[A]

object CatalystNumeric {
  // The evidence has no members, so a single erased instance serves every type.
  private[this] val shared = new CatalystNumeric[Any] {}

  private[this] def evidence[A]: CatalystNumeric[A] = shared.asInstanceOf[CatalystNumeric[A]]

  implicit val framelessbigDecimalNumeric: CatalystNumeric[BigDecimal] = evidence
  implicit val framelessbyteNumeric: CatalystNumeric[Byte] = evidence
  implicit val framelessdoubleNumeric: CatalystNumeric[Double] = evidence
  implicit val framelessintNumeric: CatalystNumeric[Int] = evidence
  implicit val framelesslongNumeric: CatalystNumeric[Long] = evidence
  implicit val framelessshortNumeric: CatalystNumeric[Short] = evidence
}
================================================
FILE: core/src/main/scala/frameless/CatalystNumericWithJavaBigDecimal.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/**
 * Evidence relating a numeric column type `In` to the result type `Out` of an operation
 * such as `abs`, for which Spark does not always return the input type: a Scala
 * `BigDecimal` input yields a `java.math.BigDecimal`.
 */
@implicitNotFound("Cannot compute on type ${In}.")
trait CatalystNumericWithJavaBigDecimal[In, Out]

object CatalystNumericWithJavaBigDecimal {
  // The evidence has no members, so a single erased instance serves every (In, Out) pair.
  private[this] val shared = new CatalystNumericWithJavaBigDecimal[Any, Any] {}

  private[this] def evidence[In, Out]: CatalystNumericWithJavaBigDecimal[In, Out] =
    shared.asInstanceOf[CatalystNumericWithJavaBigDecimal[In, Out]]

  implicit val framelessAbsoluteBigDecimal: CatalystNumericWithJavaBigDecimal[BigDecimal, java.math.BigDecimal] = evidence
  implicit val framelessAbsoluteDouble: CatalystNumericWithJavaBigDecimal[Double, Double] = evidence
  implicit val framelessAbsoluteInt: CatalystNumericWithJavaBigDecimal[Int, Int] = evidence
  implicit val framelessAbsoluteLong: CatalystNumericWithJavaBigDecimal[Long, Long] = evidence
  implicit val framelessAbsoluteShort: CatalystNumericWithJavaBigDecimal[Short, Short] = evidence
  implicit val framelessAbsoluteByte: CatalystNumericWithJavaBigDecimal[Byte, Byte] = evidence
}
================================================
FILE: core/src/main/scala/frameless/CatalystOrdered.scala
================================================
package frameless
import scala.annotation.implicitNotFound
import shapeless.{Generic, HList, Lazy}
import shapeless.ops.hlist.LiftAll
import java.time.{Duration, Instant, Period}
/** Marks the column types that Catalyst can order and compare. */
@implicitNotFound("Cannot compare columns of type ${A}.")
trait CatalystOrdered[A]

object CatalystOrdered {
  // The evidence has no members, so a single erased instance serves every type.
  private[this] val shared = new CatalystOrdered[Any] {}

  private[this] def evidence[A]: CatalystOrdered[A] = shared.asInstanceOf[CatalystOrdered[A]]

  implicit val framelessIntOrdered: CatalystOrdered[Int] = evidence
  implicit val framelessBooleanOrdered: CatalystOrdered[Boolean] = evidence
  implicit val framelessByteOrdered: CatalystOrdered[Byte] = evidence
  implicit val framelessShortOrdered: CatalystOrdered[Short] = evidence
  implicit val framelessLongOrdered: CatalystOrdered[Long] = evidence
  implicit val framelessFloatOrdered: CatalystOrdered[Float] = evidence
  implicit val framelessDoubleOrdered: CatalystOrdered[Double] = evidence
  implicit val framelessBigDecimalOrdered: CatalystOrdered[BigDecimal] = evidence
  implicit val framelessSQLDateOrdered: CatalystOrdered[SQLDate] = evidence
  implicit val framelessSQLTimestampOrdered: CatalystOrdered[SQLTimestamp] = evidence
  implicit val framelessStringOrdered: CatalystOrdered[String] = evidence
  implicit val framelessInstantOrdered: CatalystOrdered[Instant] = evidence
  implicit val framelessDurationOrdered: CatalystOrdered[Duration] = evidence
  implicit val framelessPeriodOrdered: CatalystOrdered[Period] = evidence

  /** `A` is ordered whenever it has an `Injection` into some ordered type `B`. */
  implicit def injectionOrdered[A, B](
    implicit
    i0: Injection[A, B],
    i1: CatalystOrdered[B]
  ): CatalystOrdered[A] =
    evidence[A]

  /** `G` is ordered whenever every element type of its `Generic` representation `H` is ordered. */
  implicit def deriveGeneric[G, H <: HList](
    implicit
    i0: Generic.Aux[G, H],
    i1: Lazy[LiftAll[CatalystOrdered, H]]
  ): CatalystOrdered[G] =
    evidence[G]
}
================================================
FILE: core/src/main/scala/frameless/CatalystPivotable.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/** Marks the column types that can be pivoted on: Int, Long, Boolean and String. */
@implicitNotFound("Cannot pivot on type ${A}. Currently supported types to pivot are {Int, Long, Boolean, and String}.")
trait CatalystPivotable[A]

object CatalystPivotable {
  // The evidence has no members, so a single erased instance serves every type.
  private[this] val shared = new CatalystPivotable[Any] {}

  private[this] def evidence[A]: CatalystPivotable[A] = shared.asInstanceOf[CatalystPivotable[A]]

  implicit val framelessIntPivotable: CatalystPivotable[Int] = evidence
  implicit val framelessLongPivotable: CatalystPivotable[Long] = evidence
  implicit val framelessBooleanPivotable: CatalystPivotable[Boolean] = evidence
  implicit val framelessStringPivotable: CatalystPivotable[String] = evidence
}
================================================
FILE: core/src/main/scala/frameless/CatalystRound.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/**
 * Evidence relating a column type `In` to the result type `Out` of rounding it.
 *
 * Spark does not always return Long on round: a Scala `BigDecimal` input yields a
 * `java.math.BigDecimal`, while the other supported inputs yield Long.
 */
@implicitNotFound("Cannot compute round on type ${In}.")
trait CatalystRound[In, Out]

object CatalystRound {
  // The evidence has no members, so a single erased instance serves every (In, Out) pair.
  private[this] val shared = new CatalystRound[Any, Any] {}

  private[this] def evidence[In, Out]: CatalystRound[In, Out] =
    shared.asInstanceOf[CatalystRound[In, Out]]

  implicit val framelessBigDecimal: CatalystRound[BigDecimal, java.math.BigDecimal] = evidence
  implicit val framelessDouble: CatalystRound[Double, Long] = evidence
  implicit val framelessInt: CatalystRound[Int, Long] = evidence
  implicit val framelessLong: CatalystRound[Long, Long] = evidence
  implicit val framelessShort: CatalystRound[Short, Long] = evidence
}
================================================
FILE: core/src/main/scala/frameless/CatalystSummable.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/**
 * Evidence relating a column type `In` to the type `Out` produced when summing it,
 * together with the zero value of `In`.
 *
 * Summing leaves these types unchanged:
 *   - Long       -> Long
 *   - BigDecimal -> BigDecimal
 *   - Double     -> Double
 *
 * and converts these:
 *   - Int   -> Long
 *   - Short -> Long
 */
@implicitNotFound("Cannot compute sum of type ${In}.")
trait CatalystSummable[In, Out] {
  /** The zero value of the input type. */
  def zero: In
}

object CatalystSummable {
  // Holds the zero as a plain value, evaluated once when the evidence is built.
  private final class Instance[In, Out](val zero: In) extends CatalystSummable[In, Out]

  /** Builds evidence whose `zero` is the given value. */
  def apply[In, Out](zero: In): CatalystSummable[In, Out] =
    new Instance[In, Out](zero)

  implicit val framelessSummableLong: CatalystSummable[Long, Long] =
    CatalystSummable[Long, Long](zero = 0L)

  implicit val framelessSummableBigDecimal: CatalystSummable[BigDecimal, BigDecimal] =
    CatalystSummable[BigDecimal, BigDecimal](zero = BigDecimal(0))

  implicit val framelessSummableDouble: CatalystSummable[Double, Double] =
    CatalystSummable[Double, Double](zero = 0.0)

  implicit val framelessSummableInt: CatalystSummable[Int, Long] =
    CatalystSummable[Int, Long](zero = 0)

  implicit val framelessSummableShort: CatalystSummable[Short, Long] =
    CatalystSummable[Short, Long](zero = 0)
}
================================================
FILE: core/src/main/scala/frameless/CatalystVariance.scala
================================================
package frameless
import scala.annotation.implicitNotFound
/**
 * Marks the column types on which variance can be computed.
 *
 * Spark's variance and stddev functions always return Double, so no output type is tracked.
 */
@implicitNotFound("Cannot compute variance on type ${A}.")
trait CatalystVariance[A]

object CatalystVariance {
  // The evidence has no members, so a single erased instance serves every type.
  private[this] val shared = new CatalystVariance[Any] {}

  private[this] def evidence[A]: CatalystVariance[A] = shared.asInstanceOf[CatalystVariance[A]]

  implicit val framelessIntVariance: CatalystVariance[Int] = evidence
  implicit val framelessLongVariance: CatalystVariance[Long] = evidence
  implicit val framelessShortVariance: CatalystVariance[Short] = evidence
  implicit val framelessBigDecimalVariance: CatalystVariance[BigDecimal] = evidence
  implicit val framelessDoubleVariance: CatalystVariance[Double] = evidence
}
================================================
FILE: core/src/main/scala/frameless/Injection.scala
================================================
package frameless
/**
 * A reversible function from `A` to `B`: `apply` maps forward, `invert` maps back.
 *
 * Law: `forAll { a: A => invert(apply(a)) == a }`.
 */
trait Injection[A, B] extends Serializable {
  /** Forward direction. */
  def apply(a: A): B

  /** Backward direction; must undo `apply`. */
  def invert(b: B): A
}

object Injection {
  /**
   * Builds an injection from a pair of functions.
   *
   * @param f forward mapping
   * @param g backward mapping, expected to satisfy `g(f(a)) == a`
   */
  def apply[A, B](f: A => B, g: B => A): Injection[A, B] =
    new Injection[A, B] {
      override def invert(b: B): A = g.apply(b)
      override def apply(a: A): B = f.apply(a)
    }
}
================================================
FILE: core/src/main/scala/frameless/SQLDate.scala
================================================
package frameless
/**
 * Type for the internal Spark representation of a SQL date. If the `spark.sql.functions` were typed,
 * [date_add][1] would for instance be defined as `def date_add(d: SQLDate, i: Int): SQLDate`.
 *
 * [1]: https://spark.apache.org/docs/2.0.2/api/java/org/apache/spark/sql/functions.html#date_add(org.apache.spark.sql.Column,%20int)
 *
 * @param days the date as an integer day count (presumably days since the Unix epoch -- confirm)
 */
case class SQLDate(days: Int)
================================================
FILE: core/src/main/scala/frameless/SQLTimestamp.scala
================================================
package frameless
/**
 * Type for the Spark internal representation of a timestamp. If the `spark.sql.functions` were typed,
 * [current_timestamp][1] would for instance be defined as `def current_timestamp(): SQLTimestamp`.
 *
 * [1]: https://spark.apache.org/docs/1.6.2/api/java/org/apache/spark/sql/functions.html#current_timestamp()
 *
 * @param us the timestamp as a 64-bit count (presumably microseconds since the Unix epoch -- confirm)
 */
case class SQLTimestamp(us: Long)
================================================
FILE: dataset/src/main/scala/frameless/FramelessSyntax.scala
================================================
package frameless
import org.apache.spark.sql.{Column, DataFrame, Dataset}
/** Extension methods that lift untyped Spark values into their frameless counterparts. */
trait FramelessSyntax {

  /** Wraps an untyped `Column` as a typed column or typed aggregate; the types are asserted, not checked. */
  implicit class ColumnSyntax(self: Column) {
    def typedColumn[T, U](implicit encoder: TypedEncoder[U]): TypedColumn[T, U] =
      new TypedColumn[T, U](self)(encoder)

    def typedAggregate[T, U](implicit encoder: TypedEncoder[U]): TypedAggregate[T, U] =
      new TypedAggregate[T, U](self)(encoder)
  }

  /** Wraps a Spark `Dataset[T]` as a `TypedDataset[T]`. */
  implicit class DatasetSyntax[T: TypedEncoder](self: Dataset[T]) {
    def typed: TypedDataset[T] =
      TypedDataset.create[T](self)
  }

  /** Wraps a `DataFrame` as a `TypedDataset[T]` through `TypedDataset.createUnsafe`. */
  implicit class DataframeSyntax(self: DataFrame) {
    def unsafeTyped[T: TypedEncoder]: TypedDataset[T] =
      TypedDataset.createUnsafe[T](self)
  }
}
================================================
FILE: dataset/src/main/scala/frameless/InjectionEnum.scala
================================================
package frameless
import shapeless._
/**
 * Derives `Injection[A, String]` instances for enum-like types, i.e. those whose generic
 * representation is a coproduct of alternatives that each have an empty (`HNil`) representation.
 * An alternative is encoded as its data constructor name.
 */
trait InjectionEnum {

  /** Base case: `CNil` has no values, so both directions can only fail. */
  implicit val cnilInjectionEnum: Injection[CNil, String] = {
    // $COVERAGE-OFF$No value of type CNil so impossible to test
    val encode: CNil => String = _ => throw new Exception("Impossible")
    // $COVERAGE-ON$
    val decode: String => CNil = name =>
      throw new IllegalArgumentException(
        s"Cannot construct a value of type CNil: $name did not match data constructor names"
      )

    Injection(encode, decode)
  }

  /**
   * Inductive case: handles the head alternative `H` and delegates the rest to the tail's injection.
   * Decoding a name that matches no alternative eventually reaches the `CNil` case and throws.
   */
  implicit def coproductInjectionEnum[H, T <: Coproduct](
    implicit
    typeable: Typeable[H],
    gen: Generic.Aux[H, HNil],
    tInjectionEnum: Injection[T, String]
  ): Injection[H :+: T, String] = {
    // Keep only the part of the description that precedes the first '.'.
    val constructorName = typeable.describe.takeWhile(_ != '.')

    val encode: (H :+: T) => String = {
      case Inl(_) => constructorName
      case Inr(t) => tInjectionEnum.apply(t)
    }

    val decode: String => (H :+: T) = name =>
      if (name == constructorName) Inl(gen.from(HNil))
      else Inr(tInjectionEnum.invert(name))

    Injection(encode, decode)
  }

  /** Entry point: goes through the generic representation `R` of `A`. */
  implicit def genericInjectionEnum[A, R](
    implicit
    gen: Generic.Aux[A, R],
    rInjectionEnum: Injection[R, String]
  ): Injection[A, String] = {
    val encode: A => String = value => rInjectionEnum(gen.to(value))
    val decode: String => A = name => gen.from(rInjectionEnum.invert(name))

    Injection(encode, decode)
  }
}
================================================
FILE: dataset/src/main/scala/frameless/IsValueClass.scala
================================================
package frameless
import shapeless._
import shapeless.labelled.FieldType
/**
 * Evidence that `T` is a Value class.
 *
 * The constructor is private, so instances can only come from the implicit in the companion.
 */
@annotation.implicitNotFound(msg = "${T} is not a Value class")
final class IsValueClass[T] private() {}
object IsValueClass {
/**
 * Provides an evidence `A` is a Value class.
 *
 * The bounds encode the check at the type level:
 *  - `A <: AnyVal`;
 *  - `G`, the labelled generic representation of `A`, is an HList with exactly one element;
 *  - `H`, which is `G` after Unit-typed fields are dropped, still holds exactly one labelled field.
 */
implicit def apply[A <: AnyVal, G <: ::[_, HNil], H <: ::[_ <: FieldType[_ <: Symbol, _], HNil]](
implicit
i0: LabelledGeneric.Aux[A, G],
i1: DropUnitValues.Aux[G, H]): IsValueClass[A] = new IsValueClass[A]
}
================================================
FILE: dataset/src/main/scala/frameless/Job.scala
================================================
package frameless
import org.apache.spark.sql.SparkSession
/**
 * A deferred computation bound to a `SparkSession`. Nothing happens until `run()` is called;
 * the combinators below only build new jobs.
 */
sealed abstract class Job[A](implicit spark: SparkSession) { self =>
  /** Runs a new Spark job. */
  def run(): A

  /** Same job, with the local property `spark.jobGroup.id` set before it runs. */
  def withGroupId(groupId: String): Job[A] =
    withLocalProperty("spark.jobGroup.id", groupId)

  /** Same job, with the local property `spark.job.description` set to the given value before it runs. */
  def withDescription(groupId: String): Job[A] =
    withLocalProperty("spark.job.description", groupId)

  /**
   * Same job, with a local property set on the Spark context right before it runs.
   * The property is set at run time, not when this method is called, and is not reset afterwards.
   */
  def withLocalProperty(key: String, value: String): Job[A] =
    new Job[A]()(spark) {
      def run(): A = {
        spark.sparkContext.setLocalProperty(key, value)
        self.run()
      }
    }

  /** Transforms the result of this job. */
  def map[B](fn: A => B): Job[B] =
    new Job[B]()(spark) {
      def run(): B = {
        val result = self.run()
        fn(result)
      }
    }

  /** Runs this job, then the job that `fn` builds from its result. */
  def flatMap[B](fn: A => Job[B]): Job[B] =
    new Job[B]()(spark) {
      def run(): B = {
        val next = fn(self.run())
        next.run()
      }
    }
}
object Job {
  /** Wraps a by-name expression as a job; the expression is evaluated on every `run()`. */
  def apply[A](a: => A)(implicit spark: SparkSession): Job[A] =
    new Job[A]()(spark) {
      def run(): A = a
    }

  /** `SparkDelay` instance that delays an expression by wrapping it in a `Job`. */
  implicit val framelessSparkDelayForJob: SparkDelay[Job] =
    new SparkDelay[Job] {
      def delay[A](a: => A)(implicit spark: SparkSession): Job[A] =
        Job.apply(a)(spark)
    }
}
================================================
FILE: dataset/src/main/scala/frameless/RecordEncoder.scala
================================================
package frameless
import org.apache.spark.sql.FramelessInternals
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.objects.{
Invoke, NewInstance, UnwrapOption, WrapOption
}
import org.apache.spark.sql.types._
import shapeless._
import shapeless.labelled.FieldType
import shapeless.ops.hlist.IsHCons
import shapeless.ops.record.Keys
import scala.reflect.ClassTag
/**
 * Description of one field of a record being encoded.
 *
 * @param ordinal position of the field within the record
 * @param name    field name
 * @param encoder encoder for the field's value
 */
case class RecordEncoderField(
ordinal: Int,
name: String,
encoder: TypedEncoder[_]
)
/**
 * The field descriptions of a labelled record `T`, in order.
 *
 * @tparam T labelled generic representation of the record's fields
 */
trait RecordEncoderFields[T <: HList] extends Serializable {
  def value: List[RecordEncoderField]

  override def toString: String =
    value.mkString("RecordEncoderFields[", ", ", "]")
}

object RecordEncoderFields {

  /** Base case: a record with a single field. */
  implicit def deriveRecordLast[K <: Symbol, H](
    implicit
    key: Witness.Aux[K],
    head: RecordFieldEncoder[H]
  ): RecordEncoderFields[FieldType[K, H] :: HNil] =
    new RecordEncoderFields[FieldType[K, H] :: HNil] {
      def value: List[RecordEncoderField] = List(fieldEncoder[K, H])
    }

  /**
   * Inductive case: the head field gets ordinal 0 and every field of the tail is
   * shifted one position to the right.
   */
  implicit def deriveRecordCons[K <: Symbol, H, T <: HList](
    implicit
    key: Witness.Aux[K],
    head: RecordFieldEncoder[H],
    tail: RecordEncoderFields[T]
  ): RecordEncoderFields[FieldType[K, H] :: T] =
    new RecordEncoderFields[FieldType[K, H] :: T] {
      def value: List[RecordEncoderField] = {
        val first = fieldEncoder[K, H]
        val rest = tail.value.map(field => field.copy(ordinal = field.ordinal + 1))
        first :: rest
      }
    }

  // Describes a single field at ordinal 0, named after the singleton symbol `K`.
  private def fieldEncoder[K <: Symbol, H](implicit key: Witness.Aux[K], e: RecordFieldEncoder[H]): RecordEncoderField =
    RecordEncoderField(ordinal = 0, name = key.value.name, encoder = e.encoder)
}
/**
 * Helps build the argument list of a constructor call from a labelled generic representation.
 * Unit-typed fields were removed earlier, so unit literals have to be put back at the
 * positions where those fields sit.
 *
 * @tparam T labelled generic representation of type fields
 */
trait NewInstanceExprs[T <: HList] extends Serializable {
  /** Expands the expressions of the non-Unit fields into the full constructor argument list. */
  def from(exprs: List[Expression]): Seq[Expression]
}

object NewInstanceExprs {

  /** No fields left: no arguments. Any remaining expressions are ignored. */
  implicit def deriveHNil: NewInstanceExprs[HNil] =
    new NewInstanceExprs[HNil] {
      def from(exprs: List[Expression]): Seq[Expression] = Nil
    }

  /** Unit field: emit a unit literal and consume nothing from `exprs`. */
  implicit def deriveUnit[K <: Symbol, T <: HList](
    implicit
    tail: NewInstanceExprs[T]
  ): NewInstanceExprs[FieldType[K, Unit] :: T] =
    new NewInstanceExprs[FieldType[K, Unit] :: T] {
      def from(exprs: List[Expression]): Seq[Expression] = {
        val unitLiteral = Literal.fromObject(())
        unitLiteral +: tail.from(exprs)
      }
    }

  /** Non-Unit field: consume the next expression. `exprs` must be non-empty here. */
  implicit def deriveNonUnit[K <: Symbol, V, T <: HList](
    implicit
    notUnit: V =:!= Unit,
    tail: NewInstanceExprs[T]
  ): NewInstanceExprs[FieldType[K, V] :: T] =
    new NewInstanceExprs[FieldType[K, V] :: T] {
      def from(exprs: List[Expression]): Seq[Expression] = {
        val current = exprs.head
        val remaining = tail.from(exprs.tail)
        current +: remaining
      }
    }
}
/**
 * Drops fields with Unit type from labelled generic representation of types.
 *
 * The resulting HList type is exposed as the `Out` type member.
 *
 * @tparam L labelled generic representation of type fields
 */
trait DropUnitValues[L <: HList] extends DepFn1[L] with Serializable { type Out <: HList }
object DropUnitValues {
/** Summons the instance for `L` while keeping its precise `Out` type. */
def apply[L <: HList](implicit dropUnitValues: DropUnitValues[L]): Aux[L, dropUnitValues.Out] = dropUnitValues
/** Refinement alias that exposes the output type as a type parameter. */
type Aux[L <: HList, Out0 <: HList] = DropUnitValues[L] { type Out = Out0 }
// Base case: an empty list stays empty. (The type parameter `H` is unused.)
implicit def deriveHNil[H]: Aux[HNil, HNil] = new DropUnitValues[HNil] {
type Out = HNil
def apply(l: HNil): Out = HNil
}
// Unit-typed head: skip it and keep only what the tail yields.
implicit def deriveUnit[K <: Symbol, T <: HList, OutT <: HList]
(implicit
dropUnitValues : DropUnitValues.Aux[T, OutT]
): Aux[FieldType[K, Unit] :: T, OutT] = new DropUnitValues[FieldType[K, Unit] :: T] {
type Out = OutT
def apply(l : FieldType[K, Unit] :: T): Out = dropUnitValues(l.tail)
}
// Non-Unit head: keep it in front of what the tail yields. (The type parameter `OutH` is unused.)
implicit def deriveNonUnit[K <: Symbol, V, T <: HList, OutH, OutT <: HList]
(implicit
nonUnit: V =:!= Unit,
dropUnitValues : DropUnitValues.Aux[T, OutT]
): Aux[FieldType[K, V] :: T, FieldType[K, V] :: OutT] = new DropUnitValues[FieldType[K, V] :: T] {
type Out = FieldType[K, V] :: OutT
def apply(l : FieldType[K, V] :: T): Out = l.head :: dropUnitValues(l.tail)
}
}
/** Encoder for a record type `F`, represented in Catalyst as a struct with one entry per non-Unit field.
 *
 * @tparam F the record type
 * @tparam G labelled generic representation of `F`
 * @tparam H `G` without its Unit typed fields (required to be non-empty)
 */
class RecordEncoder[F, G <: HList, H <: HList]
  (implicit
    i0: LabelledGeneric.Aux[F, G],
    i1: DropUnitValues.Aux[G, H],
    i2: IsHCons[H],
    fields: Lazy[RecordEncoderFields[H]],
    newInstanceExprs: Lazy[NewInstanceExprs[G]],
    classTag: ClassTag[F]
  ) extends TypedEncoder[F] {

  def nullable: Boolean = false

  def jvmRepr: DataType = FramelessInternals.objectTypeFor[F]

  /** Struct type with one `StructField` per encoded field, carrying the field encoder's type and nullability. */
  def catalystRepr: DataType =
    StructType(
      fields.value.value.map { field =>
        StructField(
          name = field.name,
          dataType = field.encoder.catalystRepr,
          nullable = field.encoder.nullable,
          metadata = Metadata.empty
        )
      }
    )

  /** Builds a named struct from the fields read off `path`; yields a null struct when `path` is null. */
  def toCatalyst(path: Expression): Expression = {
    // CreateNamedStruct expects a flat sequence: name1, value1, name2, value2, ...
    val nameValuePairs = fields.value.value.flatMap { field =>
      val accessor = Invoke(path, field.name, field.encoder.jvmRepr, Nil)
      Literal(field.name) :: field.encoder.toCatalyst(accessor) :: Nil
    }

    val struct = CreateNamedStruct(nameValuePairs)
    If(IsNull(path), Literal.create(null, struct.dataType), struct)
  }

  /** Builds a constructor call of `F` from the struct at `path`; yields a null object when `path` is null. */
  def fromCatalyst(path: Expression): Expression = {
    val decodedFields = fields.value.value.map { field =>
      val structField = GetStructField(path, field.ordinal, Some(field.name))
      field.encoder.fromCatalyst(structField)
    }

    // Unit typed fields were dropped from the struct; re-insert their literals as constructor arguments.
    val constructorArgs = newInstanceExprs.value.from(decodedFields)
    val construct = NewInstance(
      classTag.runtimeClass, constructorArgs, jvmRepr, propagateNull = true)

    If(IsNull(path), Literal.create(null, jvmRepr), construct)
  }
}
/** Encoder used for a field of type `T` inside a record.
*
* @param encoder the encoder exposed for the field
* @param jvmRepr JVM representation associated with the field
* @param fromCatalyst builds the expression decoding the field from its Catalyst form
* @param toCatalyst builds the expression encoding the field to its Catalyst form
*/
final class RecordFieldEncoder[T](
val encoder: TypedEncoder[T],
private[frameless] val jvmRepr: DataType,
private[frameless] val fromCatalyst: Expression => Expression,
private[frameless] val toCatalyst: Expression => Expression
) extends Serializable
object RecordFieldEncoder extends RecordFieldEncoderLowPriority {

  /**
   * Field encoder for an optional value class: the Catalyst form is the one of the
   * inner value, and the field is nullable.
   *
   * @tparam F the value class
   * @tparam G the single field of the value class
   * @tparam H the single field of the value class (with guarantee it's not a `Unit` value)
   * @tparam K the key type for the fields
   * @tparam V the inner value type
   */
  implicit def optionValueClass[F : IsValueClass, G <: ::[_, HNil], H <: ::[_ <: FieldType[_ <: Symbol, _], HNil], K <: Symbol, V, KS <: ::[_ <: Symbol, HNil]]
    (implicit
      i0: LabelledGeneric.Aux[F, G],
      i1: DropUnitValues.Aux[G, H],
      i2: IsHCons.Aux[H, _ <: FieldType[K, V], HNil],
      i3: Keys.Aux[H, KS],
      i4: IsHCons.Aux[KS, K, HNil],
      i5: TypedEncoder[V],
      i6: ClassTag[F]
    ): RecordFieldEncoder[Option[F]] = {
    val fieldName = i4.head(i3()).name
    val innerJvmRepr = ObjectType(i6.runtimeClass)
    val optionJvmRepr = ObjectType(classOf[Option[F]])

    // Option[F] -> F -> inner value -> Catalyst
    val encode: Expression => Expression = { path =>
      val unwrapped = UnwrapOption(innerJvmRepr, path)
      val inner = Invoke(unwrapped, fieldName, i5.jvmRepr, Nil)
      i5.toCatalyst(inner)
    }

    // Catalyst -> inner value -> F -> Option[F]
    val decode: Expression => Expression = { path =>
      val inner = i5.fromCatalyst(path)
      val instance = NewInstance(i6.runtimeClass, Seq(inner), innerJvmRepr)
      WrapOption(instance, innerJvmRepr)
    }

    new RecordFieldEncoder[Option[F]](
      encoder = new TypedEncoder[Option[F]] {
        val nullable = true

        val jvmRepr = optionJvmRepr

        @inline def catalystRepr: DataType = i5.catalystRepr

        def fromCatalyst(path: Expression): Expression = decode(path)

        def toCatalyst(path: Expression): Expression = encode(path)

        override def toString: String = s"RecordFieldEncoder.optionValueClass[${i6.runtimeClass.getName}]('${fieldName}', $i5)"
      },
      jvmRepr = optionJvmRepr,
      fromCatalyst = decode,
      toCatalyst = encode
    )
  }

  /**
   * Field encoder for a value class. The exposed `encoder` delegates entirely to the
   * encoder of the inner value, while the record-level `fromCatalyst`/`toCatalyst`
   * functions wrap into / read from an instance of the value class.
   *
   * @tparam F the value class
   * @tparam G the single field of the value class
   * @tparam H the single field of the value class (with guarantee it's not a `Unit` value)
   * @tparam K the key type for the fields
   * @tparam V the inner value type
   */
  implicit def valueClass[F : IsValueClass, G <: ::[_, HNil], H <: ::[_ <: FieldType[_ <: Symbol, _], HNil], K <: Symbol, V, KS <: ::[_ <: Symbol, HNil]]
    (implicit
      i0: LabelledGeneric.Aux[F, G],
      i1: DropUnitValues.Aux[G, H],
      i2: IsHCons.Aux[H, _ <: FieldType[K, V], HNil],
      i3: Keys.Aux[H, KS],
      i4: IsHCons.Aux[KS, K, HNil],
      i5: TypedEncoder[V],
      i6: ClassTag[F]
    ): RecordFieldEncoder[F] = {
    val cls = i6.runtimeClass
    val jvmr = i5.jvmRepr
    val fieldName = i4.head(i3()).name

    // Catalyst -> inner value -> new F(inner value)
    val wrap: Expression => Expression = { expr: Expression =>
      NewInstance(
        i6.runtimeClass,
        i5.fromCatalyst(expr) :: Nil,
        ObjectType(i6.runtimeClass))
    }

    // F -> inner value (through the field accessor) -> Catalyst
    val unwrap: Expression => Expression = { expr: Expression =>
      i5.toCatalyst(Invoke(expr, fieldName, jvmr))
    }

    val innerEncoder = new TypedEncoder[F] {
      def nullable = i5.nullable

      def jvmRepr = jvmr

      def catalystRepr: DataType = i5.catalystRepr

      def fromCatalyst(path: Expression): Expression =
        i5.fromCatalyst(path)

      @inline def toCatalyst(path: Expression): Expression =
        i5.toCatalyst(path)

      override def toString: String = s"RecordFieldEncoder.valueClass[${cls.getName}]('${fieldName}', ${i5})"
    }

    new RecordFieldEncoder[F](
      encoder = innerEncoder,
      jvmRepr = FramelessInternals.objectTypeFor[F],
      fromCatalyst = wrap,
      toCatalyst = unwrap
    )
  }
}
private[frameless] sealed trait RecordFieldEncoderLowPriority {

  /** Fallback: any type with a `TypedEncoder` is encoded as a field exactly as that encoder prescribes. */
  implicit def apply[T](implicit e: TypedEncoder[T]): RecordFieldEncoder[T] = {
    val decode: Expression => Expression = e.fromCatalyst
    val encode: Expression => Expression = e.toCatalyst
    new RecordFieldEncoder[T](e, e.jvmRepr, decode, encode)
  }
}
================================================
FILE: dataset/src/main/scala/frameless/SparkDelay.scala
================================================
package frameless
import org.apache.spark.sql.SparkSession
/** Type class for effect types `F` able to hold a by-name computation that needs a `SparkSession`.
*
* @tparam F the effect type
*/
trait SparkDelay[F[_]] {
/** Lifts the by-name value `a` into `F`.
*
* @param a the computation, passed by name
* @param spark the session made available to the computation
*/
def delay[A](a: => A)(implicit spark: SparkSession): F[A]
}
================================================
FILE: dataset/src/main/scala/frameless/TypedColumn.scala
================================================
package frameless
import frameless.functions.{litAggr, lit => flit}
import frameless.syntax._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.DecimalType
import org.apache.spark.sql.{Column, FramelessInternals}
import shapeless._
import shapeless.ops.record.Selector
import scala.annotation.implicitNotFound
import scala.reflect.ClassTag
import scala.language.experimental.macros
/** A Catalyst expression paired with the encoder of its result.
*
* @tparam T phantom type; not referenced by any member of this trait
*/
sealed trait UntypedExpression[T] {
/** The underlying Catalyst expression. */
def expr: Expression
/** Encoder of the expression result, with its type erased. */
def uencoder: TypedEncoder[_]
/** Renders the underlying expression. */
override def toString: String = expr.toString()
}
/** Expression used in `select`-like constructions.
*
* @tparam T phantom type representing the dataset the column is selected on
* @tparam U type of column
*/
sealed class TypedColumn[T, U](expr: Expression)(
implicit val uenc: TypedEncoder[U]
) extends AbstractTypedColumn[T, U](expr) {
type ThisType[A, B] = TypedColumn[A, B]
/** Builds the typed column from the expression held by an untyped `Column`. */
def this(column: Column)(implicit uencoder: TypedEncoder[U]) =
this(FramelessInternals.expr(column))
/** Re-types an untyped `Column` as a [[TypedColumn]]. */
override def typed[W, U1: TypedEncoder](c: Column): TypedColumn[W, U1] = c.typedColumn
/** Lifts a constant into a [[TypedColumn]] through `frameless.functions.lit`. */
override def lit[U1: TypedEncoder](c: U1): TypedColumn[T, U1] = flit(c)
}
/** Expression used in `agg`-like constructions.
*
* @tparam T phantom type representing the dataset the column is selected on
* @tparam U type of column
*/
sealed class TypedAggregate[T, U](expr: Expression)(
implicit val uenc: TypedEncoder[U]
) extends AbstractTypedColumn[T, U](expr) {
type ThisType[A, B] = TypedAggregate[A, B]
/** Builds the typed aggregate from the expression held by an untyped `Column`. */
def this(column: Column)(implicit uencoder: TypedEncoder[U]) = {
this(FramelessInternals.expr(column))
}
/** Re-types an untyped `Column` as a [[TypedAggregate]]. */
override def typed[W, U1: TypedEncoder](c: Column): TypedAggregate[W, U1] = c.typedAggregate
/** Lifts a constant into a [[TypedAggregate]] through `frameless.functions.litAggr`. */
override def lit[U1: TypedEncoder](c: U1): TypedAggregate[T, U1] = litAggr(c)
}
/** Generic representation of a typed column. A typed column can either be a [[TypedAggregate]] or
* a [[frameless.TypedColumn]].
*
* Documentation marked "apache/spark" is thanks to apache/spark Contributors
* at https://github.com/apache/spark, licensed under Apache v2.0 available at
* http://www.apache.org/licenses/LICENSE-2.0
*
* @tparam T phantom type representing the dataset on which this column is
* selected. When `T = A with B` the selection is on either A or B.
* @tparam U type of column
*/
abstract class AbstractTypedColumn[T, U]
(val expr: Expression)
(implicit val uencoder: TypedEncoder[U])
extends UntypedExpression[T] { self =>
type ThisType[A, B] <: AbstractTypedColumn[A, B]
/** A helper class that simplifies working with Optional fields.
*
* {{{
* val x: TypedColumn[T, Option[Int]] = _
* x.opt.map(_*2) // This only compiles if the type of x is Option[X] (in this example X is of type Int)
* }}}
*
* `map` applies `u` to this column viewed as a column of `X`, then re-types the result
* as a column of `Option[G]`. Both steps are unchecked casts: no expression is added.
*
* @note Known issue: map() will NOT work when the applied function is a udf().
* It will compile and then throw a runtime error.
**/
trait Mapper[X] {
def map[G, OutputType[_,_]](u: ThisType[T, X] => OutputType[T,G])
(implicit
ev: OutputType[T,G] <:< AbstractTypedColumn[T, G]
): OutputType[T, Option[G]] = {
u(self.asInstanceOf[ThisType[T, X]]).asInstanceOf[OutputType[T, Option[G]]]
}
}
/** Makes it easier to work with Optional columns. It returns an instance of `Mapper[X]`
* where `X` is type of the unwrapped Optional. E.g., in the case of `Option[Long]`,
* `X` is of type Long.
*
* {{{
* val x: TypedColumn[T, Option[Int]] = _
* x.opt.map(_*2)
* }}}
*
* @param x evidence that this column holds an `Option[X]`
*/
def opt[X](implicit x: U <:< Option[X]): Mapper[X] = new Mapper[X] {}
/** Falls back to an untyped Column wrapping the same expression. */
def untyped: Column = new Column(expr)
/** Equality between this column and `other`; null-safe (`<=>`) when this column's encoder is nullable. */
private def equalsTo[TT, W](other: ThisType[TT, U])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] = {
  val comparison: Expression =
    if (uencoder.nullable) EqualNullSafe(self.expr, other.expr)
    else EqualTo(self.expr, other.expr)

  typed(comparison)
}
/** Creates a typed column of either TypedColumn or TypedAggregate from an expression.
* The expression is wrapped in a `Column` and handed to the `Column` overload.
*/
protected def typed[W, U1: TypedEncoder](e: Expression): ThisType[W, U1] =
typed(new Column(e))
/** Creates a typed column of either TypedColumn or TypedAggregate from an untyped `Column`.
*
* @tparam W phantom dataset type of the result
* @tparam U1 type of the resulting column
*/
def typed[W, U1: TypedEncoder](c: Column): ThisType[W, U1]
/** Lifts a constant into a typed column of either TypedColumn or TypedAggregate.
*
* @param c the constant value
*/
def lit[U1: TypedEncoder](c: U1): ThisType[T, U1]
/** Equality test against a constant.
* {{{
* df.filter( df.col('a) === 1 )
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def ===(u: U): ThisType[T, Boolean] =
equalsTo(lit(u))
/** Equality test against another column.
* {{{
* df.filter( df.col('a) === df.col('b) )
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def ===[TT, W](other: ThisType[TT, U])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
equalsTo(other)
/** Inequality test against another column: the negation of `===`.
*
* {{{
* df.filter(df.col('a) =!= df.col('b))
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def =!=[TT, W](other: ThisType[TT, U])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
typed(Not(equalsTo(other).expr))
/** Inequality test against a constant: the negation of `===`.
*
* {{{
* df.filter(df.col('a) =!= "a")
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def =!=(u: U): ThisType[T, Boolean] = typed(Not(equalsTo(lit(u)).expr))
/** True if the current expression is an Option and it's None
* (implemented as a null check on the expression).
*
* apache/spark
*/
def isNone(implicit i0: U <:< Option[_]): ThisType[T, Boolean] =
typed(IsNull(expr))
/** True if the current expression is an Option and it's not None
* (implemented as a not-null check on the expression).
*
* apache/spark
*/
def isNotNone(implicit i0: U <:< Option[_]): ThisType[T, Boolean] =
typed(IsNotNull(expr))
/** True if the current expression is a fractional number and is NaN.
*
* apache/spark
*/
def isNaN(implicit n: CatalystNaN[U]): ThisType[T, Boolean] =
typed(self.untyped.isNaN)
/**
* True if the value for this optional column `exists` as expected
* (see `Option.exists`). Evaluates to false when the column is `None`.
*
* {{{
* df.col('opt).isSome(_ === someOtherCol)
* }}}
*
* @param exists the predicate applied to the unwrapped column
*/
def isSome[V](exists: ThisType[T, V] => ThisType[T, Boolean])(implicit i0: U <:< Option[V]): ThisType[T, Boolean] = someOr[V](exists, false)
/**
* True if the value for this optional column `exists` as expected,
* or is `None`. (see `Option.forall`).
*
* {{{
* df.col('opt).isSomeOrNone(_ === someOtherCol)
* }}}
*
* @param exists the predicate applied to the unwrapped column
*/
def isSomeOrNone[V](exists: ThisType[T, V] => ThisType[T, Boolean])(implicit i0: U <:< Option[V]): ThisType[T, Boolean] = someOr[V](exists, true)
/** Applies `exists` to the unwrapped column and falls back to the literal `default` when the result is null. */
private def someOr[V](exists: ThisType[T, V] => ThisType[T, Boolean], default: Boolean)(implicit i0: U <:< Option[V]): ThisType[T, Boolean] = {
  val fallback = if (default) Literal.TrueLiteral else Literal.FalseLiteral
  val whenDefined = opt(i0).map(exists).expr

  typed(Coalesce(Seq(whenDefined, fallback)))
}
/** Convert an Optional column by providing a default column
* (a `Coalesce` of this expression and the default one).
*
* {{{
* df(df('opt).getOrElse(df('defaultValue)))
* }}}
*
* @param default the column used when this one is `None`
*/
def getOrElse[TT, W, Out](default: ThisType[TT, Out])(implicit i0: U =:= Option[Out], i1: With.Aux[T, TT, W]): ThisType[W, Out] =
typed(Coalesce(Seq(expr, default.expr)))(default.uencoder)
/** Convert an Optional column by providing a default constant.
*
* {{{
* df( df('opt).getOrElse(defaultConstant) )
* }}}
*
* @param default the constant used when this column is `None`
*/
def getOrElse[Out: TypedEncoder](default: Out)(implicit i0: U =:= Option[Out]): ThisType[T, Out] =
getOrElse(lit[Out](default))
/** Sum of this expression and another expression.
*
* {{{
* // The following selects the sum of a person's height and weight.
* people.select( people.col('height) plus people.col('weight) )
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def plus[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
typed(self.untyped.plus(other.untyped))
/** Sum of this expression and another expression. Operator alias of `plus`.
* {{{
* // The following selects the sum of a person's height and weight.
* people.select( people.col('height) + people.col('weight) )
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def +[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
plus(other)
/** Sum of this expression (column) with a constant.
* {{{
* // The following selects a person's height increased by 2.
* people.select( people('height) + 2 )
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def +(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] =
typed(self.untyped.plus(u))
/**
* Inversion of boolean expression, i.e. NOT.
* {{{
* // Select rows that are not active (isActive === false)
* df.filter( !df('isActive) )
* }}}
*
* @param i0 evidence that this column is a Boolean column
* apache/spark
*/
def unary_!(implicit i0: U <:< Boolean): ThisType[T, Boolean] =
typed(!untyped)
/** Unary minus, i.e. negate the expression.
* {{{
* // Select the amount column and negate all values.
* df.select( -df('amount) )
* }}}
*
* apache/spark
*/
def unary_-(implicit n: CatalystNumeric[U]): ThisType[T, U] =
typed(-self.untyped)
/** Subtraction. Subtract the other expression from this expression.
* {{{
* // The following selects the difference between people's height and their weight.
* people.select( people.col('height) minus people.col('weight) )
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def minus[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
typed(self.untyped.minus(other.untyped))
/** Subtraction. Subtract the other expression from this expression. Operator alias of `minus`.
* {{{
* // The following selects the difference between people's height and their weight.
* people.select( people.col('height) - people.col('weight) )
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def -[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
minus(other)
/** Subtraction. Subtract a constant from this expression.
* {{{
* // The following selects a person's height decreased by 1.
* people.select( people('height) - 1 )
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def -(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] =
typed(self.untyped.minus(u))
/** Multiplication of this expression and another expression.
 * {{{
 * // The following multiplies a person's height by their weight.
 * people.select( people.col('height) multiply people.col('weight) )
 * }}}
 *
 * When `U` is `BigDecimal`, both operands are first cast to `DecimalType(20, 14)`.
 *
 * @param other another column of the same type
 * @param t class tag used to detect `BigDecimal` columns
 * apache/spark
 */
def multiply[TT, W]
  (other: ThisType[TT, U])
  (implicit
    n: CatalystNumeric[U],
    w: With.Aux[T, TT, W],
    t: ClassTag[U]
  ): ThisType[W, U] = typed {
    // `classOf` avoids allocating a throw-away BigDecimal just to obtain its runtime class.
    if (t.runtimeClass == classOf[BigDecimal]) {
      // That's apparently the only way to get sound multiplication.
      // See https://issues.apache.org/jira/browse/SPARK-22036
      val dt = DecimalType(20, 14)
      self.untyped.cast(dt).multiply(other.untyped.cast(dt))
    } else {
      self.untyped.multiply(other.untyped)
    }
  }
/** Multiplication of this expression and another expression. Operator alias of `multiply`.
* {{{
* // The following multiplies a person's height by their weight.
* people.select( people.col('height) * people.col('weight) )
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def *[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W], t: ClassTag[U]): ThisType[W, U] =
multiply(other)
/** Multiplication of this expression by a constant.
* {{{
* // The following multiplies a person's height by 2.
* people.select( people.col('height) * 2 )
* }}}
*
* NOTE(review): unlike the column overload of `multiply`, this overload applies no
* `DecimalType` cast to `BigDecimal` operands — confirm whether SPARK-22036 is a concern here.
*
* @param u a constant of the same type
* apache/spark
*/
def *(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] =
typed(self.untyped.multiply(u))
/** Modulo (a.k.a. remainder) expression.
*
* @tparam Out type of the resulting column, chosen by the caller
* @param other another column of the same type
* apache/spark
*/
def mod[Out: TypedEncoder, TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, Out] =
typed(self.untyped.mod(other.untyped))
/** Modulo (a.k.a. remainder) expression. Operator alias of `mod` keeping the column type `U`.
*
* @param other another column of the same type
* apache/spark
*/
def %[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
mod(other)
/** Modulo (a.k.a. remainder) of this expression by a constant.
*
* @param u a constant of the same type
* apache/spark
*/
def %(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] =
typed(self.untyped.mod(u))
/** Division of this expression by another expression.
* {{{
* // The following divides a person's height by their weight.
* people.select( people('height) divide people('weight) )
* }}}
*
* @tparam Out type of the quotient, as given by `CatalystDivisible[U, Out]`
* @param other another column of the same type
* apache/spark
*/
def divide[Out: TypedEncoder, TT, W](other: ThisType[TT, U])(implicit n: CatalystDivisible[U, Out], w: With.Aux[T, TT, W]): ThisType[W, Out] =
typed(self.untyped.divide(other.untyped))
/** Division of this expression by another expression. Operator alias of `divide`.
* {{{
* // The following divides a person's height by their weight.
* people.select( people('height) / people('weight) )
* }}}
*
* @tparam Out type of the quotient, as given by `CatalystDivisible[U, Out]`
* @param other another column of the same type
* apache/spark
*/
def /[Out, TT, W](other: ThisType[TT, U])(implicit n: CatalystDivisible[U, Out], e: TypedEncoder[Out], w: With.Aux[T, TT, W]): ThisType[W, Out] =
divide(other)
/** Division of this expression by a constant. The result is typed as a `Double` column
* whatever `U` is.
* {{{
* // The following divides a person's height by 2.
* people.select( people('height) / 2 )
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def /(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, Double] =
typed(self.untyped.divide(u))
/** Returns a descending ordering used in sorting.
*
* @param catalystOrdered evidence that `U` can be ordered
* apache/spark
*/
def desc(implicit catalystOrdered: CatalystOrdered[U]): SortedTypedColumn[T, U] =
new SortedTypedColumn[T, U](untyped.desc)
/** Returns an ascending ordering used in sorting.
*
* @param catalystOrdered evidence that `U` can be ordered
* apache/spark
*/
def asc(implicit catalystOrdered: CatalystOrdered[U]): SortedTypedColumn[T, U] =
new SortedTypedColumn[T, U](untyped.asc)
/** Bitwise AND of this expression and a constant.
* {{{
* df.select(df.col('colA) bitwiseAND 1)
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def bitwiseAND(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
typed(self.untyped.bitwiseAND(u))
/** Bitwise AND of this expression and another expression.
* {{{
* df.select(df.col('colA) bitwiseAND (df.col('colB)))
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def bitwiseAND[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
typed(self.untyped.bitwiseAND(other.untyped))
/** Bitwise AND of this expression and a constant (of same type). Operator alias of `bitwiseAND`.
* {{{
* df.select(df.col('colA).cast[Int] & -1)
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def &(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
bitwiseAND(u)
/** Bitwise AND of this expression and another expression. Operator alias of `bitwiseAND`.
* {{{
* df.select(df.col('colA) & (df.col('colB)))
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def &[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
bitwiseAND(other)
/** Bitwise OR of this expression and a constant.
* {{{
* df.select(df.col('colA) bitwiseOR 1)
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def bitwiseOR(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
typed(self.untyped.bitwiseOR(u))
/** Bitwise OR of this expression and another expression.
* {{{
* df.select(df.col('colA) bitwiseOR (df.col('colB)))
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def bitwiseOR[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
typed(self.untyped.bitwiseOR(other.untyped))
/** Bitwise OR of this expression and a constant (of same type). Operator alias of `bitwiseOR`.
* {{{
* df.select(df.col('colA).cast[Long] | 1L)
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def |(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
bitwiseOR(u)
/** Bitwise OR of this expression and another expression. Operator alias of `bitwiseOR`.
* {{{
* df.select(df.col('colA) | (df.col('colB)))
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def |[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
bitwiseOR(other)
/** Bitwise XOR of this expression and a constant.
* {{{
* df.select(df.col('colA) bitwiseXOR 1)
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def bitwiseXOR(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
typed(self.untyped.bitwiseXOR(u))
/** Bitwise XOR of this expression and another expression.
* {{{
* df.select(df.col('colA) bitwiseXOR (df.col('colB)))
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def bitwiseXOR[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
typed(self.untyped.bitwiseXOR(other.untyped))
/** Bitwise XOR of this expression and a constant (of same type). Operator alias of `bitwiseXOR`.
* {{{
* df.select(df.col('colA).cast[Long] ^ 1L)
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def ^(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
bitwiseXOR(u)
/** Bitwise XOR of this expression and another expression. Operator alias of `bitwiseXOR`.
* {{{
* df.select(df.col('colA) ^ (df.col('colB)))
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def ^[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
bitwiseXOR(other)
/** Casts the column to a different type, namely the Catalyst representation of `A`'s encoder.
* {{{
* df.select(df('a).cast[Int])
* }}}
*
* @tparam A the target type
* @param c evidence that `U` can be cast to `A`
*/
def cast[A: TypedEncoder](implicit c: CatalystCast[U, A]): ThisType[T, A] =
typed(self.untyped.cast(TypedEncoder[A].catalystRepr))
/**
* An expression that returns a substring, given constant bounds.
* {{{
* df.select(df('a).substr(0, 5))
* }}}
*
* @param startPos starting position
* @param len length of the substring
* @param ev evidence that this column is a String column
*/
def substr(startPos: Int, len: Int)(implicit ev: U =:= String): ThisType[T, String] =
typed(self.untyped.substr(startPos, len))
/**
* An expression that returns a substring, given bounds read from other columns.
* {{{
* df.select(df('a).substr(df('b), df('c)))
* }}}
*
* @param startPos expression for the starting position
* @param len expression for the length of the substring
* @param ev evidence that this column is a String column
*/
def substr[TT1, TT2, W1, W2](startPos: ThisType[TT1, Int], len: ThisType[TT2, Int])
(implicit
ev: U =:= String,
w1: With.Aux[T, TT1, W1],
w2: With.Aux[W1, TT2, W2]): ThisType[W2, String] =
typed(self.untyped.substr(startPos.untyped, len.untyped))
/** SQL like expression. Returns a boolean column based on a SQL LIKE match.
* {{{
* val ds = TypedDataset.create(X2("foo", "bar") :: Nil)
* // true
* ds.select(ds('a).like("foo"))
*
* // Selected column has value "bar"
* ds.select(when(ds('a).like("f"), ds('a)).otherwise(ds('b)))
* }}}
*
* @param literal the LIKE pattern
* apache/spark
*/
def like(literal: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
typed(self.untyped.like(literal))
/** SQL RLIKE expression (LIKE with Regex). Returns a boolean column based on a regex match.
* {{{
* val ds = TypedDataset.create(X1("foo") :: Nil)
* // true
* ds.select(ds('a).rlike("foo"))
*
* // true
* ds.select(ds('a).rlike(".*"))
* }}}
*
* @param literal the regular expression
* apache/spark
*/
def rlike(literal: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
typed(self.untyped.rlike(literal))
/** String contains another string literal.
* {{{
* df.filter ( df.col('a).contains("foo") )
* }}}
*
* @param other a string that is being tested against.
* @param ev evidence that this column is a String column
* apache/spark
*/
def contains(other: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
typed(self.untyped.contains(other))
/** String contains.
* {{{
* df.filter ( df.col('a).contains(df.col('b)) )
* }}}
*
* @param other a column whose values are used as the string that is being tested against.
* apache/spark
*/
def contains[TT, W](other: ThisType[TT, U])(implicit ev: U =:= String, w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
typed(self.untyped.contains(other.untyped))
/** String starts with another string literal.
* {{{
* df.filter ( df.col('a).startsWith("foo") )
* }}}
*
* @param other a prefix that is being tested against.
* apache/spark
*/
def startsWith(other: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
typed(self.untyped.startsWith(other))
/** String starts with.
* {{{
* df.filter ( df.col('a).startsWith(df.col('b)) )
* }}}
*
* @param other a column whose values are used as the prefix that is being tested against.
* apache/spark
*/
def startsWith[TT, W](other: ThisType[TT, U])(implicit ev: U =:= String, w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
typed(self.untyped.startsWith(other.untyped))
/** String ends with another string literal.
* {{{
* df.filter ( df.col('a).endsWith("foo") )
* }}}
*
* @param other a suffix that is being tested against.
* apache/spark
*/
def endsWith(other: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
typed(self.untyped.endsWith(other))
/** String ends with.
* {{{
* df.filter ( df.col('a).endsWith(df.col('b)) )
* }}}
*
* @param other a column whose values are used as the suffix that is being tested against.
* apache/spark
*/
def endsWith[TT, W](other: ThisType[TT, U])(implicit ev: U =:= String, w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
typed(self.untyped.endsWith(other.untyped))
/** Boolean AND.
* {{{
* df.filter ( (df.col('a) === 1).and(df.col('b) > 5) )
* }}}
*
* @param other another Boolean column
*/
def and[TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
typed(self.untyped.and(other.untyped))
/** Boolean AND. Operator alias of `and`.
* {{{
* df.filter ( df.col('a) === 1 && df.col('b) > 5)
* }}}
*
* @param other another Boolean column
*/
def && [TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
and(other)
/** Boolean OR.
* {{{
* df.filter ( (df.col('a) === 1).or(df.col('b) > 5) )
* }}}
*
* @param other another Boolean column
*/
def or[TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
typed(self.untyped.or(other.untyped))
/** Boolean OR. Operator alias of `or`.
* {{{
* df.filter ( df.col('a) === 1 || df.col('b) > 5)
* }}}
*
* @param other another Boolean column
*/
def || [TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
or(other)
/** Less than.
*
* {{{
* // The following selects whether people are younger than the maxAge column.
* df.select( df('age) < df('maxAge) )
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def <[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
typed(self.untyped < other.untyped)
/** Less than or equal to.
*
* {{{
* // The following selects whether people are younger than or as old as the maxAge column.
* df.select( df('age) <= df('maxAge) )
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def <=[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
typed(self.untyped <= other.untyped)
/** Greater than.
* {{{
* // The following selects whether people are older than the maxAge column.
* df.select( df('age) > df('maxAge) )
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def >[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
typed(self.untyped > other.untyped)
/** Greater than or equal.
* {{{
* // The following selects whether people are older than or as old as the maxAge column.
* df.select( df('age) >= df('maxAge) )
* }}}
*
* @param other another column of the same type
* apache/spark
*/
def >=[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
typed(self.untyped >= other.untyped)
/** Less than a constant. The constant is lifted with `lit` using this column's encoder.
* {{{
* // The following selects whether people are younger than 21.
* df.select( df('age) < 21 )
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def <(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
typed(self.untyped < lit(u)(self.uencoder).untyped)
/** Less than or equal to a constant. The constant is lifted with `lit` using this column's encoder.
* {{{
* // The following selects whether people are younger than 22.
* df.select( df('age) <= 21 )
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def <=(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
typed(self.untyped <= lit(u)(self.uencoder).untyped)
/** Greater than a constant. The constant is lifted with `lit` using this column's encoder.
* {{{
* // The following selects whether people are older than 21.
* df.select( df('age) > 21 )
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def >(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
typed(self.untyped > lit(u)(self.uencoder).untyped)
/** Greater than or equal to a constant. The constant is lifted with `lit` using this column's encoder.
* {{{
* // The following selects whether people are older than 20.
* df.select( df('age) >= 21 )
* }}}
*
* @param u a constant of the same type
* apache/spark
*/
def >=(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
typed(self.untyped >= lit(u)(self.uencoder).untyped)
/**
* Returns true if the value of this column is contained in one of the arguments.
* {{{
* // The following selects whether people have age 15, 20, or 30.
* df.select( df('age).isin(15, 20, 30) )
* }}}
*
* @param values are constants of the same type
* apache/spark
*/
def isin(values: U*)(implicit e: CatalystIsin[U]): ThisType[T, Boolean] =
typed(self.untyped.isin(values:_*))
/**
* True if the current column is between the lower bound and upper bound, inclusive.
* Both bounds are lifted with `lit` using this column's encoder.
*
* @param lowerBound a constant of the same type
* @param upperBound a constant of the same type
* apache/spark
*/
def between(lowerBound: U, upperBound: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
typed(self.untyped.between(lit(lowerBound)(self.uencoder).untyped, lit(upperBound)(self.uencoder).untyped))
/**
* True if the current column is between the lower bound and upper bound, inclusive.
* The resulting dataset type `W2` combines `T` with the dataset types of both bounds.
*
* @param lowerBound another column of the same type
* @param upperBound another column of the same type
* apache/spark
*/
def between[TT1, TT2, W1, W2](lowerBound: ThisType[TT1, U], upperBound: ThisType[TT2, U])
(implicit
i0: CatalystOrdered[U],
w0: With.Aux[T, TT1, W1],
w1: With.Aux[TT2, W1, W2]
): ThisType[W2, Boolean] =
typed(self.untyped.between(lowerBound.untyped, upperBound.untyped))
/**
* Returns a nested column matching the field `symbol`.
*
* @param symbol the field symbol
* @param i0 evidence that `U` has a field named after `symbol` with type `V`
* @param i1 encoder of the nested field
* @tparam V the type of the nested field
*/
def field[V](symbol: Witness.Lt[Symbol])(implicit
i0: TypedColumn.Exists[U, symbol.T, V],
i1: TypedEncoder[V]
): ThisType[T, V] =
typed(self.untyped.getField(symbol.value.name))
}
/** A column expression built from a sort order (see `asc` / `desc` on typed columns).
*
* @tparam T phantom type representing the dataset the column is selected on
* @tparam U type of column
*/
sealed class SortedTypedColumn[T, U](val expr: Expression)(
implicit
val uencoder: TypedEncoder[U]
) extends UntypedExpression[T] {
/** Builds the sorted column from the expression held by an untyped `Column`. */
def this(column: Column)(implicit e: TypedEncoder[U]) = {
this(FramelessInternals.expr(column))
}
/** Falls back to an untyped Column wrapping the same expression. */
def untyped: Column = new Column(expr)
}
object SortedTypedColumn {

  /** Implicitly turns a plain [[TypedColumn]] into a column sorted in ascending order. */
  implicit def defaultAscending[T, U : CatalystOrdered](typedColumn: TypedColumn[T, U]): SortedTypedColumn[T, U] =
    new SortedTypedColumn[T, U](typedColumn.untyped.asc)(typedColumn.uencoder)

  /** Polymorphic function normalising columns to sorted columns: plain typed columns
   * become ascending ones, already sorted columns are left untouched.
   */
  object defaultAscendingPoly extends Poly1 {
    // Implicit definitions carry an explicit result type so that their resolution
    // does not depend on type inference or on the position of the definition.
    implicit def caseTypedColumn[T, U : CatalystOrdered]: Case.Aux[TypedColumn[T, U], SortedTypedColumn[T, U]] =
      at[TypedColumn[T, U]](c => defaultAscending(c))

    implicit def caseTypeSortedColumn[T, U]: Case.Aux[SortedTypedColumn[T, U], SortedTypedColumn[T, U]] =
      at[SortedTypedColumn[T, U]](identity)
  }
}
object TypedColumn {
/** Evidence that type `T` has column `K` with type `V`. */
@implicitNotFound(msg = "No column ${K} of type ${V} in ${T}")
trait Exists[T, K, V]
/** Evidence that following the path of columns `K` from type `T` leads to a column of type `V`. */
@implicitNotFound(msg = "No columns ${K} of type ${V} in ${T}")
trait ExistsMany[T, K <: HList, V]
object ExistsMany {
/** Inductive case: `KH` exists in `T` with type `V0`, and the remaining path `KT` is resolved from `V0`. */
implicit def deriveCons[T, KH, KT <: HList, V0, V1]
(implicit
head: Exists[T, KH, V0],
tail: ExistsMany[V0, KT, V1]
): ExistsMany[T, KH :: KT, V1] =
new ExistsMany[T, KH :: KT, V1] {}
/** Base case: a path made of the single column `K`. */
implicit def deriveHNil[T, K, V](implicit head: Exists[T, K, V]): ExistsMany[T, K :: HNil, V] =
new ExistsMany[T, K :: HNil, V] {}
}
object Exists {
/** Summons the evidence that `T` has the column named by `column`. */
def apply[T, V](column: Witness)(implicit e: Exists[T, column.T, V]): Exists[T, column.T, V] = e
/** Derives the evidence from the labelled generic representation of `T`, selecting the field keyed by `K`. */
implicit def deriveRecord[T, H <: HList, K, V]
(implicit
i0: LabelledGeneric.Aux[T, H],
i1: Selector.Aux[H, K, V]
): Exists[T, K, V] = new Exists[T, K, V] {}
}
/**
* Builds a typed column from a field-selection function; implemented by the macro
* `TypedColumnMacroImpl.applyImpl`.
*
* {{{
* import frameless.TypedColumn
*
* case class Foo(id: Int, bar: String)
*
* val colbar: TypedColumn[Foo, String] = TypedColumn { foo: Foo => foo.bar }
* val colid = TypedColumn[Foo, Int](_.id)
* }}}
*/
def apply[T, U](x: T => U): TypedColumn[T, U] =
macro TypedColumnMacroImpl.applyImpl[T, U]
}
================================================
FILE: dataset/src/main/scala/frameless/TypedColumnMacroImpl.scala
================================================
package frameless
import scala.reflect.macros.whitebox
/** Compile-time implementation of the lambda-based column constructors
 * (`TypedColumn.apply` and `TypedDataset#col`).
 *
 * The lambda has to be a chain of field selections rooted at an identifier,
 * e.g. `_.a.b`; the chain after the root becomes the dotted column name `"a.b"`.
 */
private[frameless] object TypedColumnMacroImpl {

  /** Expands `x` into `new TypedColumn[T, U](col("<path>").expr)`.
   *
   * Compilation is aborted when `x` is not a function literal, when its body is not a
   * selection chain rooted at an identifier, or when a selected name is not a stable
   * term declared by the type it is selected from.
   *
   * @param c the macro context
   * @param x tree of the lambda passed by the caller
   * @tparam T the record type the column belongs to
   * @tparam U the type of the selected column
   */
  def applyImpl[T: c.WeakTypeTag, U: c.WeakTypeTag](c: whitebox.Context)(x: c.Tree): c.Expr[TypedColumn[T, U]] = {
    import c.universe._

    val t = c.weakTypeOf[T]
    val u = c.weakTypeOf[U]

    // Single place where the resulting tree is built from the column path.
    def buildExpression(path: List[String]): c.Expr[TypedColumn[T, U]] = {
      val columnName = path.mkString(".")
      c.Expr[TypedColumn[T, U]](q"new _root_.frameless.TypedColumn[$t, $u]((org.apache.spark.sql.functions.col($columnName)).expr)")
    }

    def abort(msg: String) = c.abort(c.enclosingPosition, msg)

    // Flattens nested selections into `root :: field1 :: field2 :: ...`.
    @annotation.tailrec
    def path(in: Select, out: List[TermName]): List[TermName] =
      in.qualifier match {
        case sub: Select =>
          path(sub, in.name.toTermName :: out)
        case id: Ident =>
          id.name.toTermName :: in.name.toTermName :: out
        case u =>
          abort(s"Unsupported selection: $u")
      }

    // Walks the field names through the declared members of `current`, requiring
    // each one to be a stable term.
    @annotation.tailrec
    def check(current: Type, in: List[TermName]): Boolean = in match {
      case next :: tail =>
        val declared = current.decl(next)
        // `decl` yields `NoSymbol` when `current` does not declare `next`; calling
        // `asTerm` on it would throw inside the macro instead of reporting a
        // positioned compile error.
        if (!declared.isTerm) {
          abort(s"No such term: ${current}.${next}")
        }
        val sym = declared.asTerm
        if (!sym.isStable) {
          abort(s"Stable term expected: ${current}.${next}")
        }
        check(sym.info, tail)
      case _ =>
        true
    }

    x match {
      case fn: Function =>
        fn.body match {
          case select: Select if select.name.isTermName =>
            // Kept as a `TermName` so it can be compared with the root of the path;
            // comparing a `String` against a `TermName` is never true.
            val expectedRoot: Option[TermName] = fn.vparams match {
              case List(rt) if rt.rhs == EmptyTree =>
                Option.empty[TermName]
              case List(rt) =>
                Some(rt.name)
              case u =>
                abort(s"Select expression must have a single parameter: ${u mkString ", "}")
            }

            path(select, List.empty) match {
              case root :: tail if expectedRoot.forall(_ == root) && check(t, tail) =>
                buildExpression(tail.map(_.toString))
              case _ =>
                abort(s"Invalid select expression: $select")
            }

          case other =>
            abort(s"Select expression expected: $other")
        }

      case _ =>
        abort(s"Function expected: $x")
    }
  }
}
================================================
FILE: dataset/src/main/scala/frameless/TypedDataset.scala
================================================
package frameless
import java.util
import frameless.functions.CatalystExplodableCollection
import frameless.ops._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Column, DataFrame, Dataset, FramelessInternals, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Literal}
import org.apache.spark.sql.catalyst.plans.logical.{Join, JoinHint}
import org.apache.spark.sql.catalyst.plans.Inner
import org.apache.spark.sql.types.StructType
import shapeless._
import shapeless.labelled.FieldType
import shapeless.ops.hlist.{Diff, IsHCons, Mapper, Prepend, ToTraversable, Tupler}
import shapeless.ops.record.{Keys, Modifier, Remover, Values}
import scala.language.experimental.macros
/** [[TypedDataset]] is a safer interface for working with `Dataset`.
*
* NOTE: Prefer `TypedDataset.create` over `new TypedDataset` unless you
* know what you are doing.
*
* Documentation marked "apache/spark" is thanks to apache/spark Contributors
* at https://github.com/apache/spark, licensed under Apache v2.0 available at
* http://www.apache.org/licenses/LICENSE-2.0
*/
class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val encoder: TypedEncoder[T])
extends TypedDatasetForwarded[T] { self =>
/** Session of the wrapped `dataset`, made implicitly available inside this class. */
private implicit val spark: SparkSession = dataset.sparkSession
/** Runs one aggregate over the whole Dataset, without grouping.
 *
 * The work is delegated to `aggMany`, which yields a `Tuple1[A]`; the wrapper is then
 * removed so that the result is a plain `TypedDataset[A]`.
 *
 * apache/spark
 */
def agg[A](ca: TypedAggregate[T, A]): TypedDataset[A] = {
  implicit val ea: TypedEncoder[A] = ca.uencoder
  val wrapped: TypedDataset[Tuple1[A]] = aggMany(ca)

  // Unpack `Tuple1[A]` into `A`.
  val unwrapped: Dataset[A] = TypedEncoder[A].catalystRepr match {
    case StructType(_) =>
      // A struct result lives in column `_1`: expand it into its own fields.
      wrapped.dataset.selectExpr("_1.*").as[A](TypedExpressionEncoder[A])
    case _ =>
      // For primitive types `Tuple1[A]` has the same schema as `A`.
      wrapped.dataset.as[A](TypedExpressionEncoder[A])
  }
  TypedDataset.create(unwrapped)
}
/** Runs two aggregates over the whole Dataset, without grouping, and returns them as a pair.
 *
 * apache/spark
 */
def agg[A, B](
  ca: TypedAggregate[T, A],
  cb: TypedAggregate[T, B]
): TypedDataset[(A, B)] = {
  // Each aggregate's encoder is put in implicit scope for the result encoder `aggMany` requires.
  implicit val ea: TypedEncoder[A] = ca.uencoder
  implicit val eb: TypedEncoder[B] = cb.uencoder
  aggMany(ca, cb)
}
/** Runs three aggregates over the whole Dataset, without grouping, and returns them as a triple.
 *
 * apache/spark
 */
def agg[A, B, C](
  ca: TypedAggregate[T, A],
  cb: TypedAggregate[T, B],
  cc: TypedAggregate[T, C]
): TypedDataset[(A, B, C)] = {
  // Each aggregate's encoder is put in implicit scope for the result encoder `aggMany` requires.
  implicit val ea: TypedEncoder[A] = ca.uencoder
  implicit val eb: TypedEncoder[B] = cb.uencoder
  implicit val ec: TypedEncoder[C] = cc.uencoder
  aggMany(ca, cb, cc)
}
/** Runs four aggregates over the whole Dataset, without grouping, and returns them as a 4-tuple.
 *
 * apache/spark
 */
def agg[A, B, C, D](
  ca: TypedAggregate[T, A],
  cb: TypedAggregate[T, B],
  cc: TypedAggregate[T, C],
  cd: TypedAggregate[T, D]
): TypedDataset[(A, B, C, D)] = {
  // Each aggregate's encoder is put in implicit scope for the result encoder `aggMany` requires.
  implicit val ea: TypedEncoder[A] = ca.uencoder
  implicit val eb: TypedEncoder[B] = cb.uencoder
  implicit val ec: TypedEncoder[C] = cc.uencoder
  implicit val ed: TypedEncoder[D] = cd.uencoder
  aggMany(ca, cb, cc, cd)
}
/** Aggregates on the entire Dataset without groups, for any number of aggregate columns.
 *
 * The i-th aggregate is aliased `_i` (1-based) and the row is decoded as the tuple `Out`.
 * When at least one aggregate has a non-nullable encoder, the result is additionally
 * filtered to rows where any of those columns is not null (see the note in the body).
 *
 * apache/spark
 */
object aggMany extends ProductArgs {
  def applyProduct[U <: HList, Out0 <: HList, Out](columns: U)
    (implicit
      i0: AggregateTypes.Aux[T, U, Out0],
      i1: ToTraversable.Aux[U, List, UntypedExpression[T]],
      i2: Tupler.Aux[Out0, Out],
      i3: TypedEncoder[Out]
    ): TypedDataset[Out] = {
    // Convert and index the HList once; both the projection and the filter use it.
    val indexedColumns = columns.toList[UntypedExpression[T]].zipWithIndex

    val cols: Seq[Column] = indexedColumns.map {
      case (c, i) => new Column(c.expr).as(s"_${i+1}")
    }

    // Workaround to SPARK-20346. One alternative is to allow the result to be Vector(null) for empty DataFrames.
    // Another one would be to return an Option.
    val filterStr = indexedColumns.collect {
      case (c, i) if !c.uencoder.nullable => s"_${i+1} is not null"
    }.mkString(" or ")

    val selected = dataset.toDF().agg(cols.head, cols.tail:_*).as[Out](TypedExpressionEncoder[Out])
    TypedDataset.create[Out](if (filterStr.isEmpty) selected else selected.filter(filterStr))
  }
}
/** Re-types this [[TypedDataset]] as `U`, given `As[T, U]` evidence that supplies the encoder of `U`. */
def as[U]()(implicit as: As[T, U]): TypedDataset[U] = {
  implicit val uencoder: TypedEncoder[U] = as.encoder
  val converted = dataset.as[U](TypedExpressionEncoder[U])
  TypedDataset.create(converted)
}
/** Produces a checkpointed copy of this [[TypedDataset]].
 *
 * Checkpointing truncates the logical plan, which helps iterative algorithms whose plan
 * may otherwise grow exponentially. The data is written to files inside the directory
 * configured through `SparkContext#setCheckpointDir`.
 *
 * Unlike `Dataset#checkpoint`, the result is suspended in the effect `F[_]`.
 *
 * apache/spark
 */
def checkpoint[F[_]](eager: Boolean)(implicit F: SparkDelay[F]): F[TypedDataset[T]] =
  F.delay {
    val checkpointed = dataset.checkpoint(eager)
    TypedDataset.create[T](checkpointed)
  }
/** Projects every record onto the type `U`.
 *
 * In contrast to `as`, `U` may contain only a subset of the columns of `T`; the column
 * names and types it does contain must agree with those of `T`.
 *
 * {{{
 *   case class Foo(i: Int, j: String)
 *   case class Bar(j: String)
 *
 *   val t: TypedDataset[Foo] = ...
 *   val b: TypedDataset[Bar] = t.project[Bar]
 *
 *   case class BarErr(e: String)
 *   // The following does not compile because `Foo` doesn't have a field with name `e`
 *   val e: TypedDataset[BarErr] = t.project[BarErr]
 * }}}
 */
def project[U](implicit projector: SmartProject[T,U]): TypedDataset[U] = {
  projector(self)
}
/** Concatenates this [[TypedDataset]] with `other`, after projecting `other` onto `T`.
 *
 * This is not a set union: duplicates are kept, as with `UNION ALL` in SQL.
 *
 * Unlike `Dataset#union`, fields are aligned when possible, and the call does not
 * compile when the two schemas are incompatible.
 *
 * Example:
 * {{{
 *   case class Foo(x: Int, y: Long)
 *   case class Bar(y: Long, x: Int)
 *   case class Faz(x: Int, y: Int, z: Int)
 *
 *   foo: TypedDataset[Foo] = ...
 *   bar: TypedDataset[Bar] = ...
 *   faz: TypedDataset[Faz] = ...
 *
 *   foo union bar: TypedDataset[Foo]
 *   foo union faz: TypedDataset[Foo]
 *   // won't compile, you need to reverse order, you can't project from less fields to more
 *   faz union foo
 *
 * }}}
 *
 * apache/spark
 */
def union[U: TypedEncoder](other: TypedDataset[U])(implicit projector: SmartProject[U, T]): TypedDataset[T] = {
  val aligned = other.project[T]
  TypedDataset.create(dataset.union(aligned.dataset))
}
/** Concatenates this [[TypedDataset]] with `other`, which has the same element type.
 *
 * This is not a set union: duplicates are kept, as with `UNION ALL` in SQL.
 *
 * apache/spark
 */
def union(other: TypedDataset[T]): TypedDataset[T] = {
  val combined = dataset.union(other.dataset)
  TypedDataset.create(combined)
}
/** Counts the elements of this [[TypedDataset]].
 *
 * Unlike `Dataset#count`, the result is suspended in the effect `F[_]`.
 */
def count[F[_]]()(implicit F: SparkDelay[F]): F[Long] =
  F.delay {
    dataset.count()
  }
/** Looks up the column named by `column`; shorthand for `col`.
 *
 * {{{
 *   tf('id)
 * }}}
 *
 * The existence of the column and its type `A` are checked at compile time.
 */
def apply[A](column: Witness.Lt[Symbol])
  (implicit
    i0: TypedColumn.Exists[T, column.T, A],
    i1: TypedEncoder[A]
  ): TypedColumn[T, A] = {
  col(column)
}
/** Looks up the column named by `column`.
 *
 * {{{
 *   tf.col('id)
 * }}}
 *
 * The existence of the column and its type `A` are checked at compile time.
 */
def col[A](column: Witness.Lt[Symbol])
  (implicit
    i0: TypedColumn.Exists[T, column.T, A],
    i1: TypedEncoder[A]
  ): TypedColumn[T, A] = {
  val columnName = column.value.name
  val resolved = dataset(columnName).as[A](TypedExpressionEncoder[A])
  new TypedColumn[T, A](resolved)
}
/** Returns `TypedColumn` of type `A` given a lambda indicating the field.
 *
 * {{{
 *   td.col(_.id)
 * }}}
 *
 * It is statically checked that a column with such name exists and has type `A`.
 * The call is expanded at compile time by `TypedColumnMacroImpl.applyImpl`.
 */
def col[A](x: Function1[T, A]): TypedColumn[T, A] =
macro TypedColumnMacroImpl.applyImpl[T, A]
/** Presents the whole `TypedDataset[T]` as one column of type `TypedColumn[T,T]`.
 * {{{
 *   ts: TypedDataset[Foo] = ...
 *   ts.select(ts.asCol, ts.asCol): TypedDataset[(Foo,Foo)]
 * }}}
 */
def asCol: TypedColumn[T, T] = {
  val projectedColumn: Column = encoder.catalystRepr match {
    case _: StructType =>
      // Struct representation: bundle every column of the dataset into a single struct.
      val fieldColumns = dataset.columns.toSeq.map(name => dataset.col(name))
      org.apache.spark.sql.functions.struct(fieldColumns: _*)
    case _ =>
      // Otherwise the first column is used.
      dataset.col(dataset.columns.head)
  }
  new TypedColumn[T,T](projectedColumn)
}
/** Refers to the whole `TypedDataset[T]` through its column named `value`, typed as
 * `TypedColumn[T,T]`, so that it can appear in a join condition.
 *
 * {{{
 *   def nameJoin(ds1: TypedDataset[Person], ds2: TypedDataset[Name]) =
 *     ds1.joinLeftSemi(ds2)(ds1.col('name) === ds2.asJoinColValue)
 * }}}
 */
def asJoinColValue(implicit i0: IsValueClass[T]): TypedColumn[T, T] = {
  import _root_.frameless.syntax._

  val valueColumn = dataset.col("value")
  valueColumn.typedColumn
}
/** Resolves a column addressed by a path of symbols, one per nesting level.
 * `TypedColumn.ExistsMany` is the compile-time evidence that the path leads to `Out`.
 */
object colMany extends SingletonProductArgs {
  def applyProduct[U <: HList, Out](columns: U)
    (implicit
      i0: TypedColumn.ExistsMany[T, U, Out],
      i1: TypedEncoder[Out],
      i2: ToTraversable.Aux[U, List, Symbol]
    ): TypedColumn[T, Out] = {
    val pathNames = columns.toList[Symbol].map(symbol => symbol.name)
    val resolved = FramelessInternals.resolveExpr(dataset, pathNames)
    new TypedColumn[T, Out](resolved)
  }
}
/** Variant of `col` that marks the column as belonging to the right hand side of a join.
 * Meant for writing self-joins; a no-op in other circumstances.
 *
 * Note: In vanilla Spark, disambiguation in self-joins is achieved using
 * String based aliases, which is obviously unsafe.
 */
def colRight[A](column: Witness.Lt[Symbol])
  (implicit
    i0: TypedColumn.Exists[T, column.T, A],
    i1: TypedEncoder[A]
  ): TypedColumn[T, A] = {
  val tagged = FramelessInternals.DisambiguateRight(col(column).expr)
  new TypedColumn[T, A](tagged)
}
/** Variant of `col` that marks the column as belonging to the left hand side of a join.
 * Meant for writing self-joins; a no-op in other circumstances.
 *
 * Note: In vanilla Spark, disambiguation in self-joins is achieved using
 * String based aliases, which is obviously unsafe.
 */
def colLeft[A](column: Witness.Lt[Symbol])
  (implicit
    i0: TypedColumn.Exists[T, column.T, A],
    i1: TypedEncoder[A]
  ): TypedColumn[T, A] = {
  val tagged = FramelessInternals.DisambiguateLeft(col(column).expr)
  new TypedColumn[T, A](tagged)
}
/** Gathers every element of this [[TypedDataset]] into a `Seq`.
 *
 * All the data is moved into the application's driver process, so calling this on a
 * very large [[TypedDataset]] can crash the driver with OutOfMemoryError.
 *
 * Unlike `Dataset#collect`, the result is suspended in the effect `F[_]`.
 */
def collect[F[_]]()(implicit F: SparkDelay[F]): F[Seq[T]] =
  F.delay {
    val rows = dataset.collect()
    rows.toSeq
  }
/** Returns the first element of this [[TypedDataset]], if there is one.
 *
 * Unlike `Dataset#first`, the result is an `Option` suspended in the effect `F[_]`:
 * a `NoSuchElementException` raised by `first()` is turned into `None`.
 */
def firstOption[F[_]]()(implicit F: SparkDelay[F]): F[Option[T]] =
  F.delay {
    try Option(dataset.first())
    catch {
      case _: NoSuchElementException => None
    }
  }
/** Fetches the first `num` elements of this [[TypedDataset]] as a `Seq`.
 *
 * The data is moved into the application's driver process, so a very large `num`
 * can crash the driver with OutOfMemoryError.
 *
 * Unlike `Dataset#take`, the result is suspended in the effect `F[_]`.
 *
 * apache/spark
 */
def take[F[_]](num: Int)(implicit F: SparkDelay[F]): F[Seq[T]] =
  F.delay {
    val rows = dataset.take(num)
    rows.toSeq
  }
/** Returns an iterator over all rows of this [[TypedDataset]].
 *
 * The iterator will consume as much memory as the largest partition in this [[TypedDataset]].
 *
 * NOTE: this results in multiple Spark jobs. If the input [[TypedDataset]] is the result
 * of a wide transformation (e.g. join with different partitioners), it should be cached
 * first to avoid recomputing it.
 *
 * Unlike `Dataset#toLocalIterator()`, the result is suspended in the effect `F[_]`.
 *
 * apache/spark
 */
def toLocalIterator[F[_]]()(implicit F: SparkDelay[F]): F[util.Iterator[T]] =
  F.delay {
    dataset.toLocalIterator()
  }
/** Same as `firstOption()`. */
def headOption[F[_]]()(implicit F: SparkDelay[F]): F[Option[T]] = {
  firstOption[F]()
}
/** Same as `take(num)`. */
def head[F[_]](num: Int)(implicit F: SparkDelay[F]): F[Seq[T]] = {
  take[F](num)
}
// $COVERAGE-OFF$
/** Returns the first element by calling `Dataset#head()` directly.
 *
 * Unlike `headOption`/`firstOption`, the result is neither wrapped in an `Option` nor
 * suspended in an effect; as the deprecation message says, this method may throw.
 */
@deprecated("Method may throw exception. Use headOption or firstOption instead.", "0.5.0")
def head: T = dataset.head()
/** Returns the first element by calling `Dataset#head()` directly.
 *
 * Unlike `headOption`/`firstOption`, the result is neither wrapped in an `Option` nor
 * suspended in an effect; as the deprecation message says, this method may throw.
 */
@deprecated("Method may throw exception. Use headOption or firstOption instead.", "0.5.0")
def first: T = dataset.head()
// $COVERAGE-ON$
/** Prints the content of this [[TypedDataset]] as a table. Strings longer than 20 characters
 * are truncated and all cells are aligned right. For example:
 * {{{
 *   year  month AVG('Adj Close) MAX('Adj Close)
 *   1980  12    0.503218        0.595103
 *   1981  01    0.523289        0.570307
 *   1982  02    0.436504        0.475256
 *   1983  03    0.410516        0.442194
 *   1984  04    0.450090        0.483521
 * }}}
 * @param numRows Number of rows to show
 * @param truncate Whether to truncate long strings. If true, strings more than 20 characters
 *                 will be truncated and all cells will be aligned right
 *
 * Unlike `Dataset#show`, the result is suspended in the effect `F[_]`.
 *
 * apache/spark
 */
def show[F[_]](numRows: Int = 20, truncate: Boolean = true)(implicit F: SparkDelay[F]): F[Unit] =
  F.delay {
    dataset.show(numRows, truncate)
  }
/** Keeps only the elements for which `column` is `true`.
 *
 * Differs from `TypedDatasetForwarded#filter` by taking a `TypedColumn[T, Boolean]` instead
 * of a `T => Boolean`. A column expression saves one Spark → Scala deserialization
 * compared to a regular function, which leads to better performance.
 */
def filter(column: TypedColumn[T, Boolean]): TypedDataset[T] = {
  val predicate = column.untyped
  val kept = dataset.toDF().filter(predicate).as[T](TypedExpressionEncoder[T])
  TypedDataset.create[T](kept)
}
/** Applies `func` to every element of this [[TypedDataset]].
 *
 * Unlike `Dataset#foreach`, the result is suspended in the effect `F[_]`.
 */
def foreach[F[_]](func: T => Unit)(implicit F: SparkDelay[F]): F[Unit] =
  F.delay {
    dataset.foreach(func)
  }
/** Applies `func` to every partition of this [[TypedDataset]].
 *
 * Unlike `Dataset#foreachPartition`, the result is suspended in the effect `F[_]`.
 */
def foreachPartition[F[_]](func: Iterator[T] => Unit)(implicit F: SparkDelay[F]): F[Unit] =
  F.delay {
    dataset.foreachPartition(func)
  }
/**
 * Builds a multi-dimensional cube of the current [[TypedDataset]] over one column,
 * ready to be aggregated.
 * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
 *
 * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`.
 *
 * apache/spark
 */
def cube[K1](
  c1: TypedColumn[T, K1]
): Cube1Ops[K1, T] = {
  new Cube1Ops[K1, T](self, c1)
}
/**
 * Builds a multi-dimensional cube of the current [[TypedDataset]] over two columns,
 * ready to be aggregated.
 * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
 *
 * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`.
 *
 * apache/spark
 */
def cube[K1, K2](
  c1: TypedColumn[T, K1],
  c2: TypedColumn[T, K2]
): Cube2Ops[K1, K2, T] = {
  new Cube2Ops[K1, K2, T](self, c1, c2)
}
/**
 * Builds a multi-dimensional cube of the current [[TypedDataset]] over any number of
 * columns, ready to be aggregated.
 * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
 *
 * {{{
 *   case class MyClass(a: Int, b: Int, c: Int)
 *   val ds: TypedDataset[MyClass]
 *   val cubeDataset: TypedDataset[(Option[A], Option[B], Long)] =
 *     ds.cubeMany(ds('a), ds('b)).agg(count[MyClass]())
 *
 *   // original dataset:
 *     a     b     c
 *     10    20    1
 *     15    25    2
 *
 *   // after aggregation:
 *     _1    _2    _3
 *     15    null  1
 *     15    25    1
 *     null  null  2
 *     null  25    1
 *     null  20    1
 *     10    null  1
 *     10    20    1
 *
 * }}}
 *
 * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`.
 *
 * apache/spark
 */
object cubeMany extends ProductArgs {
  def applyProduct[TK <: HList, K <: HList, KT](groupedBy: TK)
    (implicit
      i0: ColumnTypes.Aux[T, TK, K],
      i1: Tupler.Aux[K, KT],
      i2: ToTraversable.Aux[TK, List, UntypedExpression[T]]
    ): CubeManyOps[T, TK, K, KT] = {
    new CubeManyOps[T, TK, K, KT](self, groupedBy)
  }
}
/**
 * Groups the [[TypedDataset]] by one column so that aggregations can be run on the groups.
 * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
 *
 * apache/spark
 */
def groupBy[K1](
  c1: TypedColumn[T, K1]
): GroupedBy1Ops[K1, T] = {
  new GroupedBy1Ops[K1, T](self, c1)
}
/**
 * Groups the [[TypedDataset]] by two columns so that aggregations can be run on the groups.
 * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
 *
 * apache/spark
 */
def groupBy[K1, K2](
  c1: TypedColumn[T, K1],
  c2: TypedColumn[T, K2]
): GroupedBy2Ops[K1, K2, T] = {
  new GroupedBy2Ops[K1, K2, T](self, c1, c2)
}
/**
 * Groups the [[TypedDataset]] by any number of columns so that aggregations can be run
 * on the groups.
 * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
 *
 * {{{
 *   case class MyClass(a: Int, b: Int, c: Int)
 *   val ds: TypedDataset[MyClass]
 *
 *   val cubeDataset: TypedDataset[(Option[A], Option[B], Long)] =
 *     ds.groupByMany(ds('a), ds('b)).agg(count[MyClass]())
 *
 *   // original dataset:
 *     a     b     c
 *     10    20    1
 *     15    25    2
 *
 *   // after aggregation:
 *     _1    _2    _3
 *     10    20    1
 *     15    25    1
 *
 * }}}
 *
 * apache/spark
 */
object groupByMany extends ProductArgs {
  def applyProduct[TK <: HList, K <: HList, KT](groupedBy: TK)
    (implicit
      i0: ColumnTypes.Aux[T, TK, K],
      i1: Tupler.Aux[K, KT],
      i2: ToTraversable.Aux[TK, List, UntypedExpression[T]]
    ): GroupedByManyOps[T, TK, K, KT] = {
    new GroupedByManyOps[T, TK, K, KT](self, groupedBy)
  }
}
/**
 * Builds a multi-dimensional rollup of the current [[TypedDataset]] over one column,
 * ready to be aggregated.
 * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
 *
 * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`.
 *
 * apache/spark
 */
def rollup[K1](
  c1: TypedColumn[T, K1]
): Rollup1Ops[K1, T] = {
  new Rollup1Ops[K1, T](self, c1)
}
/**
 * Builds a multi-dimensional rollup of the current [[TypedDataset]] over two columns,
 * ready to be aggregated.
 * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
 *
 * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`.
 *
 * apache/spark
 */
def rollup[K1, K2](
  c1: TypedColumn[T, K1],
  c2: TypedColumn[T, K2]
): Rollup2Ops[K1, K2, T] = {
  new Rollup2Ops[K1, K2, T](self, c1, c2)
}
/**
 * Builds a multi-dimensional rollup of the current [[TypedDataset]] over any number of
 * columns, ready to be aggregated.
 * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
 *
 * {{{
 *   case class MyClass(a: Int, b: Int, c: Int)
 *   val ds: TypedDataset[MyClass]
 *
 *   val cubeDataset: TypedDataset[(Option[A], Option[B], Long)] =
 *     ds.rollupMany(ds('a), ds('b)).agg(count[MyClass]())
 *
 *   // original dataset:
 *     a     b     c
 *     10    20    1
 *     15    25    2
 *
 *   // after aggregation:
 *     _1    _2    _3
 *     15    null  1
 *     15    25    1
 *     null  null  2
 *     10    null  1
 *     10    20    1
 *
 * }}}
 *
 * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`.
 *
 * apache/spark
 */
object rollupMany extends ProductArgs {
  def applyProduct[TK <: HList, K <: HList, KT](groupedBy: TK)
    (implicit
      i0: ColumnTypes.Aux[T, TK, K],
      i1: Tupler.Aux[K, KT],
      i2: ToTraversable.Aux[TK, List, UntypedExpression[T]]
    ): RollupManyOps[T, TK, K, KT] = {
    new RollupManyOps[T, TK, K, KT](self, groupedBy)
  }
}
/** Computes the cartesian product of `this` `Dataset` with the `other` `Dataset`. */
def joinCross[U](other: TypedDataset[U])
  (implicit e: TypedEncoder[(T, U)]): TypedDataset[(T, U)] = {
  // A cross join is expressed as a join whose condition is the literal `true`.
  val alwaysTrue = new Column(Literal(true))
  val crossed = self.dataset.joinWith(other.dataset, alwaysTrue, "cross")
  new TypedDataset(crossed)
}
/** Computes the full outer join of `this` `Dataset` with the `other` `Dataset`.
 *
 * Each result is a `Tuple2` whose sides are both wrapped in `Option`.
 */
def joinFull[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean])
  (implicit e: TypedEncoder[(Option[T], Option[U])]): TypedDataset[(Option[T], Option[U])] = {
  val joined = self.dataset.joinWith(other.dataset, condition.untyped, "full")
  val reencoded = joined.as[(Option[T], Option[U])](TypedExpressionEncoder[(Option[T], Option[U])])
  new TypedDataset(reencoded)
}
/** Computes the inner join of `this` `Dataset` with the `other` `Dataset`.
 *
 * Each result is a `Tuple2` of a left and a right element for which `condition`
 * evaluates to true.
 */
def joinInner[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean])
  (implicit e: TypedEncoder[(T, U)]): TypedDataset[(T, U)] = {
  import FramelessInternals._

  val lhsPlan = logicalPlan(dataset)
  val rhsPlan = logicalPlan(other.dataset)

  // Build the join node by hand and pass it through `disambiguate` before planning.
  val rawJoin = Join(lhsPlan, rhsPlan, Inner, Some(condition.expr), JoinHint.NONE)
  val resolvedJoin = disambiguate(rawJoin)

  val combinedPlan = joinPlan(dataset, resolvedJoin, lhsPlan, rhsPlan)
  val resultDs = mkDataset(dataset.sqlContext, combinedPlan, TypedExpressionEncoder[(T, U)])
  TypedDataset.create[(T, U)](resultDs)
}
/** Computes the left outer join of `this` `Dataset` with the `other` `Dataset`.
 *
 * Each result is a `Tuple2` whose right side is wrapped in `Option`.
 */
def joinLeft[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean])
  (implicit e: TypedEncoder[(T, Option[U])]): TypedDataset[(T, Option[U])] = {
  val joined = self.dataset.joinWith(other.dataset, condition.untyped, "left_outer")
  val reencoded = joined.as[(T, Option[U])](TypedExpressionEncoder[(T, Option[U])])
  new TypedDataset(reencoded)
}
/** Computes the left semi join of `this` `Dataset` with the `other` `Dataset`.
 *
 * The result contains elements of `this` only (type `T`), not pairs.
 */
def joinLeftSemi[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean]): TypedDataset[T] = {
  val semiJoined = self.dataset.join(other.dataset, condition.untyped, "leftsemi")
  new TypedDataset(semiJoined.as[T](TypedExpressionEncoder(encoder)))
}
/** Computes the left anti join of `this` `Dataset` with the `other` `Dataset`.
 *
 * The result contains elements of `this` only (type `T`), not pairs.
 */
def joinLeftAnti[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean]): TypedDataset[T] = {
  val antiJoined = self.dataset.join(other.dataset, condition.untyped, "leftanti")
  new TypedDataset(antiJoined.as[T](TypedExpressionEncoder(encoder)))
}
/** Computes the right outer join of `this` `Dataset` with the `other` `Dataset`.
 *
 * Each result is a `Tuple2` whose left side is wrapped in `Option`.
 */
def joinRight[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean])
  (implicit e: TypedEncoder[(Option[T], U)]): TypedDataset[(Option[T], U)] = {
  val joined = self.dataset.joinWith(other.dataset, condition.untyped, "right_outer")
  val reencoded = joined.as[(Option[T], U)](TypedExpressionEncoder[(Option[T], U)])
  new TypedDataset(reencoded)
}
/** Analyzes `join` and rewrites its condition so that attributes tagged with
 * `DisambiguateLeft` / `DisambiguateRight` are resolved against the left / right
 * side of the analyzed join respectively.
 */
private def disambiguate(join: Join): Join = {
  val analyzed = FramelessInternals
    .ofRows(dataset.sparkSession, join)
    .queryExecution
    .analyzed
    .asInstanceOf[Join]

  val rewrittenCondition = analyzed.condition.map { cond =>
    cond.transform {
      case FramelessInternals.DisambiguateLeft(tagged: AttributeReference) =>
        val leftSide = FramelessInternals.ofRows(spark, analyzed.left)
        FramelessInternals.resolveExpr(leftSide, Seq(tagged.name))

      case FramelessInternals.DisambiguateRight(tagged: AttributeReference) =>
        val rightSide = FramelessInternals.ofRows(spark, analyzed.right)
        FramelessInternals.resolveExpr(rightSide, Seq(tagged.name))

      case untouched => untouched
    }
  }

  analyzed.copy(condition = rewrittenCondition)
}
/** Turns a function `A => R` into a UDF of shape `TypedColumn[T, A] => TypedColumn[T, R]`. */
def makeUDF[A: TypedEncoder, R: TypedEncoder](f: A => R):
  TypedColumn[T, A] => TypedColumn[T, R] = {
  functions.udf(f)
}
/** Turns a function `(A1, A2) => R` into a UDF of shape
 * `(TypedColumn[T, A1], TypedColumn[T, A2]) => TypedColumn[T, R]`.
 */
def makeUDF[A1: TypedEncoder, A2: TypedEncoder, R: TypedEncoder](f: (A1, A2) => R):
  (TypedColumn[T, A1], TypedColumn[T, A2]) => TypedColumn[T, R] = {
  functions.udf(f)
}
/** Turns a function `(A1, A2, A3) => R` into a UDF of shape
 * `(TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3]) => TypedColumn[T, R]`.
 */
def makeUDF[A1: TypedEncoder, A2: TypedEncoder, A3: TypedEncoder, R: TypedEncoder](f: (A1, A2, A3) => R):
  (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3]) => TypedColumn[T, R] = {
  functions.udf(f)
}
/** Turns a function `(A1, A2, A3, A4) => R` into a UDF of shape
 * `(TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4]) => TypedColumn[T, R]`.
 */
def makeUDF[A1: TypedEncoder, A2: TypedEncoder, A3: TypedEncoder, A4: TypedEncoder, R: TypedEncoder](f: (A1, A2, A3, A4) => R):
  (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4]) => TypedColumn[T, R] = {
  functions.udf(f)
}
/** Turns a function `(A1, A2, A3, A4, A5) => R` into a UDF of shape
 * `(TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4], TypedColumn[T, A5]) => TypedColumn[T, R]`.
 */
def makeUDF[A1: TypedEncoder, A2: TypedEncoder, A3: TypedEncoder, A4: TypedEncoder, A5: TypedEncoder, R: TypedEncoder](f: (A1, A2, A3, A4, A5) => R):
  (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4], TypedColumn[T, A5]) => TypedColumn[T, R] = {
  functions.udf(f)
}
/** Type-safe projection of a single column; the result is a plain `TypedDataset[A]`.
 *
 * The projection is delegated to `selectMany`, which yields a `Tuple1[A]`; the wrapper
 * is then removed.
 * {{{
 *   d.select( d('a), d('a)+d('b), ... )
 * }}}
 */
def select[A](
  ca: TypedColumn[T, A]
): TypedDataset[A] = {
  implicit val ea: TypedEncoder[A] = ca.uencoder
  val wrapped: TypedDataset[Tuple1[A]] = selectMany(ca)

  // Unpack `Tuple1[A]` into `A`.
  val unwrapped: Dataset[A] = TypedEncoder[A].catalystRepr match {
    case StructType(_) =>
      // A struct column lives in `_1`: expand it into its own fields.
      wrapped.dataset.selectExpr("_1.*").as[A](TypedExpressionEncoder[A])
    case _ =>
      // For primitive types `Tuple1[A]` has the same schema as `A`.
      wrapped.dataset.as[A](TypedExpressionEncoder[A])
  }
  TypedDataset.create(unwrapped)
}
/** Type-safe projection of two columns into a `Tuple2[A,B]`.
 * {{{
 *   d.select( d('a), d('a)+d('b), ... )
 * }}}
 */
def select[A, B](
  ca: TypedColumn[T, A],
  cb: TypedColumn[T, B]
): TypedDataset[(A, B)] = {
  // Each column's encoder is put in implicit scope for the result encoder `selectMany` requires.
  implicit val ea: TypedEncoder[A] = ca.uencoder
  implicit val eb: TypedEncoder[B] = cb.uencoder
  selectMany(ca, cb)
}
/** Type-safe projection of three columns into a `Tuple3[A,B,...]`.
 * {{{
 *   d.select( d('a), d('a)+d('b), ... )
 * }}}
 */
def select[A, B, C](
  ca: TypedColumn[T, A],
  cb: TypedColumn[T, B],
  cc: TypedColumn[T, C]
): TypedDataset[(A, B, C)] = {
  // Each column's encoder is put in implicit scope for the result encoder `selectMany` requires.
  implicit val ea: TypedEncoder[A] = ca.uencoder
  implicit val eb: TypedEncoder[B] = cb.uencoder
  implicit val ec: TypedEncoder[C] = cc.uencoder
  selectMany(ca, cb, cc)
}
/** Type-safe projection of four columns into a `Tuple4[A,B,...]`.
 * {{{
 *   d.select( d('a), d('a)+d('b), ... )
 * }}}
 */
def select[A, B, C, D](
  ca: TypedColumn[T, A],
  cb: TypedColumn[T, B],
  cc: TypedColumn[T, C],
  cd: TypedColumn[T, D]
): TypedDataset[(A, B, C, D)] = {
  // Each column's encoder is put in implicit scope for the result encoder `selectMany` requires.
  implicit val ea: TypedEncoder[A] = ca.uencoder
  implicit val eb: TypedEncoder[B] = cb.uencoder
  implicit val ec: TypedEncoder[C] = cc.uencoder
  implicit val ed: TypedEncoder[D] = cd.uencoder
  selectMany(ca, cb, cc, cd)
}
/** Type-safe projection of five columns into a `Tuple5[A,B,...]`.
 * {{{
 *   d.select( d('a), d('a)+d('b), ... )
 * }}}
 */
def select[A, B, C, D, E](
  ca: TypedColumn[T, A],
  cb: TypedColumn[T, B],
  cc: TypedColumn[T, C],
  cd: TypedColumn[T, D],
  ce: TypedColumn[T, E]
): TypedDataset[(A, B, C, D, E)] = {
  // Each column's encoder is put in implicit scope for the result encoder `selectMany` requires.
  implicit val ea: TypedEncoder[A] = ca.uencoder
  implicit val eb: TypedEncoder[B] = cb.uencoder
  implicit val ec: TypedEncoder[C] = cc.uencoder
  implicit val ed: TypedEncoder[D] = cd.uencoder
  implicit val ee: TypedEncoder[E] = ce.uencoder
  selectMany(ca, cb, cc, cd, ce)
}
/** Type-safe projection of six columns into a `Tuple6[A,B,...]`.
 * {{{
 *   d.select( d('a), d('a)+d('b), ... )
 * }}}
 */
def select[A, B, C, D, E, F](
  ca: TypedColumn[T, A],
  cb: TypedColumn[T, B],
  cc: TypedColumn[T, C],
  cd: TypedColumn[T, D],
  ce: TypedColumn[T, E],
  cf: TypedColumn[T, F]
): TypedDataset[(A, B, C, D, E, F)] = {
  // Each column's encoder is put in implicit scope for the result encoder `selectMany` requires.
  implicit val ea: TypedEncoder[A] = ca.uencoder
  implicit val eb: TypedEncoder[B] = cb.uencoder
  implicit val ec: TypedEncoder[C] = cc.uencoder
  implicit val ed: TypedEncoder[D] = cd.uencoder
  implicit val ee: TypedEncoder[E] = ce.uencoder
  implicit val ef: TypedEncoder[F] = cf.uencoder
  selectMany(ca, cb, cc, cd, ce, cf)
}
/** Type-safe projection of seven columns into a `Tuple7[A,B,...]`.
 * {{{
 *   d.select( d('a), d('a)+d('b), ... )
 * }}}
 */
def select[A, B, C, D, E, F, G](
  ca: TypedColumn[T, A],
  cb: TypedColumn[T, B],
  cc: TypedColumn[T, C],
  cd: TypedColumn[T, D],
  ce: TypedColumn[T, E],
  cf: TypedColumn[T, F],
  cg: TypedColumn[T, G]
): TypedDataset[(A, B, C, D, E, F, G)] = {
  // Each column's encoder is put in implicit scope for the result encoder `selectMany` requires.
  implicit val ea: TypedEncoder[A] = ca.uencoder
  implicit val eb: TypedEncoder[B] = cb.uencoder
  implicit val ec: TypedEncoder[C] = cc.uencoder
  implicit val ed: TypedEncoder[D] = cd.uencoder
  implicit val ee: TypedEncoder[E] = ce.uencoder
  implicit val ef: TypedEncoder[F] = cf.uencoder
  implicit val eg: TypedEncoder[G] = cg.uencoder
  selectMany(ca, cb, cc, cd, ce, cf, cg)
}
/** Type-safe projection from type T to Tuple8[A,B,...]
* {{{
* d.select( d('a), d('a)+d('b), ... )
* }}}
*/
def select[A, B, C, D, E, F, G, H](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E],
    cf: TypedColumn[T, F],
    cg: TypedColumn[T, G],
    ch: TypedColumn[T, H]
): TypedDataset[(A, B, C, D, E, F, G, H)] = {
  // selectMany needs an implicit TypedEncoder for the output tuple; each
  // column's own encoder is placed in implicit scope for that resolution.
  implicit val ea: TypedEncoder[A] = ca.uencoder
  implicit val eb: TypedEncoder[B] = cb.uencoder
  implicit val ec: TypedEncoder[C] = cc.uencoder
  implicit val ed: TypedEncoder[D] = cd.uencoder
  implicit val ee: TypedEncoder[E] = ce.uencoder
  implicit val ef: TypedEncoder[F] = cf.uencoder
  implicit val eg: TypedEncoder[G] = cg.uencoder
  implicit val eh: TypedEncoder[H] = ch.uencoder

  selectMany(ca, cb, cc, cd, ce, cf, cg, ch)
}
/** Type-safe projection from type T to Tuple9[A,B,...]
* {{{
* d.select( d('a), d('a)+d('b), ... )
* }}}
*/
def select[A, B, C, D, E, F, G, H, I](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E],
    cf: TypedColumn[T, F],
    cg: TypedColumn[T, G],
    ch: TypedColumn[T, H],
    ci: TypedColumn[T, I]
): TypedDataset[(A, B, C, D, E, F, G, H, I)] = {
  // selectMany needs an implicit TypedEncoder for the output tuple; each
  // column's own encoder is placed in implicit scope for that resolution.
  implicit val ea: TypedEncoder[A] = ca.uencoder
  implicit val eb: TypedEncoder[B] = cb.uencoder
  implicit val ec: TypedEncoder[C] = cc.uencoder
  implicit val ed: TypedEncoder[D] = cd.uencoder
  implicit val ee: TypedEncoder[E] = ce.uencoder
  implicit val ef: TypedEncoder[F] = cf.uencoder
  implicit val eg: TypedEncoder[G] = cg.uencoder
  implicit val eh: TypedEncoder[H] = ch.uencoder
  implicit val ei: TypedEncoder[I] = ci.uencoder

  selectMany(ca, cb, cc, cd, ce, cf, cg, ch, ci)
}
/** Type-safe projection from type T to Tuple10[A,B,...]
* {{{
* d.select( d('a), d('a)+d('b), ... )
* }}}
*/
def select[A, B, C, D, E, F, G, H, I, J](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E],
    cf: TypedColumn[T, F],
    cg: TypedColumn[T, G],
    ch: TypedColumn[T, H],
    ci: TypedColumn[T, I],
    cj: TypedColumn[T, J]
): TypedDataset[(A, B, C, D, E, F, G, H, I, J)] = {
  // selectMany needs an implicit TypedEncoder for the output tuple; each
  // column's own encoder is placed in implicit scope for that resolution.
  implicit val ea: TypedEncoder[A] = ca.uencoder
  implicit val eb: TypedEncoder[B] = cb.uencoder
  implicit val ec: TypedEncoder[C] = cc.uencoder
  implicit val ed: TypedEncoder[D] = cd.uencoder
  implicit val ee: TypedEncoder[E] = ce.uencoder
  implicit val ef: TypedEncoder[F] = cf.uencoder
  implicit val eg: TypedEncoder[G] = cg.uencoder
  implicit val eh: TypedEncoder[H] = ch.uencoder
  implicit val ei: TypedEncoder[I] = ci.uencoder
  implicit val ej: TypedEncoder[J] = cj.uencoder

  selectMany(ca, cb, cc, cd, ce, cf, cg, ch, ci, cj)
}
object selectMany extends ProductArgs {
  // Projects an HList of columns: every column expression is turned into a
  // Spark Column, selected on the untyped DataFrame, and the result is read
  // back as `Out` through the expression encoder.
  def applyProduct[U <: HList, Out0 <: HList, Out](columns: U)
    (implicit
      i0: ColumnTypes.Aux[T, U, Out0],
      i1: ToTraversable.Aux[U, List, UntypedExpression[T]],
      i2: Tupler.Aux[Out0, Out],
      i3: TypedEncoder[Out]
    ): TypedDataset[Out] = {
    val sparkColumns =
      columns.toList[UntypedExpression[T]].map(untyped => new Column(untyped.expr))
    val projected = dataset.toDF().select(sparkColumns: _*)
    val typed = projected.as[Out](TypedExpressionEncoder[Out])
    TypedDataset.create[Out](typed)
  }
}
/** Sort each partition in the dataset using the columns selected. */
def sortWithinPartitions[A: CatalystOrdered](ca: SortedTypedColumn[T, A]): TypedDataset[T] =
// Single-column overload: forwards to sortWithinPartitionsMany. The CatalystOrdered
// context bound is not referenced in the body; it only constrains A for callers.
sortWithinPartitionsMany(ca)
/** Sort each partition in the dataset using the columns selected. */
def sortWithinPartitions[A: CatalystOrdered, B: CatalystOrdered](
ca: SortedTypedColumn[T, A],
cb: SortedTypedColumn[T, B]
// Two-column overload: forwards both columns, in order, to sortWithinPartitionsMany.
// The CatalystOrdered bounds are not referenced in the body; they only constrain A and B.
): TypedDataset[T] = sortWithinPartitionsMany(ca, cb)
/** Sort each partition in the dataset using the columns selected. */
def sortWithinPartitions[A: CatalystOrdered, B: CatalystOrdered, C: CatalystOrdered](
ca: SortedTypedColumn[T, A],
cb: SortedTypedColumn[T, B],
cc: SortedTypedColumn[T, C]
// Three-column overload: forwards the columns, in order, to sortWithinPartitionsMany.
// The CatalystOrdered bounds are not referenced in the body; they only constrain A, B and C.
): TypedDataset[T] = sortWithinPartitionsMany(ca, cb, cc)
/** Sort each partition in the dataset by the given column expressions
* Default sort order is ascending.
* {{{
* d.sortWithinPartitionsMany(d('a), d('b).desc, d('c).asc)
* }}}
*/
object sortWithinPartitionsMany extends ProductArgs {
  // Maps every column through SortedTypedColumn.defaultAscendingPoly (i0),
  // collects the results as untyped Spark columns and sorts each partition
  // of the underlying DataFrame by them.
  def applyProduct[U <: HList, O <: HList](columns: U)
    (implicit
      i0: Mapper.Aux[SortedTypedColumn.defaultAscendingPoly.type, U, O],
      i1: ToTraversable.Aux[O, List, SortedTypedColumn[T, _]]
    ): TypedDataset[T] = {
    val sortColumns = i0(columns).toList[SortedTypedColumn[T, _]].map(_.untyped)
    val partitionSorted = dataset.toDF().sortWithinPartitions(sortColumns: _*)
    val typed = partitionSorted.as[T](TypedExpressionEncoder[T])
    TypedDataset.create[T](typed)
  }
}
/** Orders the TypedDataset using the column selected. */
def orderBy[A: CatalystOrdered](ca: SortedTypedColumn[T, A]): TypedDataset[T] =
// Single-column overload: forwards to orderByMany. The CatalystOrdered context
// bound is not referenced in the body; it only constrains A for callers.
orderByMany(ca)
/** Orders the TypedDataset using the columns selected. */
def orderBy[A: CatalystOrdered, B: CatalystOrdered](
ca: SortedTypedColumn[T, A],
cb: SortedTypedColumn[T, B]
// Two-column overload: forwards both columns, in order, to orderByMany.
// The CatalystOrdered bounds are not referenced in the body; they only constrain A and B.
): TypedDataset[T] = orderByMany(ca, cb)
/** Orders the TypedDataset using the columns selected. */
def orderBy[A: CatalystOrdered, B: CatalystOrdered, C: CatalystOrdered](
ca: SortedTypedColumn[T, A],
cb: SortedTypedColumn[T, B],
cc: SortedTypedColumn[T, C]
// Three-column overload: forwards the columns, in order, to orderByMany.
// The CatalystOrdered bounds are not referenced in the body; they only constrain A, B and C.
): TypedDataset[T] = orderByMany(ca, cb, cc)
/** Sort the dataset by any number of column expressions.
* Default sort order is ascending.
* {{{
* d.orderByMany(d('a), d('b).desc, d('c).asc)
* }}}
*/
object orderByMany extends ProductArgs {
  // Maps every column through SortedTypedColumn.defaultAscendingPoly (i0),
  // collects the results as untyped Spark columns and orders the underlying
  // DataFrame by them.
  def applyProduct[U <: HList, O <: HList](columns: U)
    (implicit
      i0: Mapper.Aux[SortedTypedColumn.defaultAscendingPoly.type, U, O],
      i1: ToTraversable.Aux[O, List, SortedTypedColumn[T, _]]
    ): TypedDataset[T] = {
    val sortColumns = i0(columns).toList[SortedTypedColumn[T, _]].map(_.untyped)
    val ordered = dataset.toDF().orderBy(sortColumns: _*)
    val typed = ordered.as[T](TypedExpressionEncoder[T])
    TypedDataset.create[T](typed)
  }
}
/** Returns a new Dataset as a tuple with the specified
* column dropped.
* Does not allow for dropping from a single column TypedDataset
*
* {{{
* val d: TypedDataset[Foo(a: String, b: Int...)] = ???
* val result: TypedDataset[(Int, ...)] = d.dropTupled('a)
* }}}
* @param column column to drop specified as a Symbol
* @param i0 LabelledGeneric derived for T
* @param i1 Remover derived for TRep and column
* @param i2 values of T with column removed
* @param i3 tupler of values
* @param i4 evidence of encoder of the tupled values
* @tparam Out Tupled return type
* @tparam TRep shapeless' record representation of T
* @tparam Removed record of T with column removed
* @tparam ValuesFromRemoved values of T with column removed as an HList
* @tparam V value type of column in T
* @return
*/
def dropTupled[Out, TRep <: HList, Removed <: HList, ValuesFromRemoved <: HList, V]
  (column: Witness.Lt[Symbol])
  (implicit
    i0: LabelledGeneric.Aux[T, TRep],
    i1: Remover.Aux[TRep, column.T, (V, Removed)],
    i2: Values.Aux[Removed, ValuesFromRemoved],
    i3: Tupler.Aux[ValuesFromRemoved, Out],
    i4: TypedEncoder[Out]
  ): TypedDataset[Out] = {
  // The implicits only compute and validate the output type; at runtime the
  // column is dropped by name on the untyped DataFrame.
  val columnName = column.value.name
  val remaining = dataset.toDF().drop(columnName)
  val typed = remaining.as[Out](TypedExpressionEncoder[Out])
  TypedDataset.create[Out](typed)
}
/**
* Drops columns as necessary to return `U`
*
* @example
* {{{
* case class X(i: Int, j: Int, k: Boolean)
* case class Y(i: Int, k: Boolean)
* val f: TypedDataset[X] = ???
* val fNew: TypedDataset[Y] = f.drop[Y]
* }}}
*
* @tparam U the output type
*
* @see [[frameless.TypedDataset#project]]
*/
def drop[U](implicit projector: SmartProject[T,U]): TypedDataset[U] = project[U] // plain alias: delegates to project[U]
/** Appends a new column to the Dataset (the new column becomes the last element of the resulting tuple).
*
* {{{
* case class X(i: Int, j: Int)
* val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil)
* val fNew: TypedDataset[(Int,Int,Boolean)] = f.withColumnTupled(f('j) === 10)
* }}}
*/
def withColumnTupled[A: TypedEncoder, H <: HList, FH <: HList, Out]
  (ca: TypedColumn[T, A])
  (implicit
    i0: Generic.Aux[T, H],
    i1: Prepend.Aux[H, A :: HNil, FH],
    i2: Tupler.Aux[FH, Out],
    i3: TypedEncoder[Out]
  ): TypedDataset[Out] = {
  // Throwaway name for the added column (the proper name will be given by the Tuple-based encoder)
  val placeholderName = "I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMI"
  val extended = dataset.toDF().withColumn(placeholderName, ca.untyped)
  val typed = extended.as[Out](TypedExpressionEncoder[Out])
  TypedDataset.create[Out](typed)
}
/** Returns a new [[frameless.TypedDataset]] with the specified column updated with a new value
* {{{
* case class X(i: Int, j: Int)
* val f: TypedDataset[X] = TypedDataset.create(X(1,10) :: Nil)
* val fNew: TypedDataset[X] = f.withColumnReplaced('j, f('i)) // results in X(1, 1) :: Nil
* }}}
* @param column column given as a symbol to replace
* @param replacement column to replace the value with
* @param i0 Evidence that a column with the correct type and name exists
*/
def withColumnReplaced[A](
  column: Witness.Lt[Symbol],
  replacement: TypedColumn[T, A]
)(implicit
  i0: TypedColumn.Exists[T, column.T, A]
): TypedDataset[T] = {
  // Calling withColumn with the name of the existing column overwrites its
  // values; the row type therefore stays T.
  val columnName = column.value.name
  val replaced = dataset.toDF().withColumn(columnName, replacement.untyped)
  val typed = replaced.as[T](TypedExpressionEncoder[T])
  TypedDataset.create[T](typed)
}
/** Adds a column to a Dataset so long as the specified output type, `U`, has
* an extra column from `T` that has type `A`.
*
* @example
* {{{
* case class X(i: Int, j: Int)
* case class Y(i: Int, j: Int, k: Boolean)
* val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil)
* val fNew: TypedDataset[Y] = f.withColumn[Y](f('j) === 10)
* }}}
* @param ca The typed column to add
* @param i0 TypedEncoder for output type U
* @param i1 TypedEncoder for added column type A
* @param i2 the LabelledGeneric derived for T
* @param i3 the LabelledGeneric derived for U
* @param i4 proof no fields have been removed
* @param i5 diff from T to U
* @param i6 keys from newFields
* @param i7 the one and only new key
* @param i8 the one and only new field enforcing the type of A exists
* @param i9 the keys of U
* @param iA allows for traversing the keys of U
* @tparam U the output type
* @tparam A The added column type
* @tparam TRep shapeless' record representation of T
* @tparam URep shapeless' record representation of U
* @tparam UKeys the keys of U as an HList
* @tparam NewFields the added fields to T to get U
* @tparam NewKeys the keys of NewFields as an HList
* @tparam NewKey the first, and only, key in NewKeys
*
* @see [[frameless.TypedDataset.WithColumnApply#apply]]
*/
// Entry point of the two-step `withColumn[U](column)` syntax: fixes the output
// type U first so the remaining type parameters of `apply` can be inferred.
// The return type is spelled out because this is a public member.
def withColumn[U]: WithColumnApply[U] = new WithColumnApply[U]
class WithColumnApply[U] {
  // Adds `ca` under the name of the single field that U has in addition to T,
  // then selects the columns in the order of U's keys and reads the result as U.
  def apply[A, TRep <: HList, URep <: HList, UKeys <: HList, NewFields <: HList, NewKeys <: HList, NewKey <: Symbol]
    (ca: TypedColumn[T, A])
    (implicit
      i0: TypedEncoder[U],
      i1: TypedEncoder[A],
      i2: LabelledGeneric.Aux[T, TRep],
      i3: LabelledGeneric.Aux[U, URep],
      i4: Diff.Aux[TRep, URep, HNil],
      i5: Diff.Aux[URep, TRep, NewFields],
      i6: Keys.Aux[NewFields, NewKeys],
      i7: IsHCons.Aux[NewKeys, NewKey, HNil],
      i8: IsHCons.Aux[NewFields, FieldType[NewKey, A], HNil],
      i9: Keys.Aux[URep, UKeys],
      iA: ToTraversable.Aux[UKeys, Seq, Symbol]
    ): TypedDataset[U] = {
    // Head of the new-keys list; i7 proves that list has exactly one element.
    val addedName = i7.head(i6()).name
    val extended = dataset.toDF().withColumn(addedName, ca.untyped)

    // Columns looked up by name, following the key order of U.
    val orderedColumns = i9().to[Seq].map(key => extended.col(key.name))
    val typed = extended.select(orderedColumns: _*).as[U](TypedExpressionEncoder[U])
    TypedDataset.create[U](typed)
  }
}
/**
* Explodes a single column at a time. It only compiles if the type of column supports this operation.
*
* @example
*
* {{{
* case class X(i: Int, j: Array[Int])
* case class Y(i: Int, j: Int)
*
* val f: TypedDataset[X] = ???
* val fNew: TypedDataset[Y] = f.explode('j).as[Y]
* }}}
* @param column the column we wish to explode
*/
def explode[A, TRep <: HList, V[_], OutMod <: HList, OutModValues <: HList, Out]
  (column: Witness.Lt[Symbol])
  (implicit
    i0: TypedColumn.Exists[T, column.T, V[A]],
    i1: TypedEncoder[A],
    i2: CatalystExplodableCollection[V],
    i3: LabelledGeneric.Aux[T, TRep],
    i4: Modifier.Aux[TRep, column.T, V[A], A, OutMod],
    i5: Values.Aux[OutMod, OutModValues],
    i6: Tupler.Aux[OutModValues, Out],
    i7: TypedEncoder[Out]
  ): TypedDataset[Out] = {
  import org.apache.spark.sql.functions.{explode => sparkExplode}

  // Overwrite the collection column in place with its exploded elements.
  val columnName = column.value.name
  val source = dataset.toDF()
  val exploded = source.withColumn(columnName, sparkExplode(source(columnName)))
  val typed = exploded.as[Out](TypedExpressionEncoder[Out])
  TypedDataset.create[Out](typed)
}
/**
* Explodes a single column at a time. It only compiles if the type of column supports this operation.
*
* @example
*
* {{{
* case class X(i: Int, j: Map[Int, Int])
* case class Y(i: Int, j: (Int, Int))
*
* val f: TypedDataset[X] = ???
* val fNew: TypedDataset[Y] = f.explodeMap('j).as[Y]
* }}}
* @param column the column we wish to explode
*/
def explodeMap[A, B, V[_, _], TRep <: HList, OutMod <: HList, OutModValues <: HList, Out]
  (column: Witness.Lt[Symbol])
  (implicit
    i0: TypedColumn.Exists[T, column.T, V[A, B]],
    i1: TypedEncoder[A],
    i2: TypedEncoder[B],
    i3: LabelledGeneric.Aux[T, TRep],
    i4: Modifier.Aux[TRep, column.T, V[A,B], (A, B), OutMod],
    i5: Values.Aux[OutMod, OutModValues],
    i6: Tupler.Aux[OutModValues, Out],
    i7: TypedEncoder[Out]
  ): TypedDataset[Out] = {
  import org.apache.spark.sql.functions.{explode => sparkExplode, struct => sparkStruct, col => sparkCol}

  val source = dataset.toDF()

  // Exploding a map yields two columns, "key" and "value", so a plain
  // withColumn(name, explode(...)) cannot be used here. Because the row may
  // already contain columns with those names, every original column is first
  // renamed with a "frameless_" prefix.
  val originalNames = source.columns.toSeq
  val prefixedNames = originalNames.map(name => s"frameless_$name")
  // Column references to the prefixed originals, used for the final projection.
  val prefixedColumns = prefixedNames.map(sparkCol)
  val prefixedTarget = s"frameless_${column.value.name}"

  val renamed = source.toDF(prefixedNames: _*)
  val withKeyValue = renamed.select(sparkCol("*"), sparkExplode(renamed(prefixedTarget)))

  // Fold the [key, value] pair back into a single struct column that replaces the map.
  val entryStruct = sparkStruct(withKeyValue("key"), withKeyValue("value"))
  val restored = withKeyValue
    .withColumn(prefixedTarget, entryStruct)
    // keep only the original columns; the loose key / value columns are dropped
    .select(prefixedColumns: _*)
    // undo the prefix renaming
    .toDF(originalNames: _*)

  val typed = restored.as[Out](TypedExpressionEncoder[Out])
  TypedDataset.create[Out](typed)
}
/**
* Flattens a column of type Option[A]. Compiles only if the selected column is of type Option[A].
*
*
* @example
*
* {{{
* case class X(i: Int, j: Option[Int])
* case class Y(i: Int, j: Int)
*
* val f: TypedDataset[X] = ???
* val fNew: TypedDataset[Y] = f.flattenOption('j).as[Y]
* }}}
*
* @param column the column we wish to flatten
*/
def flattenOption[A, TRep <: HList, V[_], OutMod <: HList, OutModValues <: HList, Out]
  (column: Witness.Lt[Symbol])
  (implicit
    i0: TypedColumn.Exists[T, column.T, V[A]],
    i1: TypedEncoder[A],
    i2: V[A] =:= Option[A],
    i3: LabelledGeneric.Aux[T, TRep],
    i4: Modifier.Aux[TRep, column.T, V[A], A, OutMod],
    i5: Values.Aux[OutMod, OutModValues],
    i6: Tupler.Aux[OutModValues, Out],
    i7: TypedEncoder[Out]
  ): TypedDataset[Out] = {
  // Rows whose selected column is null are filtered out; the remaining rows
  // are then read back with the encoder of Out.
  val source = dataset.toDF()
  val nonNullRows = source.filter(source(column.value.name).isNotNull)
  val typed = nonNullRows.as[Out](TypedExpressionEncoder[Out])
  TypedDataset.create[Out](typed)
}
}
object TypedDataset {

  /** Creates a [[frameless.TypedDataset]] from a local sequence: the data is turned into a Spark
    * `Dataset` with the session's `createDataset` and then wrapped.
    */
  def create[A](data: Seq[A])
    (implicit
      encoder: TypedEncoder[A],
      sqlContext: SparkSession
    ): TypedDataset[A] = {
    val sparkDataset = sqlContext.createDataset(data)(TypedExpressionEncoder[A])
    TypedDataset.create[A](sparkDataset)
  }

  /** Creates a [[frameless.TypedDataset]] from an `RDD`: the data is turned into a Spark
    * `Dataset` with the session's `createDataset` and then wrapped.
    */
  def create[A](data: RDD[A])
    (implicit
      encoder: TypedEncoder[A],
      sqlContext: SparkSession
    ): TypedDataset[A] = {
    val sparkDataset = sqlContext.createDataset(data)(TypedExpressionEncoder[A])
    TypedDataset.create[A](sparkDataset)
  }

  /** Wraps an existing Spark `Dataset` by going through [[createUnsafe]] on its `DataFrame` view. */
  def create[A: TypedEncoder](dataset: Dataset[A]): TypedDataset[A] =
    createUnsafe(dataset.toDF())

  /**
    * Creates a [[frameless.TypedDataset]] from a Spark [[org.apache.spark.sql.DataFrame]].
    * Note that the names and types need to align!
    *
    * This is an unsafe operation: If the schemas do not align,
    * the error will be captured at runtime (not during compilation).
    *
    * Throws an `IllegalStateException` when the number of columns of `df` differs from the
    * number of fields expected by the encoder of `A`.
    */
  def createUnsafe[A: TypedEncoder](df: DataFrame): TypedDataset[A] = {
    val typedEncoder = TypedEncoder[A]
    val actualColumns: Seq[Attribute] = df.queryExecution.analyzed.output
    val expectedFields = TypedExpressionEncoder.targetStructType(typedEncoder)
    val expectedNames: Seq[String] = expectedFields.map(_.name)

    if (actualColumns.size != expectedFields.size) {
      throw new IllegalStateException(
        s"Unsupported creation of TypedDataset with ${expectedFields.size} column(s) " +
          s"from a DataFrame with ${actualColumns.size} columns. " +
          "Try to `select()` the proper columns in the right order before calling `create()`.")
    }

    // Names are compared position by position (note: types still might not match).
    val namesDiffer = actualColumns.zip(expectedNames).exists {
      case (attribute, expectedName) => attribute.name != expectedName
    }
    val allNamesPresent = expectedNames.toSet.subsetOf(actualColumns.map(_.name).toSet)

    val reshaped =
      if (!namesDiffer) {
        // Already aligned: use the DataFrame as it is.
        df
      } else if (allNamesPresent) {
        // Every expected name exists: select them in the expected order.
        df.select(expectedNames.head, expectedNames.tail: _*)
      } else {
        // Otherwise rename the columns positionally.
        df.toDF(expectedNames: _*)
      }

    new TypedDataset[A](reshaped.as[A](TypedExpressionEncoder[A]))
  }

  /** Prefer `TypedDataset.create` over `TypedDataset.unsafeCreate` unless you
    * know what you are doing. Wraps `dataset` directly, without the column
    * checks performed by `createUnsafe`. */
  @deprecated("Prefer TypedDataset.create over TypedDataset.unsafeCreate", "0.3.0")
  def unsafeCreate[A: TypedEncoder](dataset: Dataset[A]): TypedDataset[A] =
    new TypedDataset[A](dataset)
}
================================================
FILE: dataset/src/main/scala/frameless/TypedDatasetForwarded.scala
================================================
package frameless
import java.util
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.streaming.DataStreamWriter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, DataFrameWriter, SQLContext, SparkSession}
import org.apache.spark.storage.StorageLevel
import scala.util.Random
/** This trait implements [[TypedDataset]] methods that have the same signature
* as their `Dataset` equivalent. Each method simply forwards the call to the
* underlying `Dataset`.
*
* Documentation marked "apache/spark" is thanks to apache/spark Contributors
* at https://github.com/apache/spark, licensed under Apache v2.0 available at
* http://www.apache.org/licenses/LICENSE-2.0
*/
trait TypedDatasetForwarded[T] { self: TypedDataset[T] =>
override def toString: String =
// Uses the string form of the underlying `dataset`.
dataset.toString
/**
* Returns a `SparkSession` from this [[TypedDataset]].
*/
def sparkSession: SparkSession =
// Plain delegation to the underlying `dataset`.
dataset.sparkSession
/**
* Returns a `SQLContext` from this [[TypedDataset]].
*/
def sqlContext: SQLContext =
// Plain delegation to the underlying `dataset`.
dataset.sqlContext
/**
* Returns the schema of this Dataset.
*
* apache/spark
*/
def schema: StructType =
// Plain delegation to the underlying `dataset`.
dataset.schema
/** Prints the schema of the underlying `Dataset` to the console in a nice tree format.
*
* apache/spark
*/
def printSchema(): Unit =
// Side-effecting call forwarded unchanged to the underlying `dataset`.
dataset.printSchema()
/** Prints the plans (logical and physical) to the console for debugging purposes.
*
* apache/spark
*/
def explain(extended: Boolean = false): Unit =
// `extended` (false unless given) is passed straight through to `dataset.explain`.
dataset.explain(extended)
/**
* Returns a `QueryExecution` from this [[TypedDataset]].
*
* It is the primary workflow for executing relational queries using Spark. Designed to allow easy
* access to the intermediate phases of query execution for developers.
*
* apache/spark
*/
def queryExecution: QueryExecution =
// Plain delegation to the underlying `dataset`.
dataset.queryExecution
/** Converts this strongly typed collection of data to generic Dataframe. In contrast to the
* strongly typed objects that Dataset operations work on, a Dataframe returns generic Row
* objects that allow fields to be accessed by ordinal or name.
*
* apache/spark
*/
def toDF(): DataFrame =
// Returns the untyped DataFrame view produced by the underlying `dataset`.
dataset.toDF()
/** Converts this [[TypedDataset]] to an RDD.
*
* apache/spark
*/
def rdd: RDD[T] =
// Plain delegation to the underlying `dataset`.
dataset.rdd
/** Returns a new [[TypedDataset]] that has exactly `numPartitions` partitions.
*
* apache/spark
*/
def repartition(numPartitions: Int): TypedDataset[T] =
// Forwards to `dataset.repartition` and re-wraps the result with `TypedDataset.create`.
TypedDataset.create(dataset.repartition(numPartitions))
/**
* Get the [[TypedDataset]]'s current storage level, or StorageLevel.NONE if not persisted.
*
* apache/spark
*/
def storageLevel(): StorageLevel =
// Plain delegation to the underlying `dataset`.
dataset.storageLevel
/**
* Returns the content of the [[TypedDataset]] as a Dataset of JSON strings.
*
* apache/spark
*/
def toJSON: TypedDataset[String] =
// Forwards to `dataset.toJSON` and wraps the resulting Dataset of strings with `TypedDataset.create`.
TypedDataset.create(dataset.toJSON)
/**
* Interface for saving the content of the non-streaming [[TypedDataset]] out into external storage.
*
* apache/spark
*/
def write: DataFrameWriter[T] =
// Returns the writer of the underlying `dataset` as is (no frameless wrapper).
dataset.write
/**
* Interface for saving the content of the streaming Dataset out into external storage.
*
* apache/spark
*/
def writeStream: DataStreamWriter[T] =
// Returns the stream writer of the underlying `dataset` as is (no frameless wrapper).
dataset.writeStream
/** Returns a new [[TypedDataset]] that has exactly `numPartitions` partitions.
* Similar to coalesce defined on an RDD, this operation results in a narrow dependency, e.g.
* if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of
* the 100 new partitions will claim 10 of the current partitions.
*
* apache/spark
*/
def coalesce(numPartitions: Int): TypedDataset[T] =
// Forwards to `dataset.coalesce` and re-wraps the result with `TypedDataset.create`.
TypedDataset.create(dataset.coalesce(numPartitions))
/**
* Returns an `Array` that contains all column names in this [[TypedDataset]].
*/
def columns: Array[String] =
// Plain delegation to the underlying `dataset`.
dataset.columns
/** Concise syntax for chaining custom transformations.
*
* apache/spark
*/
def transform[U](t: TypedDataset[T] => TypedDataset[U]): TypedDataset[U] =
// Applies `t` to this TypedDataset itself; the underlying `dataset` is not touched here.
t(this)
/** Returns a new Dataset by taking the first `n` rows. The difference between this function
* and `head` is that `head` is an action and returns an array (by triggering query execution)
* while `limit` returns a new Dataset.
*
* apache/spark
*/
def limit(n: Int): TypedDataset[T] =
// Forwards to `dataset.limit` and re-wraps the result with `TypedDataset.create`.
TypedDataset.create(dataset.limit(n))
/** Returns a new [[TypedDataset]] by sampling a fraction of records.
*
* apache/spark
*/
def sample(withReplacement: Boolean, fraction: Double, seed: Long = Random.nextLong()): TypedDataset[T] =
// When `seed` is omitted, the default argument draws a new `Random.nextLong()` for the call.
// All three arguments are forwarded to `dataset.sample`; the result is re-wrapped.
TypedDataset.create(dataset.sample(withReplacement, fraction, seed))
/** Returns a new [[TypedDataset]] that contains only the unique elements of this [[TypedDataset]].
*
* Note that, equality checking is performed directly on the encoded representation of the data
* and thus is not affected by a custom `equals` function defined on `T`.
*
* apache/spark
*/
def distinct: TypedDataset[T] =
// Forwards to `dataset.distinct()` and re-wraps the result with `TypedDataset.create`.
TypedDataset.create(dataset.distinct())
/**
* Returns a best-effort snapshot of the files that compose this [[TypedDataset]]. This method simply
* asks each constituent BaseRelation for its respective files and takes the union of all results.
* Depending on the source relations, this may not find all input files. Duplicates are removed.
*
* apache/spark
*/
def inputFiles: Array[String] =
// Plain delegation to the underlying `dataset`.
dataset.inputFiles
/**
* Returns true if the `collect` and `take` methods can be run locally
* (without any Spark executors).
*
* apache/spark
*/
def isLocal: Boolean =
// Plain delegation to the underlying `dataset`.
dataset.isLocal
/**
* Returns true if this [[TypedDataset]] contains one or more sources that continuously
* return data as it arrives. A [[TypedDataset]] that reads data from a streaming source
* must be executed as a `StreamingQuery` using the `start()` method in
* `DataStreamWriter`. Methods that return a single answer, e.g. `count()` or
* `collect()`, will throw an `AnalysisException` when there is a streaming
* source present.
*
* apache/spark
*/
def isStreaming: Boolean =
// Plain delegation to the underlying `dataset`.
dataset.isStreaming
/** Returns a new [[TypedDataset]] that contains only the elements of this [[TypedDataset]] that are also
* present in `other`.
*
* Note that, equality checking is performed directly on the encoded representation of the data
* and thus is not affected by a custom `equals` function defined on `T`.
*
* apache/spark
*/
def intersect(other: TypedDataset[T]): TypedDataset[T] =
// Intersects the two underlying datasets and re-wraps the result with `TypedDataset.create`.
TypedDataset.create(dataset.intersect(other.dataset))
/**
* Randomly splits this [[TypedDataset]] with the provided weights.
* Weights for splits, will be normalized if they don't sum to 1.
*
* apache/spark
*/
// $COVERAGE-OFF$ We can not test this method because it is non-deterministic.
def randomSplit(weights: Array[Double]): Array[TypedDataset[T]] = {
  // Split the underlying dataset, then wrap every part individually.
  val parts = dataset.randomSplit(weights)
  parts.map(part => TypedDataset.create[T](part))
}
// $COVERAGE-ON$
/**
* Randomly splits this [[TypedDataset]] with the provided weights.
* Weights for splits, will be normalized if they don't sum to 1.
*
* apache/spark
*/
def randomSplit(weights: Array[Double], seed: Long): Array[TypedDataset[T]] = {
  // Split the underlying dataset with the given seed, then wrap every part individually.
  val parts = dataset.randomSplit(weights, seed)
  parts.map(part => TypedDataset.create[T](part))
}
/**
* Returns a Java list that contains randomly split [[TypedDataset]] with the provided weights.
* Weights for splits, will be normalized if they don't sum to 1.
*
* apache/spark
*/
def randomSplitAsList(weights: Array[Double], seed: Long): util.List[TypedDataset[T]] =
  // Same split as randomSplit(weights, seed), exposed as a java.util.List.
  util.Arrays.asList(randomSplit(weights, seed): _*)
/** Returns a new Dataset containing rows in this Dataset but not in another Dataset.
* This is equivalent to `EXCEPT` in SQL.
*
* Note that, equality checking is performed directly on the encoded representation of the data
* and thus is not affected by a custom `equals` function defined on `T`.
*
* apache/spark
*/
def except(other: TypedDataset[T]): TypedDataset[T] =
// Applies `except` to the two underlying datasets and re-wraps the result with `TypedDataset.create`.
TypedDataset.create(dataset.except(other.dataset))
/** Persist this [[TypedDataset]] with the default storage level (`MEMORY_AND_DISK`).
*
* apache/spark
*/
def cache(): TypedDataset[T] =
// Calls `cache()` on the underlying `dataset` and re-wraps what it returns with `TypedDataset.create`.
TypedDataset.create(dataset.cache())
/** Persist this [[TypedDataset]] with the given storage level.
* @param newLevel One of: `MEMORY_ONLY`, `MEMORY_AND_DISK`, `MEMORY_ONLY_SER`,
* `MEMORY_AND_DISK_SER`, `DISK_ONLY`, `MEMORY_ONLY_2`, `MEMORY_AND_DISK_2`, etc.
*
* apache/spark
*/
def persist(newLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK): TypedDataset[T] =
// `newLevel` falls back to `StorageLevel.MEMORY_AND_DISK` when omitted; the value
// returned by `dataset.persist` is re-wrapped with `TypedDataset.create`.
TypedDataset.create(dataset.persist(newLevel))
/** Mark the [[TypedDataset]] as non-persistent, and remove all blocks for it from memory and disk.
* @param blocking Whether to block until all blocks are deleted.
*
* apache/spark
*/
def unpersist(blocking: Boolean = false): TypedDataset[T] =
// `blocking` (false unless given) is passed through to `dataset.unpersist`;
// the returned Dataset is re-wrapped with `TypedDataset.create`.
TypedDataset.create(dataset.unpersist(blocking))
// $COVERAGE-OFF$ We do not test deprecated method since forwarded methods are tested.
@deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0")
def map[U: TypedEncoder](func: T => U): TypedDataset[U] =
dese
gitextract_u5s1eutc/
├── .github/
│ ├── release-drafter.yml
│ └── workflows/
│ ├── ci.yml
│ ├── clean.yml
│ └── release-drafter.yml
├── .gitignore
├── .scalafmt.conf
├── LICENSE
├── README.md
├── build.sbt
├── cats/
│ └── src/
│ ├── main/
│ │ └── scala/
│ │ └── frameless/
│ │ └── cats/
│ │ ├── FramelessSyntax.scala
│ │ ├── SparkDelayInstances.scala
│ │ ├── SparkTask.scala
│ │ ├── implicits.scala
│ │ └── package.scala
│ └── test/
│ ├── resources/
│ │ ├── log4j.properties
│ │ └── log4j2.properties
│ └── scala/
│ └── frameless/
│ └── cats/
│ ├── FramelessSyntaxTests.scala
│ └── test.scala
├── core/
│ └── src/
│ └── main/
│ └── scala/
│ └── frameless/
│ ├── CatalystAverageable.scala
│ ├── CatalystBitShift.scala
│ ├── CatalystBitwise.scala
│ ├── CatalystCast.scala
│ ├── CatalystCollection.scala
│ ├── CatalystDivisible.scala
│ ├── CatalystIsin.scala
│ ├── CatalystNaN.scala
│ ├── CatalystNotNullable.scala
│ ├── CatalystNumeric.scala
│ ├── CatalystNumericWithJavaBigDecimal.scala
│ ├── CatalystOrdered.scala
│ ├── CatalystPivotable.scala
│ ├── CatalystRound.scala
│ ├── CatalystSummable.scala
│ ├── CatalystVariance.scala
│ ├── Injection.scala
│ ├── SQLDate.scala
│ └── SQLTimestamp.scala
├── dataset/
│ └── src/
│ ├── main/
│ │ ├── scala/
│ │ │ ├── frameless/
│ │ │ │ ├── FramelessSyntax.scala
│ │ │ │ ├── InjectionEnum.scala
│ │ │ │ ├── IsValueClass.scala
│ │ │ │ ├── Job.scala
│ │ │ │ ├── RecordEncoder.scala
│ │ │ │ ├── SparkDelay.scala
│ │ │ │ ├── TypedColumn.scala
│ │ │ │ ├── TypedColumnMacroImpl.scala
│ │ │ │ ├── TypedDataset.scala
│ │ │ │ ├── TypedDatasetForwarded.scala
│ │ │ │ ├── TypedEncoder.scala
│ │ │ │ ├── TypedExpressionEncoder.scala
│ │ │ │ ├── With.scala
│ │ │ │ ├── functions/
│ │ │ │ │ ├── AggregateFunctions.scala
│ │ │ │ │ ├── Lit.scala
│ │ │ │ │ ├── NonAggregateFunctions.scala
│ │ │ │ │ ├── Udf.scala
│ │ │ │ │ ├── UnaryFunctions.scala
│ │ │ │ │ └── package.scala
│ │ │ │ ├── ops/
│ │ │ │ │ ├── AggregateTypes.scala
│ │ │ │ │ ├── As.scala
│ │ │ │ │ ├── ColumnTypes.scala
│ │ │ │ │ ├── GroupByOps.scala
│ │ │ │ │ ├── RelationalGroupsOps.scala
│ │ │ │ │ ├── Repeat.scala
│ │ │ │ │ └── SmartProject.scala
│ │ │ │ └── syntax/
│ │ │ │ └── package.scala
│ │ │ └── org/
│ │ │ └── apache/
│ │ │ └── spark/
│ │ │ └── sql/
│ │ │ ├── FramelessInternals.scala
│ │ │ └── reflection/
│ │ │ └── package.scala
│ │ ├── spark-3/
│ │ │ └── frameless/
│ │ │ └── MapGroups.scala
│ │ └── spark-3.4+/
│ │ └── frameless/
│ │ └── MapGroups.scala
│ └── test/
│ ├── resources/
│ │ ├── log4j.properties
│ │ └── log4j2.properties
│ ├── scala/
│ │ ├── frameless/
│ │ │ ├── AsTests.scala
│ │ │ ├── BitwiseTests.scala
│ │ │ ├── CastTests.scala
│ │ │ ├── ColTests.scala
│ │ │ ├── CollectTests.scala
│ │ │ ├── ColumnTests.scala
│ │ │ ├── ColumnViaLambdaTests.scala
│ │ │ ├── CreateTests.scala
│ │ │ ├── DropTest.scala
│ │ │ ├── DropTupledTest.scala
│ │ │ ├── EncoderTests.scala
│ │ │ ├── ExplodeTests.scala
│ │ │ ├── FilterTests.scala
│ │ │ ├── FlattenTests.scala
│ │ │ ├── GroupByTests.scala
│ │ │ ├── InjectionTests.scala
│ │ │ ├── IsValueClassTests.scala
│ │ │ ├── JobTests.scala
│ │ │ ├── JoinTests.scala
│ │ │ ├── LitTests.scala
│ │ │ ├── NumericTests.scala
│ │ │ ├── OrderByTests.scala
│ │ │ ├── RecordEncoderTests.scala
│ │ │ ├── SchemaTests.scala
│ │ │ ├── SelectTests.scala
│ │ │ ├── SelfJoinTests.scala
│ │ │ ├── TypedDatasetSuite.scala
│ │ │ ├── UdtEncodedClass.scala
│ │ │ ├── WithColumnTest.scala
│ │ │ ├── WithColumnTupledTest.scala
│ │ │ ├── XN.scala
│ │ │ ├── forward/
│ │ │ │ ├── CheckpointTests.scala
│ │ │ │ ├── ColumnsTests.scala
│ │ │ │ ├── CountTests.scala
│ │ │ │ ├── DistinctTests.scala
│ │ │ │ ├── ExceptTests.scala
│ │ │ │ ├── FirstTests.scala
│ │ │ │ ├── ForeachTests.scala
│ │ │ │ ├── HeadTests.scala
│ │ │ │ ├── InputFilesTests.scala
│ │ │ │ ├── IntersectTests.scala
│ │ │ │ ├── IsLocalTests.scala
│ │ │ │ ├── IsStreamingTests.scala
│ │ │ │ ├── LimitTests.scala
│ │ │ │ ├── QueryExecutionTests.scala
│ │ │ │ ├── RandomSplitTests.scala
│ │ │ │ ├── SQLContextTests.scala
│ │ │ │ ├── SparkSessionTests.scala
│ │ │ │ ├── StorageLevelTests.scala
│ │ │ │ ├── TakeTests.scala
│ │ │ │ ├── ToJSONTests.scala
│ │ │ │ ├── ToLocalIteratorTests.scala
│ │ │ │ ├── UnionTests.scala
│ │ │ │ ├── WriteStreamTests.scala
│ │ │ │ └── WriteTests.scala
│ │ │ ├── functions/
│ │ │ │ ├── AggregateFunctionsTests.scala
│ │ │ │ ├── DateTimeStringBehaviourUtils.scala
│ │ │ │ ├── DoubleBehaviourUtils.scala
│ │ │ │ ├── NonAggregateFunctionsTests.scala
│ │ │ │ ├── UdfTests.scala
│ │ │ │ └── UnaryFunctionsTest.scala
│ │ │ ├── ops/
│ │ │ │ ├── ColumnTypesTest.scala
│ │ │ │ ├── CubeTests.scala
│ │ │ │ ├── PivotTest.scala
│ │ │ │ ├── RepeatTest.scala
│ │ │ │ ├── RollupTests.scala
│ │ │ │ ├── SmartProjectTest.scala
│ │ │ │ └── deserialized/
│ │ │ │ ├── FilterTests.scala
│ │ │ │ ├── FlatMapTests.scala
│ │ │ │ ├── MapPartitionsTests.scala
│ │ │ │ ├── MapTests.scala
│ │ │ │ └── ReduceTests.scala
│ │ │ ├── package.scala
│ │ │ ├── sql/
│ │ │ │ ├── package.scala
│ │ │ │ └── rules/
│ │ │ │ └── SQLRulesSuite.scala
│ │ │ └── syntax/
│ │ │ └── FramelessSyntaxTests.scala
│ │ └── org/
│ │ └── apache/
│ │ └── hadoop/
│ │ └── fs/
│ │ └── local/
│ │ └── StreamingFS.scala
│ ├── spark-3.2/
│ │ └── frameless/
│ │ └── sql/
│ │ └── rules/
│ │ └── FramelessLitPushDownTests.scala
│ └── spark-3.3+/
│ └── frameless/
│ └── sql/
│ └── rules/
│ └── FramelessLitPushDownTests.scala
├── docs/
│ ├── Cats.md
│ ├── FeatureOverview.md
│ ├── Injection.md
│ ├── Job.md
│ ├── TypedDataFrame.md
│ ├── TypedDatasetVsSparkDataset.md
│ ├── TypedEncoder.md
│ ├── TypedML.md
│ ├── WorkingWithCsvParquetJson.md
│ ├── directory.conf
│ ├── iris.data
│ └── iris.parquet
├── github.sbt
├── ml/
│ └── src/
│ ├── main/
│ │ └── scala/
│ │ ├── frameless/
│ │ │ └── ml/
│ │ │ ├── TypedEstimator.scala
│ │ │ ├── TypedTransformer.scala
│ │ │ ├── classification/
│ │ │ │ └── TypedRandomForestClassifier.scala
│ │ │ ├── clustering/
│ │ │ │ ├── TypedBisectingKMeans.scala
│ │ │ │ └── TypedKMeans.scala
│ │ │ ├── feature/
│ │ │ │ ├── TypedIndexToString.scala
│ │ │ │ ├── TypedStringIndexer.scala
│ │ │ │ └── TypedVectorAssembler.scala
│ │ │ ├── internals/
│ │ │ │ ├── LinearInputsChecker.scala
│ │ │ │ ├── SelectorByValue.scala
│ │ │ │ ├── TreesInputsChecker.scala
│ │ │ │ ├── UnaryInputsChecker.scala
│ │ │ │ └── VectorInputsChecker.scala
│ │ │ ├── package.scala
│ │ │ ├── params/
│ │ │ │ ├── kmeans/
│ │ │ │ │ └── KMeansInitMode.scala
│ │ │ │ ├── linears/
│ │ │ │ │ ├── LossStrategy.scala
│ │ │ │ │ └── Solver.scala
│ │ │ │ └── trees/
│ │ │ │ └── FeatureSubsetStrategy.scala
│ │ │ └── regression/
│ │ │ ├── TypedLinearRegression.scala
│ │ │ └── TypedRandomForestRegressor.scala
│ │ └── org/
│ │ └── apache/
│ │ └── spark/
│ │ └── ml/
│ │ └── FramelessInternals.scala
│ └── test/
│ └── scala/
│ └── frameless/
│ └── ml/
│ ├── FramelessMlSuite.scala
│ ├── Generators.scala
│ ├── TypedEncoderInstancesTests.scala
│ ├── classification/
│ │ ├── ClassificationIntegrationTests.scala
│ │ └── TypedRandomForestClassifierTests.scala
│ ├── clustering/
│ │ ├── BisectingKMeansTests.scala
│ │ ├── ClusteringIntegrationTests.scala
│ │ └── KMeansTests.scala
│ ├── feature/
│ │ ├── TypedIndexToStringTests.scala
│ │ ├── TypedStringIndexerTests.scala
│ │ └── TypedVectorAssemblerTests.scala
│ └── regression/
│ ├── RegressionIntegrationTests.scala
│ ├── TypedLinearRegressionTests.scala
│ └── TypedRandomForestRegressorTests.scala
├── project/
│ ├── Common.scala
│ ├── build.properties
│ └── plugins.sbt
├── refined/
│ └── src/
│ ├── main/
│ │ └── scala/
│ │ └── frameless/
│ │ └── refined/
│ │ ├── RefinedFieldEncoders.scala
│ │ └── package.scala
│ └── test/
│ └── scala/
│ └── frameless/
│ └── RefinedFieldEncoderTests.scala
└── scripts/
├── docs-build.sh
├── docs-publish.sh
└── travis-publish.sh
Condensed preview — 206 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (837K chars).
[
{
"path": ".github/release-drafter.yml",
"chars": 585,
"preview": "name-template: 'v$NEXT_MINOR_VERSION'\ntag-template: 'v$NEXT_MINOR_VERSION'\ncategories:\n - title: 'Added'\n labels:\n "
},
{
"path": ".github/workflows/ci.yml",
"chars": 7383,
"preview": "# This file was automatically generated by sbt-github-actions using the\n# githubWorkflowGenerate task. You should add an"
},
{
"path": ".github/workflows/clean.yml",
"chars": 2337,
"preview": "# This file was automatically generated by sbt-github-actions using the\n# githubWorkflowGenerate task. You should add an"
},
{
"path": ".github/workflows/release-drafter.yml",
"chars": 311,
"preview": "name: Release Drafter\n\non:\n push:\n branches:\n - master\n pull_request:\n types: [opened, reopened, synchroniz"
},
{
"path": ".gitignore",
"chars": 347,
"preview": "*.class\n*.log\n\n# sbt specific\n.bsp/\ndist/*\ntarget/\nlib_managed/\nsrc_managed/\nproject/boot/\nproject/plugins/project/\n\n# S"
},
{
"path": ".scalafmt.conf",
"chars": 776,
"preview": "version = 3.8.6\nrunner.dialect = scala213\n\nnewlines.beforeMultilineDef = keep\nnewlines.topLevelStatements = [before]\nnew"
},
{
"path": "LICENSE",
"chars": 11324,
"preview": "Apache License\n Version 2.0, January 2004\n http://www.apache.org/licens"
},
{
"path": "README.md",
"chars": 8377,
"preview": "# Frameless\n\n["
},
{
"path": "build.sbt",
"chars": 14367,
"preview": "val sparkVersion = \"3.5.8\"\nval spark34Version = \"3.4.4\"\nval spark33Version = \"3.3.4\"\nval catsCoreVersion = \"2.13.0\"\nval "
},
{
"path": "cats/src/main/scala/frameless/cats/FramelessSyntax.scala",
"chars": 760,
"preview": "package frameless\npackage cats\n\nimport _root_.cats.effect.Sync\nimport _root_.cats.syntax.all._\nimport _root_.cats.mtl.As"
},
{
"path": "cats/src/main/scala/frameless/cats/SparkDelayInstances.scala",
"chars": 324,
"preview": "package frameless\npackage cats\n\nimport _root_.cats.effect.Sync\nimport org.apache.spark.sql.SparkSession\n\ntrait SparkDela"
},
{
"path": "cats/src/main/scala/frameless/cats/SparkTask.scala",
"chars": 315,
"preview": "package frameless\npackage cats\n\nimport _root_.cats.Id\nimport _root_.cats.data.Kleisli\nimport org.apache.spark.SparkConte"
},
{
"path": "cats/src/main/scala/frameless/cats/implicits.scala",
"chars": 2476,
"preview": "package frameless\npackage cats\n\nimport _root_.cats._\nimport _root_.cats.kernel.{CommutativeMonoid, CommutativeSemigroup}"
},
{
"path": "cats/src/main/scala/frameless/cats/package.scala",
"chars": 186,
"preview": "package frameless\n\nimport _root_.cats.Id\nimport _root_.cats.data.Kleisli\nimport org.apache.spark.SparkContext\n\npackage o"
},
{
"path": "cats/src/test/resources/log4j.properties",
"chars": 9403,
"preview": "log4j.logger.akka.event.slf4j.Slf4jLogger=ERROR\nlog4j.logger.akka.event.slf4j=ERROR\nlog4j.logger.akka.remote.EndpointWri"
},
{
"path": "cats/src/test/resources/log4j2.properties",
"chars": 664,
"preview": "# Set to debug or trace if log4j initialization is failing\nstatus = warn\n\n# Name of the configuration\nname = ConsoleAppe"
},
{
"path": "cats/src/test/scala/frameless/cats/FramelessSyntaxTests.scala",
"chars": 1606,
"preview": "package frameless\npackage cats\n\nimport _root_.cats.data.ReaderT\nimport _root_.cats.effect.IO\nimport _root_.cats.effect.u"
},
{
"path": "cats/src/test/scala/frameless/cats/test.scala",
"chars": 4293,
"preview": "package frameless\npackage cats\n\nimport _root_.cats.Foldable\nimport _root_.cats.syntax.all._\n\nimport org.apache.spark.Spa"
},
{
"path": "core/src/main/scala/frameless/CatalystAverageable.scala",
"chars": 1174,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n/**\n * When averaging Spark doesn't change these types:\n "
},
{
"path": "core/src/main/scala/frameless/CatalystBitShift.scala",
"chars": 931,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n/** Spark does not return always Int on shift\n */\n\n@implic"
},
{
"path": "core/src/main/scala/frameless/CatalystBitwise.scala",
"chars": 889,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n/**\n * Types that can be bitwise ORed, ANDed, or XORed by C"
},
{
"path": "core/src/main/scala/frameless/CatalystCast.scala",
"chars": 4023,
"preview": "package frameless\n\ntrait CatalystCast[A, B]\n\nobject CatalystCast {\n private[this] val theInstance = new CatalystCast[An"
},
{
"path": "core/src/main/scala/frameless/CatalystCollection.scala",
"chars": 639,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n@implicitNotFound(\"Cannot do collection operations on colum"
},
{
"path": "core/src/main/scala/frameless/CatalystDivisible.scala",
"chars": 1160,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n/** Spark divides everything as Double, expect BigDecimals "
},
{
"path": "core/src/main/scala/frameless/CatalystIsin.scala",
"chars": 755,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n/** Types for which we can check if is in */\n@implicitNotFo"
},
{
"path": "core/src/main/scala/frameless/CatalystNaN.scala",
"chars": 508,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n/** Spark does NaN check only for these types */\n@implicitN"
},
{
"path": "core/src/main/scala/frameless/CatalystNotNullable.scala",
"chars": 677,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n@implicitNotFound(\"Cannot find evidence that type ${A} is n"
},
{
"path": "core/src/main/scala/frameless/CatalystNumeric.scala",
"chars": 916,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n/** Types that can be added, subtracted and multiplied by C"
},
{
"path": "core/src/main/scala/frameless/CatalystNumericWithJavaBigDecimal.scala",
"chars": 1331,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n/** Spark does not return always the same type as the input"
},
{
"path": "core/src/main/scala/frameless/CatalystOrdered.scala",
"chars": 2050,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\nimport shapeless.{Generic, HList, Lazy}\nimport shapeless.ops"
},
{
"path": "core/src/main/scala/frameless/CatalystPivotable.scala",
"chars": 725,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n@implicitNotFound(\"Cannot pivot on type ${A}. Currently sup"
},
{
"path": "core/src/main/scala/frameless/CatalystRound.scala",
"chars": 926,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n/** Spark does not return always long on round\n */\n@implic"
},
{
"path": "core/src/main/scala/frameless/CatalystSummable.scala",
"chars": 1168,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n/**\n * When summing Spark doesn't change these types:\n * "
},
{
"path": "core/src/main/scala/frameless/CatalystVariance.scala",
"chars": 834,
"preview": "package frameless\n\nimport scala.annotation.implicitNotFound\n\n/**\n * Spark's variance and stddev functions always return"
},
{
"path": "core/src/main/scala/frameless/Injection.scala",
"chars": 406,
"preview": "package frameless\n\n/**\n * An Injection[A, B] is a reversible function from A to B.\n *\n * Must obey `forAll { a: A => inv"
},
{
"path": "core/src/main/scala/frameless/SQLDate.scala",
"chars": 392,
"preview": "package frameless\n\n/**\n * Type for the internal Spark representation of SQL date. If the `spark.sql.functions` where typ"
},
{
"path": "core/src/main/scala/frameless/SQLTimestamp.scala",
"chars": 377,
"preview": "package frameless\n\n/**\n * Type for the Spark internal representation of a timestamp. If the `spark.sql.functions` where "
},
{
"path": "dataset/src/main/scala/frameless/FramelessSyntax.scala",
"chars": 619,
"preview": "package frameless\n\nimport org.apache.spark.sql.{Column, DataFrame, Dataset}\n\ntrait FramelessSyntax {\n implicit class Co"
},
{
"path": "dataset/src/main/scala/frameless/InjectionEnum.scala",
"chars": 1280,
"preview": "package frameless\n\nimport shapeless._\n\ntrait InjectionEnum {\n implicit val cnilInjectionEnum: Injection[CNil, String] ="
},
{
"path": "dataset/src/main/scala/frameless/IsValueClass.scala",
"chars": 522,
"preview": "package frameless\n\nimport shapeless._\nimport shapeless.labelled.FieldType\n\n/** Evidence that `T` is a Value class */\n@an"
},
{
"path": "dataset/src/main/scala/frameless/Job.scala",
"chars": 1086,
"preview": "package frameless\n\nimport org.apache.spark.sql.SparkSession\n\nsealed abstract class Job[A](implicit spark: SparkSession) "
},
{
"path": "dataset/src/main/scala/frameless/RecordEncoder.scala",
"chars": 10360,
"preview": "package frameless\n\nimport org.apache.spark.sql.FramelessInternals\n\nimport org.apache.spark.sql.catalyst.expressions._\nim"
},
{
"path": "dataset/src/main/scala/frameless/SparkDelay.scala",
"chars": 148,
"preview": "package frameless\n\nimport org.apache.spark.sql.SparkSession\n\ntrait SparkDelay[F[_]] {\n def delay[A](a: => A)(implicit s"
},
{
"path": "dataset/src/main/scala/frameless/TypedColumn.scala",
"chars": 30244,
"preview": "package frameless\n\nimport frameless.functions.{litAggr, lit => flit}\nimport frameless.syntax._\n\nimport org.apache.spark."
},
{
"path": "dataset/src/main/scala/frameless/TypedColumnMacroImpl.scala",
"chars": 2362,
"preview": "package frameless\n\nimport scala.reflect.macros.whitebox\n\nprivate[frameless] object TypedColumnMacroImpl {\n\n def applyIm"
},
{
"path": "dataset/src/main/scala/frameless/TypedDataset.scala",
"chars": 50829,
"preview": "package frameless\n\nimport java.util\nimport frameless.functions.CatalystExplodableCollection\nimport frameless.ops._\nimpor"
},
{
"path": "dataset/src/main/scala/frameless/TypedDatasetForwarded.scala",
"chars": 13566,
"preview": "package frameless\n\nimport java.util\n\nimport org.apache.spark.rdd.RDD\nimport org.apache.spark.sql.execution.QueryExecutio"
},
{
"path": "dataset/src/main/scala/frameless/TypedEncoder.scala",
"chars": 23476,
"preview": "package frameless\n\nimport java.math.BigInteger\n\nimport java.util.Date\n\nimport java.time.{ Duration, Instant, Period, Loc"
},
{
"path": "dataset/src/main/scala/frameless/TypedExpressionEncoder.scala",
"chars": 1466,
"preview": "package frameless\n\nimport org.apache.spark.sql.Encoder\nimport org.apache.spark.sql.catalyst.analysis.GetColumnByOrdinal\n"
},
{
"path": "dataset/src/main/scala/frameless/With.scala",
"chars": 796,
"preview": "package frameless\n\n/** Compute the intersection of two types:\n *\n * - With[A, A] = A\n * - With[A, B] = A with B (when"
},
{
"path": "dataset/src/main/scala/frameless/functions/AggregateFunctions.scala",
"chars": 10891,
"preview": "package frameless\npackage functions\n\nimport org.apache.spark.sql.FramelessInternals.expr\nimport org.apache.spark.sql.cat"
},
{
"path": "dataset/src/main/scala/frameless/functions/Lit.scala",
"chars": 2099,
"preview": "package frameless.functions\n\nimport org.apache.spark.sql.catalyst.InternalRow\nimport org.apache.spark.sql.catalyst.expre"
},
{
"path": "dataset/src/main/scala/frameless/functions/NonAggregateFunctions.scala",
"chars": 32941,
"preview": "package frameless\npackage functions\n\nimport org.apache.spark.sql.{Column, functions => sparkFunctions}\n\nimport scala.ann"
},
{
"path": "dataset/src/main/scala/frameless/functions/Udf.scala",
"chars": 8060,
"preview": "package frameless\npackage functions\n\nimport org.apache.spark.sql.catalyst.InternalRow\nimport org.apache.spark.sql.cataly"
},
{
"path": "dataset/src/main/scala/frameless/functions/UnaryFunctions.scala",
"chars": 3990,
"preview": "package frameless\npackage functions\n\nimport org.apache.spark.sql.{Column, functions => sparkFunctions}\n\nimport scala.mat"
},
{
"path": "dataset/src/main/scala/frameless/functions/package.scala",
"chars": 4180,
"preview": "package frameless\n\nimport scala.reflect.ClassTag\n\nimport shapeless._\nimport shapeless.labelled.FieldType\nimport shapeles"
},
{
"path": "dataset/src/main/scala/frameless/ops/AggregateTypes.scala",
"chars": 942,
"preview": "package frameless\npackage ops\n\nimport shapeless._\n\n/** A type class to extract the column types out of an HList of [[fra"
},
{
"path": "dataset/src/main/scala/frameless/ops/As.scala",
"chars": 873,
"preview": "package frameless\npackage ops\n\nimport shapeless.{::, Generic, HList, Lazy}\n\n/** Evidence for correctness of `TypedDatase"
},
{
"path": "dataset/src/main/scala/frameless/ops/ColumnTypes.scala",
"chars": 899,
"preview": "package frameless\npackage ops\n\nimport shapeless._\n\n/** A type class to extract the column types out of an HList of [[fra"
},
{
"path": "dataset/src/main/scala/frameless/ops/GroupByOps.scala",
"chars": 10891,
"preview": "package frameless\npackage ops\n\nimport org.apache.spark.sql.catalyst.analysis.UnresolvedAlias\nimport org.apache.spark.sql"
},
{
"path": "dataset/src/main/scala/frameless/ops/RelationalGroupsOps.scala",
"chars": 8306,
"preview": "package frameless\npackage ops\n\nimport org.apache.spark.sql.{Column, Dataset, RelationalGroupedDataset}\nimport shapeless."
},
{
"path": "dataset/src/main/scala/frameless/ops/Repeat.scala",
"chars": 914,
"preview": "package frameless\npackage ops\n\nimport shapeless.{HList, Nat, Succ}\nimport shapeless.ops.hlist.Prepend\n\n/** Typeclass sup"
},
{
"path": "dataset/src/main/scala/frameless/ops/SmartProject.scala",
"chars": 2248,
"preview": "package frameless\npackage ops\n\nimport shapeless.ops.hlist.ToTraversable\nimport shapeless.ops.record.{Keys, SelectAll, Va"
},
{
"path": "dataset/src/main/scala/frameless/syntax/package.scala",
"chars": 151,
"preview": "package frameless\n\npackage object syntax extends FramelessSyntax {\n implicit val DefaultSparkDelay: SparkDelay[Job] = J"
},
{
"path": "dataset/src/main/scala/org/apache/spark/sql/FramelessInternals.scala",
"chars": 3543,
"preview": "package org.apache.spark.sql\n\nimport org.apache.spark.sql.catalyst.expressions._\nimport org.apache.spark.sql.catalyst.ex"
},
{
"path": "dataset/src/main/scala/org/apache/spark/sql/reflection/package.scala",
"chars": 3096,
"preview": "package org.apache.spark.sql\n\nimport org.apache.spark.sql.catalyst.ScalaReflection.{\n cleanUpReflectionObjects,\n getCl"
},
{
"path": "dataset/src/main/spark-3/frameless/MapGroups.scala",
"chars": 503,
"preview": "package frameless\n\nimport org.apache.spark.sql.Encoder\nimport org.apache.spark.sql.catalyst.expressions.Attribute\nimport"
},
{
"path": "dataset/src/main/spark-3.4+/frameless/MapGroups.scala",
"chars": 575,
"preview": "package frameless\n\nimport org.apache.spark.sql.Encoder\nimport org.apache.spark.sql.catalyst.expressions.Attribute\nimport"
},
{
"path": "dataset/src/test/resources/log4j.properties",
"chars": 9402,
"preview": "log4j.logger.akka.event.slf4j.Slf4jLogger=ERROR\nlog4j.logger.akka.event.slf4j=ERROR\nlog4j.logger.akka.remote.EndpointWri"
},
{
"path": "dataset/src/test/resources/log4j2.properties",
"chars": 805,
"preview": "# Set to debug or trace if log4j initialization is failing\nstatus = warn\n\n# Name of the configuration\nname = ConsoleAppe"
},
{
"path": "dataset/src/test/scala/frameless/AsTests.scala",
"chars": 1621,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass AsTests extends TypedDatasetSuite {\n "
},
{
"path": "dataset/src/test/scala/frameless/BitwiseTests.scala",
"chars": 5017,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport org.scalatest.matchers.should.Matchers"
},
{
"path": "dataset/src/test/scala/frameless/CastTests.scala",
"chars": 3740,
"preview": "package frameless\n\nimport org.scalacheck.{Arbitrary, Gen, Prop}\nimport org.scalacheck.Prop._\n\nclass CastTests extends Ty"
},
{
"path": "dataset/src/test/scala/frameless/ColTests.scala",
"chars": 1414,
"preview": "package frameless\n\nimport shapeless.test.illTyped\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass ColTes"
},
{
"path": "dataset/src/test/scala/frameless/CollectTests.scala",
"chars": 3450,
"preview": "package frameless\n\nimport frameless.CollectTests.{ prop, propArray }\nimport org.apache.spark.sql.SparkSession\nimport org"
},
{
"path": "dataset/src/test/scala/frameless/ColumnTests.scala",
"chars": 17262,
"preview": "package frameless\n\nimport java.util.Date\nimport java.math.BigInteger\n\nimport java.time.{ Instant, LocalDate, Period, Dur"
},
{
"path": "dataset/src/test/scala/frameless/ColumnViaLambdaTests.scala",
"chars": 2115,
"preview": "package frameless\n\nimport org.scalatest.matchers.should.Matchers\nimport shapeless.test.illTyped\n\ncase class MyClass1(a: "
},
{
"path": "dataset/src/test/scala/frameless/CreateTests.scala",
"chars": 5487,
"preview": "package frameless\n\nimport org.scalacheck.{Arbitrary, Prop}\nimport org.scalacheck.Prop._\n\nimport scala.reflect.ClassTag\ni"
},
{
"path": "dataset/src/test/scala/frameless/DropTest.scala",
"chars": 1962,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport shapeless.test.illTyped\n\nclass DropTes"
},
{
"path": "dataset/src/test/scala/frameless/DropTupledTest.scala",
"chars": 1824,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass DropTupledTest extends TypedDatasetSui"
},
{
"path": "dataset/src/test/scala/frameless/EncoderTests.scala",
"chars": 885,
"preview": "package frameless\n\nimport scala.collection.immutable.Set\n\nimport org.scalatest.matchers.should.Matchers\n\nobject EncoderT"
},
{
"path": "dataset/src/test/scala/frameless/ExplodeTests.scala",
"chars": 3305,
"preview": "package frameless\n\nimport frameless.functions.CatalystExplodableCollection\nimport org.scalacheck.{Arbitrary, Prop}\nimpor"
},
{
"path": "dataset/src/test/scala/frameless/FilterTests.scala",
"chars": 6589,
"preview": "package frameless\n\nimport org.scalatest.matchers.should.Matchers\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop."
},
{
"path": "dataset/src/test/scala/frameless/FlattenTests.scala",
"chars": 843,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop.forAll\nimport org.scalacheck.Prop._\n\n\nclass Fla"
},
{
"path": "dataset/src/test/scala/frameless/GroupByTests.scala",
"chars": 14552,
"preview": "package frameless\n\nimport frameless.functions.aggregate._\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass"
},
{
"path": "dataset/src/test/scala/frameless/InjectionTests.scala",
"chars": 9191,
"preview": "package frameless\n\nimport frameless.CollectTests.prop\nimport org.scalacheck._\nimport org.scalacheck.Prop._\nimport shapel"
},
{
"path": "dataset/src/test/scala/frameless/IsValueClassTests.scala",
"chars": 1054,
"preview": "package frameless\n\nimport shapeless.Refute\nimport shapeless.test.illTyped\n\nimport org.scalatest.funsuite.AnyFunSuite\nimp"
},
{
"path": "dataset/src/test/scala/frameless/JobTests.scala",
"chars": 1513,
"preview": "package frameless\n\nimport org.scalacheck.Arbitrary\nimport org.scalatest.BeforeAndAfterAll\nimport org.scalatestplus.scala"
},
{
"path": "dataset/src/test/scala/frameless/JoinTests.scala",
"chars": 6983,
"preview": "package frameless\n\nimport org.apache.spark.sql.types.{StructField, StructType}\nimport org.scalacheck.Prop\nimport org.sca"
},
{
"path": "dataset/src/test/scala/frameless/LitTests.scala",
"chars": 3034,
"preview": "package frameless\n\nimport frameless.functions.lit\n\nimport org.scalatest.matchers.should.Matchers\n\nimport org.scalacheck."
},
{
"path": "dataset/src/test/scala/frameless/NumericTests.scala",
"chars": 6085,
"preview": "package frameless\n\nimport org.apache.spark.sql.Encoder\nimport org.scalacheck.{Arbitrary, Gen, Prop}\nimport org.scalachec"
},
{
"path": "dataset/src/test/scala/frameless/OrderByTests.scala",
"chars": 8547,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport shapeless.test.illTyped\nimport org.apa"
},
{
"path": "dataset/src/test/scala/frameless/RecordEncoderTests.scala",
"chars": 16000,
"preview": "package frameless\n\nimport org.apache.spark.sql.{Row, functions => F}\nimport org.apache.spark.sql.types.{\n ArrayType,\n "
},
{
"path": "dataset/src/test/scala/frameless/SchemaTests.scala",
"chars": 1530,
"preview": "package frameless\n\nimport frameless.functions.aggregate._\nimport frameless.functions._\nimport org.apache.spark.sql.types"
},
{
"path": "dataset/src/test/scala/frameless/SelectTests.scala",
"chars": 12264,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport shapeless.test.illTyped\nimport scala.r"
},
{
"path": "dataset/src/test/scala/frameless/SelfJoinTests.scala",
"chars": 6157,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport org.apache.spark.sql.{SparkSession, fu"
},
{
"path": "dataset/src/test/scala/frameless/TypedDatasetSuite.scala",
"chars": 3191,
"preview": "package frameless\n\nimport com.globalmentor.apache.hadoop.fs.BareLocalFileSystem\nimport org.apache.hadoop.fs.local.Stream"
},
{
"path": "dataset/src/test/scala/frameless/UdtEncodedClass.scala",
"chars": 1512,
"preview": "package frameless\n\nimport org.apache.spark.sql.catalyst.InternalRow\nimport org.apache.spark.sql.catalyst.expressions.{Ge"
},
{
"path": "dataset/src/test/scala/frameless/WithColumnTest.scala",
"chars": 2498,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport shapeless.test.illTyped\n\nclass WithCol"
},
{
"path": "dataset/src/test/scala/frameless/WithColumnTupledTest.scala",
"chars": 711,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass WithColumnTupledTest extends TypedData"
},
{
"path": "dataset/src/test/scala/frameless/XN.scala",
"chars": 4625,
"preview": "package frameless\n\nimport org.scalacheck.{Arbitrary, Cogen}\n\ncase class X1[A](a: A)\n\nobject X1 {\n implicit def arbitrar"
},
{
"path": "dataset/src/test/scala/frameless/forward/CheckpointTests.scala",
"chars": 573,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop.{forAll, _}\n\n\nclass CheckpointTests extends Typ"
},
{
"path": "dataset/src/test/scala/frameless/forward/ColumnsTests.scala",
"chars": 875,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop.forAll\n\nclass ColumnsTests extends TypedDataset"
},
{
"path": "dataset/src/test/scala/frameless/forward/CountTests.scala",
"chars": 332,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass CountTests extends TypedDatasetSuite {"
},
{
"path": "dataset/src/test/scala/frameless/forward/DistinctTests.scala",
"chars": 499,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport math.Ordering\n\nclass DistinctTests ext"
},
{
"path": "dataset/src/test/scala/frameless/forward/ExceptTests.scala",
"chars": 641,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass ExceptTests extends TypedDatasetSuite "
},
{
"path": "dataset/src/test/scala/frameless/forward/FirstTests.scala",
"chars": 529,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport org.scalatest.matchers.should.Matchers"
},
{
"path": "dataset/src/test/scala/frameless/forward/ForeachTests.scala",
"chars": 963,
"preview": "package frameless\npackage forward\n\nimport org.apache.spark.util.CollectionAccumulator\n\nimport org.scalacheck.Prop\nimport"
},
{
"path": "dataset/src/test/scala/frameless/forward/HeadTests.scala",
"chars": 1035,
"preview": "package frameless.forward\n\nimport frameless.{TypedDataset, TypedDatasetSuite, TypedEncoder, TypedExpressionEncoder, X1}\n"
},
{
"path": "dataset/src/test/scala/frameless/forward/InputFilesTests.scala",
"chars": 1630,
"preview": "package frameless\n\nimport java.util.UUID\n\nimport org.apache.spark.sql.SparkSession\nimport org.scalacheck.Prop\nimport org"
},
{
"path": "dataset/src/test/scala/frameless/forward/IntersectTests.scala",
"chars": 829,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport math.Ordering\n\nclass IntersectTests ex"
},
{
"path": "dataset/src/test/scala/frameless/forward/IsLocalTests.scala",
"chars": 373,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass IsLocalTests extends TypedDatasetSuite"
},
{
"path": "dataset/src/test/scala/frameless/forward/IsStreamingTests.scala",
"chars": 389,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass IsStreamingTests extends TypedDatasetS"
},
{
"path": "dataset/src/test/scala/frameless/forward/LimitTests.scala",
"chars": 481,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass LimitTests extends TypedDatasetSuite {"
},
{
"path": "dataset/src/test/scala/frameless/forward/QueryExecutionTests.scala",
"chars": 414,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop.{forAll, _}\n\nclass QueryExecutionTests extends "
},
{
"path": "dataset/src/test/scala/frameless/forward/RandomSplitTests.scala",
"chars": 1342,
"preview": "package frameless\n\nimport org.scalacheck.Arbitrary.arbitrary\nimport org.scalacheck.Prop._\nimport org.scalacheck.{Arbitra"
},
{
"path": "dataset/src/test/scala/frameless/forward/SQLContextTests.scala",
"chars": 399,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop.{forAll, _}\n\nclass SQLContextTests extends Type"
},
{
"path": "dataset/src/test/scala/frameless/forward/SparkSessionTests.scala",
"chars": 396,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass SparkSessionTests extends TypedDataset"
},
{
"path": "dataset/src/test/scala/frameless/forward/StorageLevelTests.scala",
"chars": 935,
"preview": "package frameless\n\nimport org.apache.spark.storage.StorageLevel\nimport org.apache.spark.storage.StorageLevel._\nimport or"
},
{
"path": "dataset/src/test/scala/frameless/forward/TakeTests.scala",
"chars": 794,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport scala.reflect.ClassTag\n\nclass TakeTest"
},
{
"path": "dataset/src/test/scala/frameless/forward/ToJSONTests.scala",
"chars": 395,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass ToJSONTests extends TypedDatasetSuite "
},
{
"path": "dataset/src/test/scala/frameless/forward/ToLocalIteratorTests.scala",
"chars": 565,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport scala.collection.JavaConverters._\nimpo"
},
{
"path": "dataset/src/test/scala/frameless/forward/UnionTests.scala",
"chars": 2328,
"preview": "package frameless\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport shapeless.test.illTyped\n\nclass UnionTe"
},
{
"path": "dataset/src/test/scala/frameless/forward/WriteStreamTests.scala",
"chars": 3395,
"preview": "package frameless\n\nimport java.util.UUID\n\nimport org.apache.spark.sql.Encoder\nimport org.apache.spark.sql.execution.stre"
},
{
"path": "dataset/src/test/scala/frameless/forward/WriteTests.scala",
"chars": 1880,
"preview": "package frameless\n\nimport java.util.UUID\n\nimport org.scalacheck.Prop._\nimport org.scalacheck.{Arbitrary, Gen, Prop}\n\ncla"
},
{
"path": "dataset/src/test/scala/frameless/functions/AggregateFunctionsTests.scala",
"chars": 19781,
"preview": "package frameless\npackage functions\n\nimport frameless.{TypedAggregate, TypedColumn}\nimport frameless.functions.aggregate"
},
{
"path": "dataset/src/test/scala/frameless/functions/DateTimeStringBehaviourUtils.scala",
"chars": 209,
"preview": "package frameless.functions\n\nimport org.apache.spark.sql.Row\n\nobject DateTimeStringBehaviourUtils {\n val nullHandler: R"
},
{
"path": "dataset/src/test/scala/frameless/functions/DoubleBehaviourUtils.scala",
"chars": 945,
"preview": "package frameless\npackage functions\n\n/**\n * Some statistical functions in Spark can result in Double, Double.NaN or Nul"
},
{
"path": "dataset/src/test/scala/frameless/functions/NonAggregateFunctionsTests.scala",
"chars": 63901,
"preview": "package frameless\npackage functions\n\nimport java.io.File\nimport java.util.Base64\nimport java.nio.charset.StandardCharset"
},
{
"path": "dataset/src/test/scala/frameless/functions/UdfTests.scala",
"chars": 6583,
"preview": "package frameless\npackage functions\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass UdfTests extends Typ"
},
{
"path": "dataset/src/test/scala/frameless/functions/UnaryFunctionsTest.scala",
"chars": 4497,
"preview": "package frameless\npackage functions\n\nimport org.scalacheck.{ Arbitrary, Prop }\nimport org.scalacheck.Prop._\nimport scala"
},
{
"path": "dataset/src/test/scala/frameless/ops/ColumnTypesTest.scala",
"chars": 900,
"preview": "package frameless\npackage ops\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop.forAll\nimport shapeless.HNil\nimport"
},
{
"path": "dataset/src/test/scala/frameless/ops/CubeTests.scala",
"chars": 12439,
"preview": "package frameless\npackage ops\n\nimport frameless.functions.aggregate._\nimport org.scalacheck.Prop\nimport org.scalacheck.P"
},
{
"path": "dataset/src/test/scala/frameless/ops/PivotTest.scala",
"chars": 4025,
"preview": "package frameless\npackage ops\n\nimport frameless.functions.aggregate._\nimport org.apache.spark.sql.{functions => sparkFun"
},
{
"path": "dataset/src/test/scala/frameless/ops/RepeatTest.scala",
"chars": 676,
"preview": "package frameless\npackage ops\n\nimport shapeless.test.illTyped\nimport shapeless.{::, HNil, Nat}\n\nclass RepeatTest extends"
},
{
"path": "dataset/src/test/scala/frameless/ops/RollupTests.scala",
"chars": 12508,
"preview": "package frameless\npackage ops\n\nimport frameless.functions.aggregate._\nimport org.scalacheck.Prop\nimport org.scalacheck.P"
},
{
"path": "dataset/src/test/scala/frameless/ops/SmartProjectTest.scala",
"chars": 2506,
"preview": "package frameless\npackage ops\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport shapeless.test.illTyped\n\n\n"
},
{
"path": "dataset/src/test/scala/frameless/ops/deserialized/FilterTests.scala",
"chars": 482,
"preview": "package frameless\npackage ops\npackage deserialized\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass Filte"
},
{
"path": "dataset/src/test/scala/frameless/ops/deserialized/FlatMapTests.scala",
"chars": 557,
"preview": "package frameless\npackage ops\npackage deserialized\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass FlatM"
},
{
"path": "dataset/src/test/scala/frameless/ops/deserialized/MapPartitionsTests.scala",
"chars": 620,
"preview": "package frameless\npackage ops\npackage deserialized\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass MapPa"
},
{
"path": "dataset/src/test/scala/frameless/ops/deserialized/MapTests.scala",
"chars": 565,
"preview": "package frameless\npackage ops\npackage deserialized\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass MapTe"
},
{
"path": "dataset/src/test/scala/frameless/ops/deserialized/ReduceTests.scala",
"chars": 612,
"preview": "package frameless\npackage ops\npackage deserialized\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\n\nclass Reduc"
},
{
"path": "dataset/src/test/scala/frameless/package.scala",
"chars": 4195,
"preview": "import java.time.format.DateTimeFormatter\nimport java.time.{LocalDateTime => JavaLocalDateTime}\n\nimport org.scalacheck.{"
},
{
"path": "dataset/src/test/scala/frameless/sql/package.scala",
"chars": 569,
"preview": "package frameless\n\nimport org.apache.spark.sql.catalyst.expressions.Expression\nimport org.apache.spark.sql.catalyst.expr"
},
{
"path": "dataset/src/test/scala/frameless/sql/rules/SQLRulesSuite.scala",
"chars": 2659,
"preview": "package frameless.sql.rules\n\nimport frameless._\nimport frameless.sql._\nimport org.apache.spark.sql.catalyst.expressions."
},
{
"path": "dataset/src/test/scala/frameless/syntax/FramelessSyntaxTests.scala",
"chars": 1594,
"preview": "package frameless\npackage syntax\n\nimport org.scalacheck.Prop\nimport org.scalacheck.Prop._\nimport frameless.functions.agg"
},
{
"path": "dataset/src/test/scala/org/apache/hadoop/fs/local/StreamingFS.scala",
"chars": 315,
"preview": "package org.apache.hadoop.fs.local\n\nimport com.globalmentor.apache.hadoop.fs.BareLocalFileSystem\nimport org.apache.hadoo"
},
{
"path": "dataset/src/test/spark-3.2/frameless/sql/rules/FramelessLitPushDownTests.scala",
"chars": 2992,
"preview": "package frameless.sql.rules\n\nimport frameless._\nimport frameless.sql._\nimport frameless.functions.Lit\nimport org.apache."
},
{
"path": "dataset/src/test/spark-3.3+/frameless/sql/rules/FramelessLitPushDownTests.scala",
"chars": 1900,
"preview": "package frameless.sql.rules\n\nimport frameless._\nimport frameless.functions.Lit\nimport org.apache.spark.sql.catalyst.util"
},
{
"path": "docs/Cats.md",
"chars": 6556,
"preview": "# Using Cats with Frameless\n\n```scala mdoc:invisible\nimport org.apache.spark.{SparkConf, SparkContext => SC}\nimport org."
},
{
"path": "docs/FeatureOverview.md",
"chars": 20768,
"preview": "# TypedDataset: Feature Overview\n\nThis tutorial introduces `TypedDataset` using a simple example.\nThe following imports "
},
{
"path": "docs/Injection.md",
"chars": 4082,
"preview": "# Injection: Creating Custom Encoders\n\n```scala mdoc:invisible:reset-object\nimport org.apache.spark.{SparkConf, SparkCon"
},
{
"path": "docs/Job.md",
"chars": 3195,
"preview": "# Job\\[A\\]\n\nAll operations on `TypedDataset` are lazy. An operation either returns a new\ntransformed `TypedDataset` or a"
},
{
"path": "docs/TypedDataFrame.md",
"chars": 6872,
"preview": "# Proof of Concept: TypedDataFrame\n\n`TypedDataFrame` is the API developed in the early stages of Frameless to manipulate"
},
{
"path": "docs/TypedDatasetVsSparkDataset.md",
"chars": 7231,
"preview": "# Comparing TypedDatasets with Spark's Datasets\n\n```scala mdoc:invisible:reset-object\nimport org.apache.spark.SparkConf\n"
},
{
"path": "docs/TypedEncoder.md",
"chars": 3314,
"preview": "# Typed Encoders in Frameless\n\n```scala mdoc:invisible:reset-object\nimport org.apache.spark.{SparkConf, SparkContext}\nim"
},
{
"path": "docs/TypedML.md",
"chars": 11513,
"preview": "# Typed Spark ML\n\nThe `frameless-ml` module provides a strongly typed Spark ML API leveraging `TypedDataset`s. It introd"
},
{
"path": "docs/WorkingWithCsvParquetJson.md",
"chars": 4147,
"preview": "# Working with CSV and Parquet data\n\n```scala mdoc:invisible:reset-object\nimport org.apache.spark.{SparkConf, SparkConte"
},
{
"path": "docs/directory.conf",
"chars": 214,
"preview": "laika.title = frameless\nlaika.navigationOrder = [\n README.md\n FeatureOverview.md\n TypedDatasetVsSparkDataset.md\n Wor"
},
{
"path": "docs/iris.data",
"chars": 4551,
"preview": "5.1,3.5,1.4,0.2,Iris-setosa\n4.9,3.0,1.4,0.2,Iris-setosa\n4.7,3.2,1.3,0.2,Iris-setosa\n4.6,3.1,1.5,0.2,Iris-setosa\n5.0,3.6,"
},
{
"path": "github.sbt",
"chars": 887,
"preview": "ThisBuild / githubWorkflowArtifactUpload := false // doesn't work with scoverage\n\nThisBuild / githubWorkflowEnv += \"SPAR"
},
{
"path": "ml/src/main/scala/frameless/ml/TypedEstimator.scala",
"chars": 683,
"preview": "package frameless\npackage ml\n\nimport frameless.ops.SmartProject\nimport org.apache.spark.ml.{Estimator, Model}\n\n/**\n * A"
},
{
"path": "ml/src/main/scala/frameless/ml/TypedTransformer.scala",
"chars": 1494,
"preview": "package frameless\npackage ml\n\nimport frameless.ops.SmartProject\nimport org.apache.spark.ml.Transformer\nimport shapeless."
},
{
"path": "ml/src/main/scala/frameless/ml/classification/TypedRandomForestClassifier.scala",
"chars": 2485,
"preview": "package frameless\npackage ml\npackage classification\n\nimport frameless.ml.internals.TreesInputsChecker\nimport frameless.m"
},
{
"path": "ml/src/main/scala/frameless/ml/clustering/TypedBisectingKMeans.scala",
"chars": 2176,
"preview": "package frameless\npackage ml\npackage classification\n\nimport frameless.ml.internals.VectorInputsChecker\nimport org.apache"
},
{
"path": "ml/src/main/scala/frameless/ml/clustering/TypedKMeans.scala",
"chars": 1511,
"preview": "package frameless\npackage ml\npackage classification\n\nimport frameless.ml.internals.VectorInputsChecker\nimport frameless."
},
{
"path": "ml/src/main/scala/frameless/ml/feature/TypedIndexToString.scala",
"chars": 1076,
"preview": "package frameless\npackage ml\npackage feature\n\nimport frameless.ml.internals.UnaryInputsChecker\nimport org.apache.spark.m"
},
{
"path": "ml/src/main/scala/frameless/ml/feature/TypedStringIndexer.scala",
"chars": 1612,
"preview": "package frameless\npackage ml\npackage feature\n\nimport frameless.ml.feature.TypedStringIndexer.HandleInvalid\nimport framel"
},
{
"path": "ml/src/main/scala/frameless/ml/feature/TypedVectorAssembler.scala",
"chars": 2680,
"preview": "package frameless\npackage ml\npackage feature\n\nimport org.apache.spark.ml.feature.VectorAssembler\nimport org.apache.spark"
},
{
"path": "ml/src/main/scala/frameless/ml/internals/LinearInputsChecker.scala",
"chars": 2176,
"preview": "package frameless\npackage ml\npackage internals\n\nimport org.apache.spark.ml.linalg._\nimport shapeless.ops.hlist.Length\nim"
},
{
"path": "ml/src/main/scala/frameless/ml/internals/SelectorByValue.scala",
"chars": 991,
"preview": "package frameless\npackage ml\npackage internals\n\nimport shapeless.labelled.FieldType\nimport shapeless.{::, DepFn1, HList,"
},
{
"path": "ml/src/main/scala/frameless/ml/internals/TreesInputsChecker.scala",
"chars": 1299,
"preview": "package frameless\npackage ml\npackage internals\n\nimport shapeless.ops.hlist.Length\nimport shapeless.{HList, LabelledGener"
},
{
"path": "ml/src/main/scala/frameless/ml/internals/UnaryInputsChecker.scala",
"chars": 936,
"preview": "package frameless\npackage ml\npackage internals\n\nimport shapeless.ops.hlist.Length\nimport shapeless.{HList, LabelledGener"
},
{
"path": "ml/src/main/scala/frameless/ml/internals/VectorInputsChecker.scala",
"chars": 984,
"preview": "package frameless\npackage ml\npackage internals\n\nimport shapeless.ops.hlist.Length\nimport shapeless.{HList, LabelledGener"
},
{
"path": "ml/src/main/scala/frameless/ml/package.scala",
"chars": 371,
"preview": "package frameless\n\nimport org.apache.spark.sql.FramelessInternals.UserDefinedType\nimport org.apache.spark.ml.FramelessIn"
},
{
"path": "ml/src/main/scala/frameless/ml/params/kmeans/KMeansInitMode.scala",
"chars": 552,
"preview": "package frameless\npackage ml\npackage params\npackage kmeans\n\n/**\n * Param for the initialization algorithm.\n * This can"
},
{
"path": "ml/src/main/scala/frameless/ml/params/linears/LossStrategy.scala",
"chars": 668,
"preview": "package frameless\npackage ml\npackage params\npackage linears\n/**\n * <a href=\"https://en.wikipedia.org/wiki/Mean_squared_"
},
{
"path": "ml/src/main/scala/frameless/ml/params/linears/Solver.scala",
"chars": 895,
"preview": "package frameless\npackage ml\npackage params\npackage linears\n\n/**\n * solver algorithm used for optimization.\n * - \"l-b"
},
{
"path": "ml/src/main/scala/frameless/ml/params/trees/FeatureSubsetStrategy.scala",
"chars": 1739,
"preview": "package frameless\npackage ml\npackage params\npackage trees\n/**\n * The number of features to consider for splits at each "
},
{
"path": "ml/src/main/scala/frameless/ml/regression/TypedLinearRegression.scala",
"chars": 2649,
"preview": "package frameless\npackage ml\npackage regression\n\nimport frameless.ml.internals.LinearInputsChecker\nimport frameless.ml.p"
},
{
"path": "ml/src/main/scala/frameless/ml/regression/TypedRandomForestRegressor.scala",
"chars": 2189,
"preview": "package frameless\npackage ml\npackage regression\n\nimport frameless.ml.internals.TreesInputsChecker\nimport frameless.ml.pa"
},
{
"path": "ml/src/main/scala/org/apache/spark/ml/FramelessInternals.scala",
"chars": 320,
"preview": "package org.apache.spark.ml\n\nimport org.apache.spark.ml.linalg.{MatrixUDT, VectorUDT}\n\nobject FramelessInternals {\n\n //"
},
{
"path": "ml/src/test/scala/frameless/ml/FramelessMlSuite.scala",
"chars": 575,
"preview": "package frameless\npackage ml\n\nimport org.scalactic.anyvals.PosZInt\nimport org.scalatest.BeforeAndAfterAll\nimport org.sca"
},
{
"path": "ml/src/test/scala/frameless/ml/Generators.scala",
"chars": 1803,
"preview": "package frameless\npackage ml\n\nimport frameless.ml.params.linears.{LossStrategy, Solver}\nimport frameless.ml.params.trees"
},
{
"path": "ml/src/test/scala/frameless/ml/TypedEncoderInstancesTests.scala",
"chars": 1570,
"preview": "package frameless\npackage ml\n\nimport org.scalacheck.Prop._\nimport org.apache.spark.ml.linalg._\nimport org.apache.spark.m"
},
{
"path": "ml/src/test/scala/frameless/ml/classification/ClassificationIntegrationTests.scala",
"chars": 2736,
"preview": "package frameless\npackage ml\npackage classification\n\nimport frameless.ml.feature.{TypedIndexToString, TypedStringIndexer"
},
{
"path": "ml/src/test/scala/frameless/ml/classification/TypedRandomForestClassifierTests.scala",
"chars": 3459,
"preview": "package frameless\npackage ml\npackage classification\n\nimport shapeless.test.illTyped\nimport org.apache.spark.ml.linalg._\n"
},
{
"path": "ml/src/test/scala/frameless/ml/clustering/BisectingKMeansTests.scala",
"chars": 1724,
"preview": "package frameless\npackage ml\npackage clustering\n\nimport frameless.{TypedDataset, TypedEncoder, X1, X2, X3}\nimport framel"
},
{
"path": "ml/src/test/scala/frameless/ml/clustering/ClusteringIntegrationTests.scala",
"chars": 2234,
"preview": "package frameless\npackage ml\npackage clustering\n\nimport frameless.ml.FramelessMlSuite\nimport frameless.ml.classification"
},
{
"path": "ml/src/test/scala/frameless/ml/clustering/KMeansTests.scala",
"chars": 2737,
"preview": "package frameless\npackage ml\npackage clustering\n\nimport frameless.ml.classification.TypedKMeans\nimport frameless.{TypedD"
},
{
"path": "ml/src/test/scala/frameless/ml/feature/TypedIndexToStringTests.scala",
"chars": 1093,
"preview": "package frameless\npackage ml\npackage feature\n\nimport org.scalacheck.{Arbitrary, Gen}\nimport org.scalacheck.Prop._\nimport"
},
{
"path": "ml/src/test/scala/frameless/ml/feature/TypedStringIndexerTests.scala",
"chars": 1539,
"preview": "package frameless\npackage ml\npackage feature\n\nimport frameless.ml.feature.TypedStringIndexer.HandleInvalid\nimport org.sc"
},
{
"path": "ml/src/test/scala/frameless/ml/feature/TypedVectorAssemblerTests.scala",
"chars": 1648,
"preview": "package frameless\npackage ml\npackage feature\n\nimport org.scalacheck.Arbitrary\nimport org.scalacheck.Prop._\nimport org.ap"
},
{
"path": "ml/src/test/scala/frameless/ml/regression/RegressionIntegrationTests.scala",
"chars": 1487,
"preview": "package frameless\npackage ml\npackage regression\n\nimport frameless.ml.feature.TypedVectorAssembler\nimport org.apache.spar"
},
{
"path": "ml/src/test/scala/frameless/ml/regression/TypedLinearRegressionTests.scala",
"chars": 4293,
"preview": "package frameless\npackage ml\npackage regression\n\nimport frameless.ml.params.linears.{LossStrategy, Solver}\nimport org.ap"
},
{
"path": "ml/src/test/scala/frameless/ml/regression/TypedRandomForestRegressorTests.scala",
"chars": 3241,
"preview": "package frameless\npackage ml\npackage regression\n\nimport frameless.ml.params.trees.FeatureSubsetStrategy\nimport shapeless"
},
{
"path": "project/Common.scala",
"chars": 312,
"preview": "import sbt.Keys._\nimport sbt._\nimport sbt.plugins.JvmPlugin\n\nimport org.scalafmt.sbt.ScalafmtPlugin.autoImport._\n\nobject"
},
{
"path": "project/build.properties",
"chars": 20,
"preview": "sbt.version=1.12.11\n"
},
{
"path": "project/plugins.sbt",
"chars": 369,
"preview": "val sbtTypelevelVersion = \"0.8.5\"\n\naddSbtPlugin(\"org.typelevel\" % \"sbt-typelevel-ci-release\" % sbtTypelevelVersion)\n\nadd"
},
{
"path": "refined/src/main/scala/frameless/refined/RefinedFieldEncoders.scala",
"chars": 2426,
"preview": "package frameless.refined\n\nimport scala.reflect.ClassTag\n\nimport org.apache.spark.sql.catalyst.expressions._\nimport org."
}
]
// ... and 6 more files (download for full content)
About this extraction
This page contains the full source code of the adelbertc/frameless GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 206 files (773.8 KB), approximately 227.1k tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.