Repository: adelbertc/frameless Branch: master Commit: 6826375be4c0 Files: 206 Total size: 773.8 KB Directory structure: gitextract_u5s1eutc/ ├── .github/ │ ├── release-drafter.yml │ └── workflows/ │ ├── ci.yml │ ├── clean.yml │ └── release-drafter.yml ├── .gitignore ├── .scalafmt.conf ├── LICENSE ├── README.md ├── build.sbt ├── cats/ │ └── src/ │ ├── main/ │ │ └── scala/ │ │ └── frameless/ │ │ └── cats/ │ │ ├── FramelessSyntax.scala │ │ ├── SparkDelayInstances.scala │ │ ├── SparkTask.scala │ │ ├── implicits.scala │ │ └── package.scala │ └── test/ │ ├── resources/ │ │ ├── log4j.properties │ │ └── log4j2.properties │ └── scala/ │ └── frameless/ │ └── cats/ │ ├── FramelessSyntaxTests.scala │ └── test.scala ├── core/ │ └── src/ │ └── main/ │ └── scala/ │ └── frameless/ │ ├── CatalystAverageable.scala │ ├── CatalystBitShift.scala │ ├── CatalystBitwise.scala │ ├── CatalystCast.scala │ ├── CatalystCollection.scala │ ├── CatalystDivisible.scala │ ├── CatalystIsin.scala │ ├── CatalystNaN.scala │ ├── CatalystNotNullable.scala │ ├── CatalystNumeric.scala │ ├── CatalystNumericWithJavaBigDecimal.scala │ ├── CatalystOrdered.scala │ ├── CatalystPivotable.scala │ ├── CatalystRound.scala │ ├── CatalystSummable.scala │ ├── CatalystVariance.scala │ ├── Injection.scala │ ├── SQLDate.scala │ └── SQLTimestamp.scala ├── dataset/ │ └── src/ │ ├── main/ │ │ ├── scala/ │ │ │ ├── frameless/ │ │ │ │ ├── FramelessSyntax.scala │ │ │ │ ├── InjectionEnum.scala │ │ │ │ ├── IsValueClass.scala │ │ │ │ ├── Job.scala │ │ │ │ ├── RecordEncoder.scala │ │ │ │ ├── SparkDelay.scala │ │ │ │ ├── TypedColumn.scala │ │ │ │ ├── TypedColumnMacroImpl.scala │ │ │ │ ├── TypedDataset.scala │ │ │ │ ├── TypedDatasetForwarded.scala │ │ │ │ ├── TypedEncoder.scala │ │ │ │ ├── TypedExpressionEncoder.scala │ │ │ │ ├── With.scala │ │ │ │ ├── functions/ │ │ │ │ │ ├── AggregateFunctions.scala │ │ │ │ │ ├── Lit.scala │ │ │ │ │ ├── NonAggregateFunctions.scala │ │ │ │ │ ├── Udf.scala │ │ │ │ │ ├── UnaryFunctions.scala │ │ │ │ │ 
└── package.scala │ │ │ │ ├── ops/ │ │ │ │ │ ├── AggregateTypes.scala │ │ │ │ │ ├── As.scala │ │ │ │ │ ├── ColumnTypes.scala │ │ │ │ │ ├── GroupByOps.scala │ │ │ │ │ ├── RelationalGroupsOps.scala │ │ │ │ │ ├── Repeat.scala │ │ │ │ │ └── SmartProject.scala │ │ │ │ └── syntax/ │ │ │ │ └── package.scala │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── spark/ │ │ │ └── sql/ │ │ │ ├── FramelessInternals.scala │ │ │ └── reflection/ │ │ │ └── package.scala │ │ ├── spark-3/ │ │ │ └── frameless/ │ │ │ └── MapGroups.scala │ │ └── spark-3.4+/ │ │ └── frameless/ │ │ └── MapGroups.scala │ └── test/ │ ├── resources/ │ │ ├── log4j.properties │ │ └── log4j2.properties │ ├── scala/ │ │ ├── frameless/ │ │ │ ├── AsTests.scala │ │ │ ├── BitwiseTests.scala │ │ │ ├── CastTests.scala │ │ │ ├── ColTests.scala │ │ │ ├── CollectTests.scala │ │ │ ├── ColumnTests.scala │ │ │ ├── ColumnViaLambdaTests.scala │ │ │ ├── CreateTests.scala │ │ │ ├── DropTest.scala │ │ │ ├── DropTupledTest.scala │ │ │ ├── EncoderTests.scala │ │ │ ├── ExplodeTests.scala │ │ │ ├── FilterTests.scala │ │ │ ├── FlattenTests.scala │ │ │ ├── GroupByTests.scala │ │ │ ├── InjectionTests.scala │ │ │ ├── IsValueClassTests.scala │ │ │ ├── JobTests.scala │ │ │ ├── JoinTests.scala │ │ │ ├── LitTests.scala │ │ │ ├── NumericTests.scala │ │ │ ├── OrderByTests.scala │ │ │ ├── RecordEncoderTests.scala │ │ │ ├── SchemaTests.scala │ │ │ ├── SelectTests.scala │ │ │ ├── SelfJoinTests.scala │ │ │ ├── TypedDatasetSuite.scala │ │ │ ├── UdtEncodedClass.scala │ │ │ ├── WithColumnTest.scala │ │ │ ├── WithColumnTupledTest.scala │ │ │ ├── XN.scala │ │ │ ├── forward/ │ │ │ │ ├── CheckpointTests.scala │ │ │ │ ├── ColumnsTests.scala │ │ │ │ ├── CountTests.scala │ │ │ │ ├── DistinctTests.scala │ │ │ │ ├── ExceptTests.scala │ │ │ │ ├── FirstTests.scala │ │ │ │ ├── ForeachTests.scala │ │ │ │ ├── HeadTests.scala │ │ │ │ ├── InputFilesTests.scala │ │ │ │ ├── IntersectTests.scala │ │ │ │ ├── IsLocalTests.scala │ │ │ │ ├── IsStreamingTests.scala │ │ │ │ ├── 
LimitTests.scala │ │ │ │ ├── QueryExecutionTests.scala │ │ │ │ ├── RandomSplitTests.scala │ │ │ │ ├── SQLContextTests.scala │ │ │ │ ├── SparkSessionTests.scala │ │ │ │ ├── StorageLevelTests.scala │ │ │ │ ├── TakeTests.scala │ │ │ │ ├── ToJSONTests.scala │ │ │ │ ├── ToLocalIteratorTests.scala │ │ │ │ ├── UnionTests.scala │ │ │ │ ├── WriteStreamTests.scala │ │ │ │ └── WriteTests.scala │ │ │ ├── functions/ │ │ │ │ ├── AggregateFunctionsTests.scala │ │ │ │ ├── DateTimeStringBehaviourUtils.scala │ │ │ │ ├── DoubleBehaviourUtils.scala │ │ │ │ ├── NonAggregateFunctionsTests.scala │ │ │ │ ├── UdfTests.scala │ │ │ │ └── UnaryFunctionsTest.scala │ │ │ ├── ops/ │ │ │ │ ├── ColumnTypesTest.scala │ │ │ │ ├── CubeTests.scala │ │ │ │ ├── PivotTest.scala │ │ │ │ ├── RepeatTest.scala │ │ │ │ ├── RollupTests.scala │ │ │ │ ├── SmartProjectTest.scala │ │ │ │ └── deserialized/ │ │ │ │ ├── FilterTests.scala │ │ │ │ ├── FlatMapTests.scala │ │ │ │ ├── MapPartitionsTests.scala │ │ │ │ ├── MapTests.scala │ │ │ │ └── ReduceTests.scala │ │ │ ├── package.scala │ │ │ ├── sql/ │ │ │ │ ├── package.scala │ │ │ │ └── rules/ │ │ │ │ └── SQLRulesSuite.scala │ │ │ └── syntax/ │ │ │ └── FramelessSyntaxTests.scala │ │ └── org/ │ │ └── apache/ │ │ └── hadoop/ │ │ └── fs/ │ │ └── local/ │ │ └── StreamingFS.scala │ ├── spark-3.2/ │ │ └── frameless/ │ │ └── sql/ │ │ └── rules/ │ │ └── FramelessLitPushDownTests.scala │ └── spark-3.3+/ │ └── frameless/ │ └── sql/ │ └── rules/ │ └── FramelessLitPushDownTests.scala ├── docs/ │ ├── Cats.md │ ├── FeatureOverview.md │ ├── Injection.md │ ├── Job.md │ ├── TypedDataFrame.md │ ├── TypedDatasetVsSparkDataset.md │ ├── TypedEncoder.md │ ├── TypedML.md │ ├── WorkingWithCsvParquetJson.md │ ├── directory.conf │ ├── iris.data │ └── iris.parquet ├── github.sbt ├── ml/ │ └── src/ │ ├── main/ │ │ └── scala/ │ │ ├── frameless/ │ │ │ └── ml/ │ │ │ ├── TypedEstimator.scala │ │ │ ├── TypedTransformer.scala │ │ │ ├── classification/ │ │ │ │ └── TypedRandomForestClassifier.scala │ │ 
│ ├── clustering/ │ │ │ │ ├── TypedBisectingKMeans.scala │ │ │ │ └── TypedKMeans.scala │ │ │ ├── feature/ │ │ │ │ ├── TypedIndexToString.scala │ │ │ │ ├── TypedStringIndexer.scala │ │ │ │ └── TypedVectorAssembler.scala │ │ │ ├── internals/ │ │ │ │ ├── LinearInputsChecker.scala │ │ │ │ ├── SelectorByValue.scala │ │ │ │ ├── TreesInputsChecker.scala │ │ │ │ ├── UnaryInputsChecker.scala │ │ │ │ └── VectorInputsChecker.scala │ │ │ ├── package.scala │ │ │ ├── params/ │ │ │ │ ├── kmeans/ │ │ │ │ │ └── KMeansInitMode.scala │ │ │ │ ├── linears/ │ │ │ │ │ ├── LossStrategy.scala │ │ │ │ │ └── Solver.scala │ │ │ │ └── trees/ │ │ │ │ └── FeatureSubsetStrategy.scala │ │ │ └── regression/ │ │ │ ├── TypedLinearRegression.scala │ │ │ └── TypedRandomForestRegressor.scala │ │ └── org/ │ │ └── apache/ │ │ └── spark/ │ │ └── ml/ │ │ └── FramelessInternals.scala │ └── test/ │ └── scala/ │ └── frameless/ │ └── ml/ │ ├── FramelessMlSuite.scala │ ├── Generators.scala │ ├── TypedEncoderInstancesTests.scala │ ├── classification/ │ │ ├── ClassificationIntegrationTests.scala │ │ └── TypedRandomForestClassifierTests.scala │ ├── clustering/ │ │ ├── BisectingKMeansTests.scala │ │ ├── ClusteringIntegrationTests.scala │ │ └── KMeansTests.scala │ ├── feature/ │ │ ├── TypedIndexToStringTests.scala │ │ ├── TypedStringIndexerTests.scala │ │ └── TypedVectorAssemblerTests.scala │ └── regression/ │ ├── RegressionIntegrationTests.scala │ ├── TypedLinearRegressionTests.scala │ └── TypedRandomForestRegressorTests.scala ├── project/ │ ├── Common.scala │ ├── build.properties │ └── plugins.sbt ├── refined/ │ └── src/ │ ├── main/ │ │ └── scala/ │ │ └── frameless/ │ │ └── refined/ │ │ ├── RefinedFieldEncoders.scala │ │ └── package.scala │ └── test/ │ └── scala/ │ └── frameless/ │ └── RefinedFieldEncoderTests.scala └── scripts/ ├── docs-build.sh ├── docs-publish.sh └── travis-publish.sh ================================================ FILE CONTENTS ================================================ 
================================================ FILE: .github/release-drafter.yml ================================================ name-template: 'v$NEXT_MINOR_VERSION' tag-template: 'v$NEXT_MINOR_VERSION' categories: - title: 'Added' labels: - 'feature' - title: 'Changed' labels: - 'enhancement' - 'dependency-update' - title: 'Fixed' labels: - 'fix' - 'bug' include-labels: - 'feature' - 'enhancement' - 'dependency-update' - 'fix' - 'bug' exclude-labels: - 'skip-changelog' - 'documentation' - 'build/process improvement' change-template: '- $TITLE [#$NUMBER](https://github.com/typelevel/frameless/pull/$NUMBER) (@$AUTHOR)' template: | $CHANGES ================================================ FILE: .github/workflows/ci.yml ================================================ # This file was automatically generated by sbt-github-actions using the # githubWorkflowGenerate task. You should add and commit this file to # your git repository. It goes without saying that you shouldn't edit # this file by hand! Instead, if you wish to make changes, you should # change your sbt build configuration to revise the workflow description # to meet your needs, then regenerate this file. 
name: Continuous Integration on: pull_request: branches: ['**', '!update/**', '!pr/**'] push: branches: ['**', '!update/**', '!pr/**'] tags: [v*] env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} SBT_OPTS: '-Xms1g -Xmx4g' SPARK_LOCAL_IP: localhost concurrency: group: ${{ github.workflow }} @ ${{ github.ref }} cancel-in-progress: true jobs: build: name: Test strategy: matrix: os: [ubuntu-22.04] scala: [2.13, 2.12] java: [temurin@8] project: [root-spark33, root-spark34, root-spark35] exclude: - scala: 2.13 project: root-spark33 - scala: 2.13 project: root-spark34 runs-on: ${{ matrix.os }} timeout-minutes: 60 steps: - name: Checkout current branch (full) uses: actions/checkout@v6 with: fetch-depth: 0 - name: Setup sbt uses: sbt/setup-sbt@v1 - name: Setup Java (temurin@8) id: setup-java-temurin-8 if: matrix.java == 'temurin@8' uses: actions/setup-java@v5 with: distribution: temurin java-version: 8 cache: sbt - name: sbt update if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false' run: sbt +update - name: Check that workflows are up to date run: sbt githubWorkflowCheck - name: Check formatting if: matrix.java == 'temurin@8' && matrix.os == 'ubuntu-22.04' run: sbt '++ ${{ matrix.scala }}' 'project ${{ matrix.project }}' scalafmtCheckAll 'project /' scalafmtSbtCheck - name: Test & Compute Coverage run: sbt '++ ${{ matrix.scala }}' 'project ${{ matrix.project }}' coverage test test/coverageReport - name: Check binary compatibility if: matrix.java == 'temurin@8' && matrix.os == 'ubuntu-22.04' run: sbt '++ ${{ matrix.scala }}' 'project ${{ matrix.project }}' mimaReportBinaryIssues - name: Generate API documentation if: matrix.java == 'temurin@8' && matrix.os == 'ubuntu-22.04' run: sbt '++ ${{ matrix.scala }}' 'project ${{ matrix.project }}' doc - uses: codecov/codecov-action@v3 with: flags: ${{ matrix.scala }}-${{ matrix.project }} publish: name: Publish Artifacts needs: [build] if: github.event_name != 'pull_request' && 
(startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/master') strategy: matrix: os: [ubuntu-22.04] java: [temurin@8] runs-on: ${{ matrix.os }} steps: - name: Checkout current branch (full) uses: actions/checkout@v6 with: fetch-depth: 0 - name: Setup sbt uses: sbt/setup-sbt@v1 - name: Setup Java (temurin@8) id: setup-java-temurin-8 if: matrix.java == 'temurin@8' uses: actions/setup-java@v5 with: distribution: temurin java-version: 8 cache: sbt - name: sbt update if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false' run: sbt +update - name: Import signing key if: env.PGP_SECRET != '' && env.PGP_PASSPHRASE == '' env: PGP_SECRET: ${{ secrets.PGP_SECRET }} PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }} run: echo $PGP_SECRET | base64 -d -i - | gpg --import - name: Import signing key and strip passphrase if: env.PGP_SECRET != '' && env.PGP_PASSPHRASE != '' env: PGP_SECRET: ${{ secrets.PGP_SECRET }} PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }} run: | echo "$PGP_SECRET" | base64 -d -i - > /tmp/signing-key.gpg echo "$PGP_PASSPHRASE" | gpg --pinentry-mode loopback --passphrase-fd 0 --import /tmp/signing-key.gpg (echo "$PGP_PASSPHRASE"; echo; echo) | gpg --command-fd 0 --pinentry-mode loopback --change-passphrase $(gpg --list-secret-keys --with-colons 2> /dev/null | grep '^sec:' | cut --delimiter ':' --fields 5 | tail -n 1) - name: Publish env: SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }} SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }} SONATYPE_CREDENTIAL_HOST: ${{ secrets.SONATYPE_CREDENTIAL_HOST }} run: sbt tlCiRelease dependency-submission: name: Submit Dependencies if: github.event.repository.fork == false && github.event_name != 'pull_request' strategy: matrix: os: [ubuntu-22.04] java: [temurin@8] runs-on: ${{ matrix.os }} steps: - name: Checkout current branch (full) uses: actions/checkout@v6 with: fetch-depth: 0 - name: Setup sbt uses: sbt/setup-sbt@v1 - name: Setup Java (temurin@8) id: setup-java-temurin-8 
if: matrix.java == 'temurin@8' uses: actions/setup-java@v5 with: distribution: temurin java-version: 8 cache: sbt - name: sbt update if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false' run: sbt +update - name: Submit Dependencies uses: scalacenter/sbt-dependency-submission@v2 with: modules-ignore: root-spark33_2.13 root-spark33_2.12 docs_2.13 docs_2.12 root-spark34_2.13 root-spark34_2.12 root-spark35_2.13 root-spark35_2.12 configs-ignore: test scala-tool scala-doc-tool test-internal site: name: Generate Site strategy: matrix: os: [ubuntu-22.04] java: [temurin@11] runs-on: ${{ matrix.os }} steps: - name: Checkout current branch (full) uses: actions/checkout@v6 with: fetch-depth: 0 - name: Setup sbt uses: sbt/setup-sbt@v1 - name: Setup Java (temurin@8) id: setup-java-temurin-8 if: matrix.java == 'temurin@8' uses: actions/setup-java@v5 with: distribution: temurin java-version: 8 cache: sbt - name: sbt update if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false' run: sbt +update - name: Setup Java (temurin@11) id: setup-java-temurin-11 if: matrix.java == 'temurin@11' uses: actions/setup-java@v5 with: distribution: temurin java-version: 11 cache: sbt - name: sbt update if: matrix.java == 'temurin@11' && steps.setup-java-temurin-11.outputs.cache-hit == 'false' run: sbt +update - name: Generate site run: sbt docs/tlSite - name: Publish site if: github.event_name != 'pull_request' && github.ref == 'refs/heads/master' uses: peaceiris/actions-gh-pages@v4.0.0 with: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: mdocs/target/docs/site keep_files: true ================================================ FILE: .github/workflows/clean.yml ================================================ # This file was automatically generated by sbt-github-actions using the # githubWorkflowGenerate task. You should add and commit this file to # your git repository. 
It goes without saying that you shouldn't edit # this file by hand! Instead, if you wish to make changes, you should # change your sbt build configuration to revise the workflow description # to meet your needs, then regenerate this file. name: Clean on: push jobs: delete-artifacts: name: Delete Artifacts runs-on: ubuntu-latest env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - name: Delete artifacts run: | # Customize those three lines with your repository and credentials: REPO=${GITHUB_API_URL}/repos/${{ github.repository }} # A shortcut to call GitHub API. ghapi() { curl --silent --location --user _:$GITHUB_TOKEN "$@"; } # A temporary file which receives HTTP response headers. TMPFILE=/tmp/tmp.$$ # An associative array, key: artifact name, value: number of artifacts of that name. declare -A ARTCOUNT # Process all artifacts on this repository, loop on returned "pages". URL=$REPO/actions/artifacts while [[ -n "$URL" ]]; do # Get current page, get response headers in a temporary file. JSON=$(ghapi --dump-header $TMPFILE "$URL") # Get URL of next page. Will be empty if we are at the last page. URL=$(grep '^Link:' "$TMPFILE" | tr ',' '\n' | grep 'rel="next"' | head -1 | sed -e 's/.*.*//') rm -f $TMPFILE # Number of artifacts on this page: COUNT=$(( $(jq <<<$JSON -r '.artifacts | length') )) # Loop on all artifacts on this page. for ((i=0; $i < $COUNT; i++)); do # Get name of artifact and count instances of this name. 
name=$(jq <<<$JSON -r ".artifacts[$i].name?") ARTCOUNT[$name]=$(( $(( ${ARTCOUNT[$name]} )) + 1)) id=$(jq <<<$JSON -r ".artifacts[$i].id?") size=$(( $(jq <<<$JSON -r ".artifacts[$i].size_in_bytes?") )) printf "Deleting '%s' #%d, %'d bytes\n" $name ${ARTCOUNT[$name]} $size ghapi -X DELETE $REPO/actions/artifacts/$id done done ================================================ FILE: .github/workflows/release-drafter.yml ================================================ name: Release Drafter on: push: branches: - master pull_request: types: [opened, reopened, synchronize] jobs: update_release_draft: runs-on: ubuntu-latest steps: - uses: release-drafter/release-drafter@v5.15.0 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .gitignore ================================================ *.class *.log # sbt specific .bsp/ dist/* target/ lib_managed/ src_managed/ project/boot/ project/plugins/project/ # Scala-IDE specific .scala_dependencies .cache .classpath .project .worksheet/ bin/ .settings/ .ensime .ensime_cache/ # IntelliJ specific .idea # OS X .DS_Store node_modules # VSCode .history .metals .vscode .bloop metals.sbt ================================================ FILE: .scalafmt.conf ================================================ version = 3.8.6 runner.dialect = scala213 newlines.beforeMultilineDef = keep newlines.topLevelStatements = [before] newlines.beforeCurlyLambdaParams = multilineWithCaseOnly newlines.afterCurlyLambdaParams = squash newlines.implicitParamListModifierForce = [after] newlines.avoidForSimpleOverflow = [tooLong] newlines.avoidInResultType = true newlines.sometimesBeforeColonInMethodReturnType = false newlines.beforeTypeBounds = keep verticalMultiline.atDefnSite = true verticalMultiline.arityThreshold = 10 spaces.inImportCurlyBraces = true includeCurlyBraceInSelectChains = false includeNoParensInSelectChains = false optIn.breakChainOnFirstMethodDot = false docstrings.style = Asterisk 
docstrings.wrap = no literals.long=Upper literals.float=Upper literals.double=Upper ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright {yyyy} {name of copyright owner} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
================================================ FILE: README.md ================================================ # Frameless [![Workflow Badge](https://github.com/typelevel/frameless/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/typelevel/frameless/actions/workflows/ci.yml) [![Codecov Badge](https://codecov.io/gh/typelevel/frameless/branch/master/graph/badge.svg)](https://codecov.io/gh/typelevel/frameless) [![Discord Badge](https://img.shields.io/badge/chat-on%20discord-46BC99)](https://discord.gg/ZDZsxWcBJt) [![Maven Badge](https://img.shields.io/maven-central/v/org.typelevel/frameless-core_2.12?color=blue)](https://search.maven.org/search?q=g:org.typelevel%20and%20frameless) [![Snapshots Badge](https://img.shields.io/nexus/s/https/s01.oss.sonatype.org/org.typelevel/frameless-core_2.12)](https://s01.oss.sonatype.org/content/repositories/snapshots/org/typelevel/frameless-core_2.12/) Frameless is a Scala library for working with [Spark](http://spark.apache.org/) using more expressive types. It consists of the following modules: * `frameless-dataset` for a more strongly typed `Dataset`/`DataFrame` API * `frameless-ml` for a more strongly typed Spark ML API based on `frameless-dataset` * `frameless-cats` for using Spark's `RDD` API with [cats](https://github.com/typelevel/cats) Note that while Frameless is still getting off the ground, it is very possible that breaking changes will be made for at least the next few versions. The Frameless project and contributors support the [Typelevel](http://typelevel.org/) [Code of Conduct](http://typelevel.org/code-of-conduct.html) and want all its associated channels (e.g. GitHub, Discord) to be a safe and friendly environment for contributing and learning. 
## Versions and dependencies The compatible versions of [Spark](http://spark.apache.org/) and [cats](https://github.com/typelevel/cats) are as follows: | Frameless | Spark | Cats | Cats-Effect | Scala | |-----------|-----------------------------|----------|-------------|-------------| | 0.16.0 | 3.5.0 / 3.4.0 / 3.3.0 | 2.x | 3.x | 2.12 / 2.13 | | 0.15.0 | 3.4.0 / 3.3.0 / 3.2.2 | 2.x | 3.x | 2.12 / 2.13 | | 0.14.1 | 3.4.0 / 3.3.0 / 3.2.2 | 2.x | 3.x | 2.12 / 2.13 | | 0.14.0 | 3.3.0 / 3.2.2 / 3.1.3 | 2.x | 3.x | 2.12 / 2.13 | | 0.13.0 | 3.3.0 / 3.2.2 / 3.1.3 | 2.x | 3.x | 2.12 / 2.13 | | 0.12.0 | 3.2.1 / 3.1.3 / 3.0.3 | 2.x | 3.x | 2.12 / 2.13 | | 0.11.1 | 3.2.0 / 3.1.2 / 3.0.1 | 2.x | 2.x | 2.12 / 2.13 | | 0.11.0* | 3.2.0 / 3.1.2 / 3.0.1 | 2.x | 2.x | 2.12 / 2.13 | | 0.10.1 | 3.1.0 | 2.x | 2.x | 2.12 | | 0.9.0 | 3.0.0 | 1.x | 1.x | 2.12 | | 0.8.0 | 2.4.0 | 1.x | 1.x | 2.11 / 2.12 | | 0.7.0 | 2.3.1 | 1.x | 1.x | 2.11 | | 0.6.1 | 2.3.0 | 1.x | 0.8 | 2.11 | | 0.5.2 | 2.2.1 | 1.x | 0.8 | 2.11 | | 0.4.1 | 2.2.0 | 1.x | 0.8 | 2.11 | | 0.4.0 | 2.2.0 | 1.0.0-IF | 0.4 | 2.11 | _\* 0.11.0 has broken Spark 3.1.2 and 3.0.1 artifacts published._ Starting with 0.11, we introduced Spark cross-published artifacts: * By default, frameless artifacts depend on the most recent Spark version * Suffix `-spark{major}{minor}` is added to artifacts that are released for the previous Spark version(s) Artifact name examples: * `frameless-dataset` (the latest Spark dependency) * `frameless-dataset-spark33` (Spark 3.3.x dependency) * `frameless-dataset-spark32` (Spark 3.2.x dependency) Versions 0.5.x and 0.6.x have identical features. The first is compatible with Spark 2.2.1 and the second with 2.3.0. The **only** dependency of the `frameless-dataset` module is on [shapeless](https://github.com/milessabin/shapeless) 2.3.2. Therefore, depending on `frameless-dataset` adds minimal overhead to your Spark application's jar. 
Only the `frameless-cats` module depends on cats and cats-effect, so if you prefer to work just with `Datasets` and not with `RDD`s,
you may choose not to depend on `frameless-cats`.

Frameless intentionally **does not** have a compile dependency on Spark.
This essentially allows you to use any version of Frameless with any version of Spark.
The aforementioned table simply provides the versions of Spark we officially compile
and test Frameless with, but other versions will probably work as well.

### Breaking changes in 0.9

* Spark 3 introduces a new ExpressionEncoder approach; the schema for single-value DataFrames is now ["value"](https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala#L270), not "_1".

## Why?

Frameless introduces a new Spark API, called `TypedDataset`.
The benefits of using `TypedDataset` compared to the standard Spark `Dataset` API are as follows:

* Typesafe column referencing (e.g., no more runtime errors when accessing non-existing columns)
* Customizable, typesafe encoders (e.g., if a type does not have an encoder, it should not compile)
* Enhanced type signature for built-in functions (e.g., if you apply an arithmetic operation on a non-numeric column, you
get a compilation error)
* Typesafe casting and projections

Click [here](http://typelevel.org/frameless/TypedDatasetVsSparkDataset.html) for a
detailed comparison of `TypedDataset` with Spark's `Dataset` API.
## Documentation

* [TypedDataset: Feature Overview](http://typelevel.org/frameless/FeatureOverview.html)
* [Typed Spark ML](http://typelevel.org/frameless/TypedML.html)
* [Comparing TypedDatasets with Spark's Datasets](http://typelevel.org/frameless/TypedDatasetVsSparkDataset.html)
* [Typed Encoders in Frameless](http://typelevel.org/frameless/TypedEncoder.html)
* [Injection: Creating Custom Encoders](http://typelevel.org/frameless/Injection.html)
* [Job\[A\]](http://typelevel.org/frameless/Job.html)
* [Using Cats with RDDs](http://typelevel.org/frameless/Cats.html)
* [Proof of Concept: TypedDataFrame](http://typelevel.org/frameless/TypedDataFrame.html)

## Quick Start

Frameless 0.9.x and 0.10.x are compiled only against Scala 2.12.x; since the 0.11.x release,
Frameless is cross-compiled against Scala 2.12.x and 2.13.x (see the compatibility table above).

To use Frameless in your project add the following in your `build.sbt` file as needed:

```scala
val framelessVersion = "0.16.0" // pick the release that matches your Spark version (see the table above)

resolvers ++= Seq(
  // for snapshot artifacts only
  "s01-oss-sonatype" at "https://s01.oss.sonatype.org/content/repositories/snapshots"
)

libraryDependencies ++= List(
  "org.typelevel" %% "frameless-dataset" % framelessVersion,
  "org.typelevel" %% "frameless-ml"      % framelessVersion,
  "org.typelevel" %% "frameless-cats"    % framelessVersion
)
```

An easy way to bootstrap a Frameless sbt project:

* if you have [Giter8][g8] installed then simply:

```bash
g8 imarios/frameless.g8
```

* with sbt >= 0.13.13:

```bash
sbt new imarios/frameless.g8
```

Typing `sbt console` inside your project will bring up a shell with Frameless
and all its dependencies loaded (including Spark).

## Need help?

Feel free to message us on our [Discord](https://discord.gg/ZDZsxWcBJt)
channel with any issues or questions.

## Development

We require at least _one_ sign-off (thumbs-up, +1, or similar) to merge pull requests.
The current maintainers (people who can merge pull requests) are: * [adelbertc](https://github.com/adelbertc) * [imarios](https://github.com/imarios) * [kanterov](https://github.com/kanterov) * [non](https://github.com/non) * [OlivierBlanvillain](https://github.com/OlivierBlanvillain/) ### Testing Frameless contains several property tests. To avoid `OutOfMemoryError`s, we tune the default generator sizes. The following environment variables may be set to adjust the size of generated collections in the `TypedDataSet` suite: | Property | Default | |-----------------------------|--------:| | FRAMELESS_GEN_MIN_SIZE | 0 | | FRAMELESS_GEN_SIZE_RANGE | 20 | ## License Code is provided under the Apache 2.0 license available at , as well as in the LICENSE file. This is the same license used as Spark. [g8]: http://www.foundweekends.org/giter8/ ================================================ FILE: build.sbt ================================================ val sparkVersion = "3.5.8" val spark34Version = "3.4.4" val spark33Version = "3.3.4" val catsCoreVersion = "2.13.0" val catsEffectVersion = "3.7.0" val catsMtlVersion = "1.6.0" val scalatest = "3.2.20" val scalatestplus = "3.1.0.0-RC2" val shapeless = "2.3.13" val scalacheck = "1.19.0" val scalacheckEffect = "2.1.0" val refinedVersion = "0.11.3" val nakedFSVersion = "0.1.0" val Scala212 = "2.12.20" val Scala213 = "2.13.18" ThisBuild / tlBaseVersion := "0.16" ThisBuild / crossScalaVersions := Seq(Scala213, Scala212) ThisBuild / scalaVersion := Scala212 ThisBuild / coverageScalacPluginVersion := "2.3.0" lazy val root = project .in(file(".")) .enablePlugins(NoPublishPlugin) .settings(crossScalaVersions := Nil) .aggregate( `root-spark35`, `root-spark34`, `root-spark33`, docs ) lazy val `root-spark35` = project .in(file(".spark35")) .enablePlugins(NoPublishPlugin) .aggregate(core, cats, dataset, refined, ml) lazy val `root-spark34` = project .in(file(".spark34")) .enablePlugins(NoPublishPlugin) .aggregate( core, `cats-spark34`, 
`dataset-spark34`, `refined-spark34`, `ml-spark34` ) lazy val `root-spark33` = project .in(file(".spark33")) .enablePlugins(NoPublishPlugin) .aggregate( core, `cats-spark33`, `dataset-spark33`, `refined-spark33`, `ml-spark33` ) lazy val core = project.settings(name := "frameless-core").settings(framelessSettings) lazy val cats = project .settings(name := "frameless-cats") .settings(catsSettings) .dependsOn(dataset % "test->test;compile->compile;provided->provided") lazy val `cats-spark34` = project .settings(name := "frameless-cats-spark34") .settings(sourceDirectory := (cats / sourceDirectory).value) .settings(catsSettings) .settings(spark34Settings) .dependsOn( `dataset-spark34` % "test->test;compile->compile;provided->provided" ) lazy val `cats-spark33` = project .settings(name := "frameless-cats-spark33") .settings(sourceDirectory := (cats / sourceDirectory).value) .settings(catsSettings) .settings(spark33Settings) .dependsOn( `dataset-spark33` % "test->test;compile->compile;provided->provided" ) lazy val dataset = project .settings(name := "frameless-dataset") .settings( Compile / unmanagedSourceDirectories += baseDirectory.value / "src" / "main" / "spark-3.4+" ) .settings( Test / unmanagedSourceDirectories += baseDirectory.value / "src" / "test" / "spark-3.3+" ) .settings(datasetSettings) .settings(sparkDependencies(sparkVersion)) .dependsOn(core % "test->test;compile->compile") lazy val `dataset-spark34` = project .settings(name := "frameless-dataset-spark34") .settings(sourceDirectory := (dataset / sourceDirectory).value) .settings( Compile / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "main" / "spark-3.4+" ) .settings( Test / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "test" / "spark-3.3+" ) .settings(datasetSettings) .settings(sparkDependencies(spark34Version)) .settings(spark34Settings) .dependsOn(core % "test->test;compile->compile") lazy val `dataset-spark33` = project .settings(name := 
"frameless-dataset-spark33") .settings(sourceDirectory := (dataset / sourceDirectory).value) .settings( Compile / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "main" / "spark-3" ) .settings( Test / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "test" / "spark-3.3+" ) .settings(datasetSettings) .settings(sparkDependencies(spark33Version)) .settings(spark33Settings) .dependsOn(core % "test->test;compile->compile") lazy val refined = project .settings(name := "frameless-refined") .settings(refinedSettings) .dependsOn(dataset % "test->test;compile->compile;provided->provided") lazy val `refined-spark34` = project .settings(name := "frameless-refined-spark34") .settings(sourceDirectory := (refined / sourceDirectory).value) .settings(refinedSettings) .settings(spark34Settings) .dependsOn( `dataset-spark34` % "test->test;compile->compile;provided->provided" ) lazy val `refined-spark33` = project .settings(name := "frameless-refined-spark33") .settings(sourceDirectory := (refined / sourceDirectory).value) .settings(refinedSettings) .settings(spark33Settings) .dependsOn( `dataset-spark33` % "test->test;compile->compile;provided->provided" ) lazy val ml = project .settings(name := "frameless-ml") .settings(mlSettings) .settings(sparkMlDependencies(sparkVersion)) .dependsOn( core % "test->test;compile->compile", dataset % "test->test;compile->compile;provided->provided" ) lazy val `ml-spark34` = project .settings(name := "frameless-ml-spark34") .settings(sourceDirectory := (ml / sourceDirectory).value) .settings(mlSettings) .settings(sparkMlDependencies(spark34Version)) .settings(spark34Settings) .dependsOn( core % "test->test;compile->compile", `dataset-spark34` % "test->test;compile->compile;provided->provided" ) lazy val `ml-spark33` = project .settings(name := "frameless-ml-spark33") .settings(sourceDirectory := (ml / sourceDirectory).value) .settings(mlSettings) .settings(sparkMlDependencies(spark33Version)) 
/**
 * Settings shared by the `frameless-dataset*` projects: the common Frameless
 * settings and the typed-dataset REPL setup, plus MiMa exclusions, a coverage
 * exclusion and the `hadoop-bare-naked-local-fs` test dependency.
 */
lazy val datasetSettings =
  framelessSettings ++ framelessTypedDatasetREPL ++ Seq(
    mimaBinaryIssueFilters ++= {
      import com.typesafe.tools.mima.core._

      // Short-hands that build one MiMa exclusion filter per problem kind.
      val imt = ProblemFilters.exclude[IncompatibleMethTypeProblem](_)
      val mc = ProblemFilters.exclude[MissingClassProblem](_)
      val dmm = ProblemFilters.exclude[DirectMissingMethodProblem](_)

      // TODO: Remove after the next version bump
      Seq(
        imt("frameless.TypedEncoder.mapEncoder"),
        imt("frameless.TypedEncoder.arrayEncoder"),
        imt("frameless.RecordEncoderFields.deriveRecordCons"),
        imt("frameless.RecordEncoderFields.deriveRecordLast"),
        mc("frameless.functions.FramelessLit"),
        // `$$` is an escaped `$` in an f-interpolated string, i.e. this names the companion object class.
        mc(f"frameless.functions.FramelessLit$$"),
        dmm("frameless.functions.package.litAggr"),
        dmm("org.apache.spark.sql.FramelessInternals.column")
      )
    },
    // Keep the Spark-namespaced reflection package out of the coverage report.
    coverageExcludedPackages := "org.apache.spark.sql.reflection",
    // NOTE(review): Hadoop's artifact is usually published as `hadoop-common`
    // (singular); confirm that `hadoop-commons` is really the intended exclusion,
    // otherwise this `exclude` matches nothing.
    libraryDependencies += "com.globalmentor" % "hadoop-bare-naked-local-fs" % nakedFSVersion % Test exclude (
      "org.apache.hadoop",
      "hadoop-commons"
    )
  )
"com.chuusai" %% "shapeless" % shapeless, "org.scalatest" %% "scalatest" % scalatest % Test, "org.scalatestplus" %% "scalatestplus-scalacheck" % scalatestplus % Test, "org.scalacheck" %% "scalacheck" % scalacheck % Test ), Test / javaOptions ++= { val baseOptions = Seq("-Xmx1G", "-ea") val java17Options = if (sys.props("java.specification.version").toDouble >= 17.0) { Seq( "--add-opens=java.base/java.lang=ALL-UNNAMED", "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED", "--add-opens=java.base/java.lang.reflect=ALL-UNNAMED", "--add-opens=java.base/java.io=ALL-UNNAMED", "--add-opens=java.base/java.net=ALL-UNNAMED", "--add-opens=java.base/java.nio=ALL-UNNAMED", "--add-opens=java.base/java.util=ALL-UNNAMED", "--add-opens=java.base/java.util.concurrent=ALL-UNNAMED", "--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED", "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens=java.base/sun.nio.cs=ALL-UNNAMED", "--add-opens=java.base/sun.security.action=ALL-UNNAMED", "--add-opens=java.base/sun.util.calendar=ALL-UNNAMED" ) } else Seq.empty baseOptions ++ java17Options }, Test / fork := true, Test / parallelExecution := false, mimaPreviousArtifacts ~= { _.filterNot(_.revision == "0.11.0") // didn't release properly }, /** * The old Scala XML is pulled from Scala 2.12.x. 
/**
 * Extra settings for the `*-spark34` cross-published projects.
 *
 * MiMa compares against the un-suffixed artifact: the last `-`-separated
 * segment of the module name (e.g. the `spark34` in `frameless-cats-spark34`)
 * is dropped to obtain the name of the previously released module.
 */
lazy val spark34Settings = Seq[Setting[_]](
  tlVersionIntroduced := Seq("2.12", "2.13").map(_ -> "0.14.1").toMap,
  mimaPreviousArtifacts := {
    val unsuffixedName = moduleName.value.split("-").dropRight(1).mkString("-")
    Set(organization.value %% unsuffixedName % "0.14.1")
  }
)

/**
 * Extra settings for the `*-spark33` cross-published projects; same scheme as
 * [[spark34Settings]].
 *
 * NOTE(review): `tlVersionIntroduced` says 0.13.0 while MiMa checks against
 * 0.14.0 — confirm the two versions are meant to differ.
 */
lazy val spark33Settings = Seq[Setting[_]](
  tlVersionIntroduced := Seq("2.12", "2.13").map(_ -> "0.13.0").toMap,
  mimaPreviousArtifacts := {
    val unsuffixedName = moduleName.value.split("-").dropRight(1).mkString("-")
    Set(organization.value %% unsuffixedName % "0.14.0")
  }
)
/**
 * Cats-flavoured extension of the core Frameless syntax.
 *
 * Adds combinators that set Spark local properties before running an effect
 * `F[A]` that can both suspend side effects (`Sync`) and read a
 * `SparkSession` from its environment (`Ask`).
 */
trait FramelessSyntax extends frameless.FramelessSyntax {

  implicit class SparkJobOps[F[_], A](fa: F[A])(implicit S: Sync[F], A: Ask[F, SparkSession]) {

    /**
     * Sets the local property `key` to `value` on the ambient session's
     * `SparkContext`, then runs the wrapped effect.
     *
     * @param key   name of the Spark local property
     * @param value value to assign to the property
     * @return the wrapped effect, sequenced after the property has been set
     */
    def withLocalProperty(key: String, value: String): F[A] =
      A.ask[SparkSession].flatMap { spark =>
        // The property mutation is suspended so it only happens when the effect runs.
        S.delay(spark.sparkContext.setLocalProperty(key, value)).flatMap(_ => fa)
      }

    /** Runs the wrapped effect with the `spark.jobGroup.id` local property set to `groupId`. */
    def withGroupId(groupId: String): F[A] =
      withLocalProperty("spark.jobGroup.id", groupId)

    /** Runs the wrapped effect with the `spark.job.description` local property set to `description`. */
    def withDescription(description: String): F[A] =
      withLocalProperty("spark.job.description", description)
  }
}
/**
 * Single import (`frameless.cats.implicits._`) bringing in the Frameless
 * syntax, the `SparkDelay` instances and cats-based aggregations on `RDD`s.
 */
object implicits extends FramelessSyntax with SparkDelayInstances {

  /** Cats-driven aggregations over the elements of an `RDD[A]`. */
  implicit class rddOps[A: ClassTag](lhs: RDD[A]) {

    /** Folds all elements with the commutative monoid `m`, starting from `m.empty`. */
    def csum(implicit m: CommutativeMonoid[A]): A =
      lhs.fold(m.empty)((x, y) => x |+| y)

    /**
     * Combines all elements with the commutative semigroup `m`.
     *
     * The accumulator starts as `None`, so the result is `None` when no
     * element is ever added.
     */
    def csumOption(implicit m: CommutativeSemigroup[A]): Option[A] = {
      // Adds one element to a (possibly still empty) partial result.
      val addElement: (Option[A], A) => Option[A] = {
        case (Some(total), a) => Some(total |+| a)
        case (None, a)        => Some(a)
      }
      // Merges two partial results; operand order mirrors the original (right |+| left).
      val mergePartials: (Option[A], Option[A]) => Option[A] = {
        case (None, right)      => right
        case (Some(x), Some(y)) => Some(y |+| x)
        case (Some(x), None)    => Some(x)
      }
      lhs.aggregate[Option[A]](None)(addElement, mergePartials)
    }

    /** Smallest element according to `o`, or `e.empty` when the RDD is empty. */
    def cmin(implicit o: Order[A], e: Empty[A]): A =
      if (!lhs.isEmpty()) lhs.reduce((x, y) => x min y)
      else e.empty

    /** Smallest element according to `o`, computed through [[csumOption]]. */
    def cminOption(implicit o: Order[A]): Option[A] = {
      val smallest: CommutativeSemigroup[A] = new CommutativeSemigroup[A] {
        def combine(l: A, r: A): A = o.min(l, r)
      }
      csumOption(smallest)
    }

    /** Largest element according to `o`, or `e.empty` when the RDD is empty. */
    def cmax(implicit o: Order[A], e: Empty[A]): A =
      if (!lhs.isEmpty()) lhs.reduce((x, y) => x max y)
      else e.empty

    /** Largest element according to `o`, computed through [[csumOption]]. */
    def cmaxOption(implicit o: Order[A]): Option[A] = {
      val largest: CommutativeSemigroup[A] = new CommutativeSemigroup[A] {
        def combine(l: A, r: A): A = o.max(l, r)
      }
      csumOption(largest)
    }
  }

  /** Cats-driven per-key reductions over an `RDD` of key/value pairs. */
  implicit class pairRddOps[K: ClassTag, V: ClassTag](lhs: RDD[(K, V)]) {

    /** Combines the values of each key with the commutative semigroup `m`. */
    def csumByKey(implicit m: CommutativeSemigroup[V]): RDD[(K, V)] =
      lhs.reduceByKey((x, y) => x |+| y)

    /** Keeps, for each key, the smallest value according to `o`. */
    def cminByKey(implicit o: Order[V]): RDD[(K, V)] =
      lhs.reduceByKey((x, y) => x min y)

    /** Keeps, for each key, the largest value according to `o`. */
    def cmaxByKey(implicit o: Order[V]): RDD[(K, V)] =
      lhs.reduceByKey((x, y) => x max y)
  }
}

/** Opt-in `Semigroup` that combines two RDDs with `union`. */
object union {
  implicit def unionSemigroup[A]: Semigroup[RDD[A]] =
    new Semigroup[RDD[A]] {
      def combine(lhs: RDD[A], rhs: RDD[A]): RDD[A] = lhs.union(rhs)
    }
}

/** Opt-in `Semigroup` that inner-joins two pair RDDs and combines the matched values. */
object inner {
  implicit def pairwiseInnerSemigroup[K: ClassTag, V: ClassTag: Semigroup]: Semigroup[RDD[(K, V)]] =
    new Semigroup[RDD[(K, V)]] {
      def combine(lhs: RDD[(K, V)], rhs: RDD[(K, V)]): RDD[(K, V)] =
        lhs.join(rhs).mapValues(pair => pair._1 |+| pair._2)
    }
}

/**
 * Opt-in `Semigroup` that full-outer-joins two pair RDDs: values present on
 * both sides are combined, a value present on one side is kept as is.
 */
object outer {
  implicit def pairwiseOuterSemigroup[K: ClassTag, V: ClassTag](implicit m: Monoid[V]): Semigroup[RDD[(K, V)]] =
    new Semigroup[RDD[(K, V)]] {
      def combine(lhs: RDD[(K, V)], rhs: RDD[(K, V)]): RDD[(K, V)] =
        lhs.fullOuterJoin(rhs).mapValues {
          case (Some(x), maybeY) => maybeY.fold(x)(y => x |+| y)
          // `m.empty` is only the fallback for a row with neither side present.
          case (None, maybeY) => maybeY.getOrElse(m.empty)
        }
    }
}
log4j.logger.io.netty.channel.ChannelInitializer=ERROR log4j.logger.io.netty.channel.ChannelOutboundBuffer=ERROR log4j.logger.io.netty.channel.DefaultChannelPipeline=ERROR log4j.logger.io.netty.channel.MultithreadEventLoopGroup=ERROR log4j.logger.io.netty.channel.nio.AbstractNioChannel=ERROR log4j.logger.io.netty.channel.nio.NioEventLoop=ERROR log4j.logger.io.netty.channel.socket.nio.NioServerSocketChannel=ERROR log4j.logger.io.netty.util.concurrent.DefaultPromise.rejectedExecution=ERROR log4j.logger.io.netty.util.concurrent.DefaultPromise=ERROR log4j.logger.io.netty.util.concurrent.GlobalEventExecutor=ERROR log4j.logger.io.netty.util.concurrent.SingleThreadEventExecutor=ERROR log4j.logger.io.netty.util.internal.logging.InternalLoggerFactory=ERROR log4j.logger.io.netty.util.internal.PlatformDependent0=ERROR log4j.logger.io.netty.util.internal.PlatformDependent=ERROR log4j.logger.io.netty.util.internal.SystemPropertyUtil=ERROR log4j.logger.io.netty.util.internal.ThreadLocalRandom=ERROR log4j.logger.io.netty.util.NetUtil=ERROR log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=ERROR log4j.logger.org.apache.hadoop.conf.Configuration=ERROR log4j.logger.org.apache.hadoop.fs.FileSystem=ERROR log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=ERROR log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR log4j.logger.org.apache.hadoop.mapred.JobConf=ERROR log4j.logger.org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedPartitioner=ERROR log4j.logger.org.apache.hadoop.metrics2.impl.MetricsSystemImpl=ERROR log4j.logger.org.apache.hadoop.metrics2.lib.Interns=ERROR log4j.logger.org.apache.hadoop.metrics2.lib.MetricsSourceBuilder=ERROR log4j.logger.org.apache.hadoop.metrics2.lib.MutableMetricsFactory=ERROR log4j.logger.org.apache.hadoop.security.authentication.util.KerberosName=ERROR log4j.logger.org.apache.hadoop.security.Groups=ERROR log4j.logger.org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback=ERROR 
log4j.logger.org.apache.hadoop.security.SecurityUtil=ERROR log4j.logger.org.apache.hadoop.security.ShellBasedUnixGroupsMapping=ERROR log4j.logger.org.apache.hadoop.security.UserGroupInformation=ERROR log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR log4j.logger.org.apache.hadoop.util.ShutdownHookManager=ERROR log4j.logger.org.apache.spark.broadcast.TorrentBroadcast=ERROR log4j.logger.org.apache.spark.ContextCleaner=ERROR log4j.logger.org.apache.spark.executor.Executor=ERROR log4j.logger.org.apache.spark.HeartbeatReceiver=ERROR log4j.logger.org.apache.spark.HttpFileServer=ERROR log4j.logger.org.apache.spark.HttpServer=ERROR log4j.logger.org.apache.spark.MapOutputTrackerMaster=ERROR log4j.logger.org.apache.spark.MapOutputTrackerMasterEndpoint=ERROR log4j.logger.org.apache.spark.metrics.MetricsSystem=ERROR log4j.logger.org.apache.spark.network.client.TransportClientFactory=ERROR log4j.logger.org.apache.spark.network.netty.NettyBlockTransferService=ERROR log4j.logger.org.apache.spark.network.protocol.MessageDecoder=ERROR log4j.logger.org.apache.spark.network.protocol.MessageEncoder=ERROR log4j.logger.org.apache.spark.network.server.OneForOneStreamManager=ERROR log4j.logger.org.apache.spark.network.server.TransportServer=ERROR log4j.logger.org.apache.spark.network.TransportContext=ERROR log4j.logger.org.apache.spark.network.util.JavaUtils=ERROR log4j.logger.org.apache.spark.rdd.CoGroupedRDD=ERROR log4j.logger.org.apache.spark.rdd.SubtractedRDD=ERROR log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR log4j.logger.org.apache.spark.rpc.akka.AkkaRpcEnv$$anonfun$actorRef$lzycompute$1$1$$anon$1=ERROR log4j.logger.org.apache.spark.scheduler.DAGScheduler=ERROR log4j.logger.org.apache.spark.scheduler.OutputCommitCoordinator$OutputCommitCoordinatorEndpoint=ERROR log4j.logger.org.apache.spark.scheduler.TaskSchedulerImpl=ERROR log4j.logger.org.apache.spark.scheduler.TaskSetManager=ERROR 
log4j.logger.org.apache.spark.SecurityManager=ERROR log4j.logger.org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter=ERROR log4j.logger.org.apache.spark.SparkContext=ERROR log4j.logger.org.apache.spark.SparkEnv=ERROR log4j.logger.org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.package$ExpressionCanonicalizer=ERROR log4j.logger.org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer=ERROR log4j.logger.org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys=ERROR log4j.logger.org.apache.spark.sql.execution.aggregate.SortBasedAggregate=ERROR log4j.logger.org.apache.spark.sql.execution.aggregate.TungstenAggregate=ERROR log4j.logger.org.apache.spark.sql.execution.Exchange=ERROR log4j.logger.org.apache.spark.sql.execution.joins.ShuffledHashOuterJoin=ERROR log4j.logger.org.apache.spark.sql.SQLContext$$anon$1=ERROR log4j.logger.org.apache.spark.sql.SQLContext$$anon$2=ERROR log4j.logger.org.apache.spark.SSLOptions=ERROR log4j.logger.org.apache.spark.storage.BlockManager=ERROR log4j.logger.org.apache.spark.storage.BlockManagerInfo=ERROR log4j.logger.org.apache.spark.storage.BlockManagerMaster=ERROR log4j.logger.org.apache.spark.storage.BlockManagerMasterEndpoint=ERROR log4j.logger.org.apache.spark.storage.BlockManagerSlaveEndpoint=ERROR 
log4j.logger.org.apache.spark.storage.DiskBlockManager=ERROR log4j.logger.org.apache.spark.storage.MemoryStore=ERROR log4j.logger.org.apache.spark.storage.ShuffleBlockFetcherIterator=ERROR log4j.logger.org.apache.spark.ui.SparkUI=ERROR log4j.logger.org.apache.spark.unsafe.map.BytesToBytesMap=ERROR log4j.logger.org.apache.spark.unsafe.memory.TaskMemoryManager=ERROR log4j.logger.org.apache.spark.util.AkkaUtils=ERROR log4j.logger.org.apache.spark.util.ClosureCleaner=ERROR log4j.logger.org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter=ERROR log4j.logger.org.apache.spark.util.Utils=ERROR log4j.logger.org.apache.spark=ERROR log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR log4j.logger.org.eclipse.jetty=ERROR log4j.logger.org.spark-project.jetty.http.AbstractGenerator=ERROR log4j.logger.org.spark-project.jetty.http.HttpGenerator=ERROR log4j.logger.org.spark-project.jetty.http.MimeTypes=ERROR log4j.logger.org.spark-project.jetty.io.AbstractBuffer=ERROR log4j.logger.org.spark-project.jetty.io.nio=ERROR log4j.logger.org.spark-project.jetty.server.AbstractConnector=ERROR log4j.logger.org.spark-project.jetty.server.bio.SocketConnector=ERROR log4j.logger.org.spark-project.jetty.server.handler.AbstractHandler=ERROR log4j.logger.org.spark-project.jetty.server.handler.ContextHandler=ERROR log4j.logger.org.spark-project.jetty.server.handler.ContextHandlerCollection=ERROR log4j.logger.org.spark-project.jetty.server.handler.DefaultHandler=ERROR log4j.logger.org.spark-project.jetty.server.handler.ErrorHandler=ERROR log4j.logger.org.spark-project.jetty.server.handler.GzipHandler=ERROR log4j.logger.org.spark-project.jetty.server.handler.ResourceHandler=ERROR log4j.logger.org.spark-project.jetty.server.Server=ERROR log4j.logger.org.spark-project.jetty.server=ERROR log4j.logger.org.spark-project.jetty.servlet.DefaultServlet=ERROR log4j.logger.org.spark-project.jetty.servlet.Holder=ERROR log4j.logger.org.spark-project.jetty.servlet.ServletHandler=ERROR 
log4j.logger.org.spark-project.jetty.servlet.ServletHolder=ERROR log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR log4j.logger.org.spark-project.jetty.util.component.AggregateLifeCycle=ERROR log4j.logger.org.spark-project.jetty.util.component.Container=ERROR log4j.logger.org.spark-project.jetty.util.IO=ERROR log4j.logger.org.spark-project.jetty.util.log=ERROR log4j.logger.org.spark-project.jetty.util.resource.FileResource=ERROR log4j.logger.org.spark-project.jetty.util.resource.JarFileResource=ERROR log4j.logger.org.spark-project.jetty.util.resource.JarResource=ERROR log4j.logger.org.spark-project.jetty.util.resource.Resource=ERROR log4j.logger.org.spark-project.jetty.util.resource.URLResource=ERROR log4j.logger.org.spark-project.jetty.util.StringUtil=ERROR log4j.logger.org.spark-project.jetty.util.thread.QueuedThreadPool=ERROR log4j.logger.org.spark-project.jetty.util.thread.Timeout=ERROR log4j.logger.org.spark-project.jetty=ERROR log4j.logger.Remoting=ERROR ================================================ FILE: cats/src/test/resources/log4j2.properties ================================================ # Set to debug or trace if log4j initialization is failing status = warn # Name of the configuration name = ConsoleAppender # Console appender configuration appender.console.type = Console appender.console.name = consoleLogger appender.console.layout.type = PatternLayout appender.console.layout.pattern = %d{YYYY-MM-dd HH:mm:ss} [%t] %-5p %c:%L - %m%n appender.console.target = SYSTEM_OUT # Root logger level rootLogger.level = error # Root logger referring to console appender rootLogger.appenderRef.stdout.ref = consoleLogger logger.spark.name = org.apache.spark logger.spark.level = warn logger.hadoop.name = org.apache.hadoop logger.hadoop.level = warn ================================================ FILE: cats/src/test/scala/frameless/cats/FramelessSyntaxTests.scala ================================================ package frameless package 
cats import _root_.cats.data.ReaderT import _root_.cats.effect.IO import _root_.cats.effect.unsafe.implicits.global import org.apache.spark.sql.SparkSession import org.scalatest.matchers.should.Matchers import org.scalacheck.{Test => PTest} import org.scalacheck.Prop, Prop._ import org.scalacheck.effect.PropF, PropF._ class FramelessSyntaxTests extends TypedDatasetSuite with Matchers { override val sparkDelay = null def prop[A, B](data: Vector[X2[A, B]])( implicit ev: TypedEncoder[X2[A, B]] ): Prop = { import implicits._ val dataset = TypedDataset.create(data).dataset val dataframe = dataset.toDF() val typedDataset = dataset.typed val typedDatasetFromDataFrame = dataframe.unsafeTyped[X2[A, B]] typedDataset.collect[IO]().unsafeRunSync().toVector ?= typedDatasetFromDataFrame.collect[IO]().unsafeRunSync().toVector } test("dataset typed - toTyped") { check(forAll(prop[Int, String] _)) } test("properties can be read back") { import implicits._ import _root_.cats.syntax.all._ forAllF { (k: String, v: String) => val scopedKey = "frameless.tests." 
+ k 1 .pure[ReaderT[IO, SparkSession, *]] .withLocalProperty(scopedKey, v) .withGroupId(v) .withDescription(v) .run(session) .map { _ => sc.getLocalProperty(scopedKey) shouldBe v sc.getLocalProperty("spark.jobGroup.id") shouldBe v sc.getLocalProperty("spark.job.description") shouldBe v }.void }.check().unsafeRunSync().status shouldBe PTest.Passed } } ================================================ FILE: cats/src/test/scala/frameless/cats/test.scala ================================================ package frameless package cats import _root_.cats.Foldable import _root_.cats.syntax.all._ import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext => SC} import org.scalatest.compatible.Assertion import org.scalactic.anyvals.PosInt import org.scalacheck.Arbitrary import org.scalatestplus.scalacheck.ScalaCheckPropertyChecks import Arbitrary._ import scala.collection.immutable.SortedMap import scala.reflect.ClassTag import org.scalatest.matchers.should.Matchers import org.scalatest.propspec.AnyPropSpec trait SparkTests { val appID: String = new java.util.Date().toString + math.floor(math.random() * 10E4).toLong.toString val conf: SparkConf = new SparkConf() .setMaster("local[*]") .setAppName("test") .set("spark.ui.enabled", "false") .set("spark.app.id", appID) implicit def session: SparkSession = SparkSession.builder().config(conf).getOrCreate() implicit def sc: SparkContext = session.sparkContext implicit class seqToRdd[A: ClassTag](seq: Seq[A])(implicit sc: SC) { def toRdd: RDD[A] = sc.makeRDD(seq) } } object Tests { def innerPairwise(mx: Map[String, Int], my: Map[String, Int], check: (Any, Any) => Assertion)(implicit sc: SC): Assertion = { import frameless.cats.implicits._ import frameless.cats.inner._ val xs = sc.parallelize(mx.toSeq) val ys = sc.parallelize(my.toSeq) val mz0 = (xs |+| ys).collectAsMap() val mz1 = (xs join ys).mapValues { case (x, y) => x |+| y 
}.collectAsMap() val mz2 = (for { (k, x) <- mx; y <- my.get(k) } yield (k, x + y)).toMap check(mz0, mz1) check(mz1, mz2) val zs = sc.parallelize(mx.values.toSeq) check(xs.csumByKey.collectAsMap(), mx) check(zs.csum, zs.collect().sum) if (mx.nonEmpty) { check(xs.cminByKey.collectAsMap(), mx) check(xs.cmaxByKey.collectAsMap(), mx) check(zs.cmin, zs.collect().min) check(zs.cmax, zs.collect().max) } else check(1, 1) } } class Test extends AnyPropSpec with Matchers with ScalaCheckPropertyChecks with SparkTests { implicit override val generatorDrivenConfig = PropertyCheckConfiguration(minSize = PosInt(10)) property("spark is working") { sc.parallelize(Seq(1, 2, 3)).collect() shouldBe Array(1,2,3) } property("inner pairwise monoid") { // Make sure we have non-empty map forAll { (xh: (String, Int), mx: Map[String, Int], yh: (String, Int), my: Map[String, Int]) => Tests.innerPairwise(mx + xh, my + yh, _ shouldBe _) } } property("rdd simple numeric commutative semigroup") { import frameless.cats.implicits._ forAll { seq: List[Int] => val expectedSum = if (seq.isEmpty) None else Some(seq.sum) val expectedMin = if (seq.isEmpty) None else Some(seq.min) val expectedMax = if (seq.isEmpty) None else Some(seq.max) val rdd = seq.toRdd rdd.cmin shouldBe expectedMin.getOrElse(0) rdd.cminOption shouldBe expectedMin rdd.cmax shouldBe expectedMax.getOrElse(0) rdd.cmaxOption shouldBe expectedMax rdd.csum shouldBe expectedSum.getOrElse(0) rdd.csumOption shouldBe expectedSum } } property("rdd of SortedMap[Int,Int] commutative monoid") { import frameless.cats.implicits._ forAll { seq: List[SortedMap[Int, Int]] => val rdd = seq.toRdd rdd.csum shouldBe Foldable[List].fold(seq) } } property("rdd tuple commutative semigroup example") { import frameless.cats.implicits._ forAll { seq: List[(Int, Int)] => val expectedSum = if (seq.isEmpty) None else Some(Foldable[List].fold(seq)) val rdd = seq.toRdd rdd.csum shouldBe expectedSum.getOrElse(0 -> 0) rdd.csumOption shouldBe expectedSum } } 
property("pair rdd numeric commutative semigroup example") { import frameless.cats.implicits._ val seq = Seq( ("a",2), ("b",3), ("d",6), ("b",2), ("d",1) ) val rdd = seq.toRdd rdd.cminByKey.collect().toSeq should contain theSameElementsAs Seq( ("a",2), ("b",2), ("d",1) ) rdd.cmaxByKey.collect().toSeq should contain theSameElementsAs Seq( ("a",2), ("b",3), ("d",6) ) rdd.csumByKey.collect().toSeq should contain theSameElementsAs Seq( ("a",2), ("b",5), ("d",7) ) } } ================================================ FILE: core/src/main/scala/frameless/CatalystAverageable.scala ================================================ package frameless import scala.annotation.implicitNotFound /** * When averaging Spark doesn't change these types: * - BigDecimal -> BigDecimal * - Double -> Double * But it changes these types : * - Int -> Double * - Short -> Double * - Long -> Double */ @implicitNotFound("Cannot compute average of type ${In}.") trait CatalystAverageable[In, Out] object CatalystAverageable { private[this] val theInstance = new CatalystAverageable[Any, Any] {} private[this] def of[In, Out]: CatalystAverageable[In, Out] = theInstance.asInstanceOf[CatalystAverageable[In, Out]] implicit val framelessAverageableBigDecimal: CatalystAverageable[BigDecimal, BigDecimal] = of[BigDecimal, BigDecimal] implicit val framelessAverageableDouble: CatalystAverageable[Double, Double] = of[Double, Double] implicit val framelessAverageableLong: CatalystAverageable[Long, Double] = of[Long, Double] implicit val framelessAverageableInt: CatalystAverageable[Int, Double] = of[Int, Double] implicit val framelessAverageableShort: CatalystAverageable[Short, Double] = of[Short, Double] } ================================================ FILE: core/src/main/scala/frameless/CatalystBitShift.scala ================================================ package frameless import scala.annotation.implicitNotFound /** Spark does not return always Int on shift */ @implicitNotFound("Cannot do bit shift 
operations on columns of type ${In}.")
trait CatalystBitShift[In, Out]

object CatalystBitShift {
  // One untyped instance backs every evidence value below. The trait declares
  // no members, so the cast in `of` only changes the static type.
  private[this] val theInstance = new CatalystBitShift[Any, Any] {}
  private[this] def of[In, Out]: CatalystBitShift[In, Out] = theInstance.asInstanceOf[CatalystBitShift[In, Out]]

  // NOTE(review): apart from `framelessBitShiftBigDecimal`, the val names do not
  // match the `In` type they are declared for (`...Double` is evidence for Byte,
  // `...Int` for Short, `...Long` for Int, `...Short` for Long). The declared
  // types are what matters for implicit lookup; the names are public members,
  // so they are left unchanged here.
  implicit val framelessBitShiftBigDecimal: CatalystBitShift[BigDecimal, Int] = of[BigDecimal, Int]
  implicit val framelessBitShiftDouble    : CatalystBitShift[Byte, Int]       = of[Byte, Int]
  implicit val framelessBitShiftInt       : CatalystBitShift[Short, Int]      = of[Short, Int]
  implicit val framelessBitShiftLong      : CatalystBitShift[Int, Int]        = of[Int, Int]
  implicit val framelessBitShiftShort     : CatalystBitShift[Long, Long]      = of[Long, Long]
}

================================================
FILE: core/src/main/scala/frameless/CatalystBitwise.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/**
 * Types that can be bitwise ORed, ANDed, or XORed by Catalyst.
 * Note that Catalyst requires that when performing bitwise operations between columns
 * the two types must be the same so in some cases casting is necessary.
*/ @implicitNotFound("Cannot do bitwise operations on columns of type ${A}.") trait CatalystBitwise[A] extends CatalystNumeric[A] object CatalystBitwise { private[this] val theInstance = new CatalystBitwise[Any] {} private[this] def of[A]: CatalystBitwise[A] = theInstance.asInstanceOf[CatalystBitwise[A]] implicit val framelessbyteBitwise: CatalystBitwise[Byte] = of[Byte] implicit val framelessshortBitwise: CatalystBitwise[Short] = of[Short] implicit val framelessintBitwise: CatalystBitwise[Int] = of[Int] implicit val framelesslongBitwise: CatalystBitwise[Long] = of[Long] } ================================================ FILE: core/src/main/scala/frameless/CatalystCast.scala ================================================ package frameless trait CatalystCast[A, B] object CatalystCast { private[this] val theInstance = new CatalystCast[Any, Any] {} private[this] def of[A, B]: CatalystCast[A, B] = theInstance.asInstanceOf[CatalystCast[A, B]] implicit def framelessCastToString[T]: CatalystCast[T, String] = of[T, String] implicit def framelessNumericToLong [A: CatalystNumeric]: CatalystCast[A, Long] = of[A, Long] implicit def framelessNumericToInt [A: CatalystNumeric]: CatalystCast[A, Int] = of[A, Int] implicit def framelessNumericToShort [A: CatalystNumeric]: CatalystCast[A, Short] = of[A, Short] implicit def framelessNumericToByte [A: CatalystNumeric]: CatalystCast[A, Byte] = of[A, Byte] implicit def framelessNumericToDecimal[A: CatalystNumeric]: CatalystCast[A, BigDecimal] = of[A, BigDecimal] implicit def framelessNumericToDouble [A: CatalystNumeric]: CatalystCast[A, Double] = of[A, Double] implicit def framelessBooleanToNumeric[A: CatalystNumeric]: CatalystCast[Boolean, A] = of[Boolean, A] // doesn't make any sense to include: // - sqlDateToBoolean: always None // - sqlTimestampToBoolean: compares us to 0 implicit val framelessStringToBoolean : CatalystCast[String, Option[Boolean]] = of[String, Option[Boolean]] implicit val framelessLongToBoolean : 
CatalystCast[Long, Boolean] = of[Long, Boolean] implicit val framelessIntToBoolean : CatalystCast[Int, Boolean] = of[Int, Boolean] implicit val framelessShortToBoolean : CatalystCast[Short, Boolean] = of[Short, Boolean] implicit val framelessByteToBoolean : CatalystCast[Byte, Boolean] = of[Byte, Boolean] implicit val framelessBigDecimalToBoolean: CatalystCast[BigDecimal, Boolean] = of[BigDecimal, Boolean] implicit val framelessDoubleToBoolean : CatalystCast[Double, Boolean] = of[Double, Boolean] // TODO // needs verification, does it make sense to include? probably better as a separate function // implicit object stringToInt extends CatalystCast[String, Option[Int]] // implicit object stringToShort extends CatalystCast[String, Option[Short]] // implicit object stringToByte extends CatalystCast[String, Option[Byte]] // implicit object stringToDecimal extends CatalystCast[String, Option[BigDecimal]] // implicit object stringToLong extends CatalystCast[String, Option[Long]] // implicit object stringToSqlDate extends CatalystCast[String, Option[SQLDate]] // needs verification: //implicit object sqlTimestampToSqlDate extends CatalystCast[SQLTimestamp, SQLDate] // needs verification: // implicit object sqlTimestampToDecimal extends CatalystCast[SQLTimestamp, BigDecimal] // implicit object sqlTimestampToLong extends CatalystCast[SQLTimestamp, Long] // needs verification: // implicit object stringToSqlTimestamp extends CatalystCast[String, SQLTimestamp] // implicit object longToSqlTimestamp extends CatalystCast[Long, SQLTimestamp] // implicit object intToSqlTimestamp extends CatalystCast[Int, SQLTimestamp] // implicit object doubleToSqlTimestamp extends CatalystCast[Double, SQLTimestamp] // implicit object floatToSqlTimestamp extends CatalystCast[Float, SQLTimestamp] // implicit object bigDecimalToSqlTimestamp extends CatalystCast[BigDecimal, SQLTimestamp] // implicit object sqlDateToSqlTimestamp extends CatalystCast[SQLDate, SQLTimestamp] // doesn't make sense to include: 
// - booleanToSqlTimestamp: 1L or 0L // - shortToSqlTimestamp: ??? // - byteToSqlTimestamp: ??? // doesn't make sense to include: // - sqlDateToLong: always None // - sqlDateToInt: always None // - sqlDateToInt: always None // - sqlDateToInt: always None // - sqlDateToInt: always None // doesn't make sense to include: // - sqlTimestampToInt: useful? can be done through `-> Long -> Int` // - sqlTimestampToShort: useful? can be done through `-> Long -> Int` // - sqlTimestampToShort: useful? can be done through `-> Long -> Int` } ================================================ FILE: core/src/main/scala/frameless/CatalystCollection.scala ================================================ package frameless import scala.annotation.implicitNotFound @implicitNotFound("Cannot do collection operations on columns of type ${C}.") trait CatalystCollection[C[_]] object CatalystCollection { private[this] val theInstance = new CatalystCollection[Any] {} private[this] def of[A[_]]: CatalystCollection[A] = theInstance.asInstanceOf[CatalystCollection[A]] implicit val arrayObject : CatalystCollection[Array] = of[Array] implicit val seqObject : CatalystCollection[Seq] = of[Seq] implicit val listObject : CatalystCollection[List] = of[List] implicit val vectorObject: CatalystCollection[Vector] = of[Vector] } ================================================ FILE: core/src/main/scala/frameless/CatalystDivisible.scala ================================================ package frameless import scala.annotation.implicitNotFound /** Spark divides everything as Double, except BigDecimals, which are divided into * another BigDecimal, benefiting from some added precision. 
*/
@implicitNotFound("Cannot compute division on type ${In}.")
trait CatalystDivisible[In, Out]

object CatalystDivisible {
  // One untyped instance backs every evidence value below. The trait declares
  // no members, so the cast in `of` only changes the static type.
  private[this] val theInstance = new CatalystDivisible[Any, Any] {}
  private[this] def of[In, Out]: CatalystDivisible[In, Out] = theInstance.asInstanceOf[CatalystDivisible[In, Out]]

  // Evidence pairs `In -> Out`: BigDecimal maps to BigDecimal, every other
  // listed input type maps to Double.
  implicit val framelessDivisibleBigDecimal: CatalystDivisible[BigDecimal, BigDecimal] = of[BigDecimal, BigDecimal]
  implicit val framelessDivisibleDouble    : CatalystDivisible[Double, Double]         = of[Double, Double]
  implicit val framelessDivisibleInt       : CatalystDivisible[Int, Double]            = of[Int, Double]
  implicit val framelessDivisibleLong      : CatalystDivisible[Long, Double]           = of[Long, Double]
  implicit val framelessDivisibleByte      : CatalystDivisible[Byte, Double]           = of[Byte, Double]
  implicit val framelessDivisibleShort     : CatalystDivisible[Short, Double]          = of[Short, Double]
}

================================================
FILE: core/src/main/scala/frameless/CatalystIsin.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/** Types for which an `isin` check can be performed on a column. */
@implicitNotFound("Cannot do isin operation on columns of type ${A}.")
trait CatalystIsin[A]

object CatalystIsin {
  // Each supported type gets its own implicit object as evidence.
  implicit object framelessBigDecimal extends CatalystIsin[BigDecimal]
  implicit object framelessByte       extends CatalystIsin[Byte]
  implicit object framelessDouble     extends CatalystIsin[Double]
  implicit object framelessFloat      extends CatalystIsin[Float]
  implicit object framelessInt        extends CatalystIsin[Int]
  implicit object framelessLong       extends CatalystIsin[Long]
  implicit object framelessShort      extends CatalystIsin[Short]
  // NOTE(review): the name is spelled with three `s`. It is a public member,
  // so the spelling is left unchanged here.
  implicit object framelesssString    extends CatalystIsin[String]
}

================================================
FILE: core/src/main/scala/frameless/CatalystNaN.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/** Spark does NaN check only for these types */
@implicitNotFound("Columns of type ${A} cannot be NaN.")
trait CatalystNaN[A]

object CatalystNaN {
  // One untyped instance backs both evidence values below. The trait declares
  // no members, so the cast in `of` only changes the static type.
  private[this] val theInstance = new CatalystNaN[Any] {}
  private[this] def of[A]: CatalystNaN[A] = theInstance.asInstanceOf[CatalystNaN[A]]

  implicit val framelessFloatNaN  : CatalystNaN[Float]  = of[Float]
  implicit val framelessDoubleNaN : CatalystNaN[Double] = of[Double]
}

================================================
FILE: core/src/main/scala/frameless/CatalystNotNullable.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/** Evidence that `A` is nullable; the only instance provided here is for `Option[A]`. */
@implicitNotFound("Cannot find evidence that type ${A} is nullable. Currently, only Option[A] is nullable.")
trait CatalystNullable[A]

object CatalystNullable {
  implicit def optionIsNullable[A]: CatalystNullable[Option[A]] = new CatalystNullable[Option[A]] {}
}

/** Evidence that `A` is not nullable, i.e. that no `CatalystNullable[A]` applies. */
@implicitNotFound("Cannot find evidence that type ${A} is not nullable.")
trait NotCatalystNullable[A]

object NotCatalystNullable {
  // Both implicits below produce a `NotCatalystNullable[A]`. The first applies
  // to every `A`; the second applies only when a `CatalystNullable[A]` exists.
  // NOTE(review): judging by the name `nullableIsNotNotNullable`, the overlap is
  // intended to make the implicit search ambiguous (and therefore fail) for
  // nullable types - confirm against usages.
  implicit def everythingIsNotNullable[A]: NotCatalystNullable[A] = new NotCatalystNullable[A] {}
  implicit def nullableIsNotNotNullable[A: CatalystNullable]: NotCatalystNullable[A] = new NotCatalystNullable[A] {}
}

================================================
FILE: core/src/main/scala/frameless/CatalystNumeric.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/** Types that can be added, subtracted and multiplied by Catalyst.
*/ @implicitNotFound("Cannot do numeric operations on columns of type ${A}.") trait CatalystNumeric[A] object CatalystNumeric { private[this] val theInstance = new CatalystNumeric[Any] {} private[this] def of[A]: CatalystNumeric[A] = theInstance.asInstanceOf[CatalystNumeric[A]] implicit val framelessbigDecimalNumeric: CatalystNumeric[BigDecimal] = of[BigDecimal] implicit val framelessbyteNumeric : CatalystNumeric[Byte] = of[Byte] implicit val framelessdoubleNumeric : CatalystNumeric[Double] = of[Double] implicit val framelessintNumeric : CatalystNumeric[Int] = of[Int] implicit val framelesslongNumeric : CatalystNumeric[Long] = of[Long] implicit val framelessshortNumeric : CatalystNumeric[Short] = of[Short] } ================================================ FILE: core/src/main/scala/frameless/CatalystNumericWithJavaBigDecimal.scala ================================================ package frameless import scala.annotation.implicitNotFound /** Spark does not return always the same type as the input was for example abs */ @implicitNotFound("Cannot compute on type ${In}.") trait CatalystNumericWithJavaBigDecimal[In, Out] object CatalystNumericWithJavaBigDecimal { private[this] val theInstance = new CatalystNumericWithJavaBigDecimal[Any, Any] {} private[this] def of[In, Out]: CatalystNumericWithJavaBigDecimal[In, Out] = theInstance.asInstanceOf[CatalystNumericWithJavaBigDecimal[In, Out]] implicit val framelessAbsoluteBigDecimal: CatalystNumericWithJavaBigDecimal[BigDecimal, java.math.BigDecimal] = of[BigDecimal, java.math.BigDecimal] implicit val framelessAbsoluteDouble : CatalystNumericWithJavaBigDecimal[Double, Double] = of[Double, Double] implicit val framelessAbsoluteInt : CatalystNumericWithJavaBigDecimal[Int, Int] = of[Int, Int] implicit val framelessAbsoluteLong : CatalystNumericWithJavaBigDecimal[Long, Long] = of[Long, Long] implicit val framelessAbsoluteShort : CatalystNumericWithJavaBigDecimal[Short, Short] = of[Short, Short] implicit val framelessAbsoluteByte 
: CatalystNumericWithJavaBigDecimal[Byte, Byte] = of[Byte, Byte] } ================================================ FILE: core/src/main/scala/frameless/CatalystOrdered.scala ================================================ package frameless import scala.annotation.implicitNotFound import shapeless.{Generic, HList, Lazy} import shapeless.ops.hlist.LiftAll import java.time.{Duration, Instant, Period} /** Types that can be ordered/compared by Catalyst. */ @implicitNotFound("Cannot compare columns of type ${A}.") trait CatalystOrdered[A] object CatalystOrdered { private[this] val theInstance = new CatalystOrdered[Any] {} private[this] def of[A]: CatalystOrdered[A] = theInstance.asInstanceOf[CatalystOrdered[A]] implicit val framelessIntOrdered : CatalystOrdered[Int] = of[Int] implicit val framelessBooleanOrdered : CatalystOrdered[Boolean] = of[Boolean] implicit val framelessByteOrdered : CatalystOrdered[Byte] = of[Byte] implicit val framelessShortOrdered : CatalystOrdered[Short] = of[Short] implicit val framelessLongOrdered : CatalystOrdered[Long] = of[Long] implicit val framelessFloatOrdered : CatalystOrdered[Float] = of[Float] implicit val framelessDoubleOrdered : CatalystOrdered[Double] = of[Double] implicit val framelessBigDecimalOrdered : CatalystOrdered[BigDecimal] = of[BigDecimal] implicit val framelessSQLDateOrdered : CatalystOrdered[SQLDate] = of[SQLDate] implicit val framelessSQLTimestampOrdered: CatalystOrdered[SQLTimestamp] = of[SQLTimestamp] implicit val framelessStringOrdered : CatalystOrdered[String] = of[String] implicit val framelessInstantOrdered : CatalystOrdered[Instant] = of[Instant] implicit val framelessDurationOrdered : CatalystOrdered[Duration] = of[Duration] implicit val framelessPeriodOrdered : CatalystOrdered[Period] = of[Period] implicit def injectionOrdered[A, B] (implicit i0: Injection[A, B], i1: CatalystOrdered[B] ): CatalystOrdered[A] = of[A] implicit def deriveGeneric[G, H <: HList] (implicit i0: Generic.Aux[G, H], i1: 
Lazy[LiftAll[CatalystOrdered, H]] ): CatalystOrdered[G] = of[G] } ================================================ FILE: core/src/main/scala/frameless/CatalystPivotable.scala ================================================ package frameless import scala.annotation.implicitNotFound @implicitNotFound("Cannot pivot on type ${A}. Currently supported types to pivot are {Int, Long, Boolean, and String}.") trait CatalystPivotable[A] object CatalystPivotable { private[this] val theInstance = new CatalystPivotable[Any] {} private[this] def of[A]: CatalystPivotable[A] = theInstance.asInstanceOf[CatalystPivotable[A]] implicit val framelessIntPivotable : CatalystPivotable[Int] = of[Int] implicit val framelessLongPivotable : CatalystPivotable[Long] = of[Long] implicit val framelessBooleanPivotable: CatalystPivotable[Boolean] = of[Boolean] implicit val framelessStringPivotable : CatalystPivotable[String] = of[String] } ================================================ FILE: core/src/main/scala/frameless/CatalystRound.scala ================================================ package frameless import scala.annotation.implicitNotFound /** Spark does not return always long on round */ @implicitNotFound("Cannot compute round on type ${In}.") trait CatalystRound[In, Out] object CatalystRound { private[this] val theInstance = new CatalystRound[Any, Any] {} private[this] def of[In, Out]: CatalystRound[In, Out] = theInstance.asInstanceOf[CatalystRound[In, Out]] implicit val framelessBigDecimal: CatalystRound[BigDecimal, java.math.BigDecimal] = of[BigDecimal, java.math.BigDecimal] implicit val framelessDouble : CatalystRound[Double, Long] = of[Double, Long] implicit val framelessInt : CatalystRound[Int, Long] = of[Int, Long] implicit val framelessLong : CatalystRound[Long, Long] = of[Long, Long] implicit val framelessShort : CatalystRound[Short, Long] = of[Short, Long] } ================================================ FILE: core/src/main/scala/frameless/CatalystSummable.scala 
================================================
package frameless

import scala.annotation.implicitNotFound

/**
  * When summing Spark doesn't change these types:
  * - Long       -> Long
  * - BigDecimal -> BigDecimal
  * - Double     -> Double
  *
  * For other types there are conversions:
  * - Int        -> Long
  * - Short      -> Long
  */
@implicitNotFound("Cannot compute sum of type ${In}.")
trait CatalystSummable[In, Out] {
  /** The `In` value this evidence was created with; every instance below passes a literal zero. */
  def zero: In
}

object CatalystSummable {
  /** Builds evidence that `In` sums to `Out`, carrying the given `zero` value.
    *
    * @param zero value returned by the instance's `zero` member
    */
  def apply[In, Out](zero: In): CatalystSummable[In, Out] = {
    // Copy the parameter under another name: inside the anonymous class below,
    // `zero` refers to the member being defined, not to this parameter.
    val _zero = zero
    new CatalystSummable[In, Out] { val zero: In = _zero }
  }

  implicit val framelessSummableLong      : CatalystSummable[Long, Long]             = CatalystSummable(zero = 0L)
  implicit val framelessSummableBigDecimal: CatalystSummable[BigDecimal, BigDecimal] = CatalystSummable(zero = BigDecimal(0))
  implicit val framelessSummableDouble    : CatalystSummable[Double, Double]         = CatalystSummable(zero = 0.0)
  implicit val framelessSummableInt       : CatalystSummable[Int, Long]              = CatalystSummable(zero = 0)
  implicit val framelessSummableShort     : CatalystSummable[Short, Long]            = CatalystSummable(zero = 0)
}

================================================
FILE: core/src/main/scala/frameless/CatalystVariance.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/**
  * Spark's variance and stddev functions always return Double
  */
@implicitNotFound("Cannot compute variance on type ${A}.")
trait CatalystVariance[A]

object CatalystVariance {
  // One untyped instance backs every evidence value below. The trait declares
  // no members, so the cast in `of` only changes the static type.
  private[this] val theInstance = new CatalystVariance[Any] {}
  private[this] def of[A]: CatalystVariance[A] = theInstance.asInstanceOf[CatalystVariance[A]]

  implicit val framelessIntVariance       : CatalystVariance[Int]        = of[Int]
  implicit val framelessLongVariance      : CatalystVariance[Long]       = of[Long]
  implicit val framelessShortVariance     : CatalystVariance[Short]      = of[Short]
  implicit val framelessBigDecimalVariance: CatalystVariance[BigDecimal] = of[BigDecimal]
  implicit val framelessDoubleVariance    :
CatalystVariance[Double] = of[Double]
}

================================================
FILE: core/src/main/scala/frameless/Injection.scala
================================================
package frameless

/**
  * An Injection[A, B] is a reversible function from A to B.
  *
  * Must obey `forAll { a: A => invert(apply(a)) == a }`.
  */
trait Injection[A, B] extends Serializable {
  /** Maps an `A` to its `B` representation. */
  def apply(a: A): B

  /** Maps a `B` back to an `A`; must undo `apply` (see the law above). */
  def invert(b: B): A
}

object Injection {
  /** Builds an [[Injection]] from a pair of functions.
    *
    * @param f the forward mapping, used for `apply`
    * @param g the backward mapping, used for `invert`
    */
  def apply[A, B](f: A => B, g: B => A): Injection[A, B] = new Injection[A, B] {
    def apply(a: A): B = f(a)
    def invert(b: B): A = g(b)
  }
}

================================================
FILE: core/src/main/scala/frameless/SQLDate.scala
================================================
package frameless

/**
  * Type for the internal Spark representation of SQL date. If the `spark.sql.functions` were typed,
  * [date_add][1] would for instance be defined as `def date_add(d: SQLDate, i: Int): SQLDate`.
  *
  * [1]: https://spark.apache.org/docs/2.0.2/api/java/org/apache/spark/sql/functions.html#add_months(org.apache.spark.sql.Column,%20int)
  */
case class SQLDate(days: Int)

================================================
FILE: core/src/main/scala/frameless/SQLTimestamp.scala
================================================
package frameless

/**
  * Type for the Spark internal representation of a timestamp. If the `spark.sql.functions` were typed,
  * [current_timestamp][1] would for instance be defined as `def current_timestamp(): SQLTimestamp`.
* * [1]: https://spark.apache.org/docs/1.6.2/api/java/org/apache/spark/sql/functions.html#current_timestamp() */ case class SQLTimestamp(us: Long) ================================================ FILE: dataset/src/main/scala/frameless/FramelessSyntax.scala ================================================ package frameless import org.apache.spark.sql.{Column, DataFrame, Dataset} trait FramelessSyntax { implicit class ColumnSyntax(self: Column) { def typedColumn[T, U: TypedEncoder]: TypedColumn[T, U] = new TypedColumn[T, U](self) def typedAggregate[T, U: TypedEncoder]: TypedAggregate[T, U] = new TypedAggregate[T, U](self) } implicit class DatasetSyntax[T: TypedEncoder](self: Dataset[T]) { def typed: TypedDataset[T] = TypedDataset.create[T](self) } implicit class DataframeSyntax(self: DataFrame){ def unsafeTyped[T: TypedEncoder]: TypedDataset[T] = TypedDataset.createUnsafe(self) } } ================================================ FILE: dataset/src/main/scala/frameless/InjectionEnum.scala ================================================ package frameless import shapeless._ trait InjectionEnum { implicit val cnilInjectionEnum: Injection[CNil, String] = Injection( // $COVERAGE-OFF$No value of type CNil so impossible to test _ => throw new Exception("Impossible"), // $COVERAGE-ON$ name => throw new IllegalArgumentException( s"Cannot construct a value of type CNil: $name did not match data constructor names" ) ) implicit def coproductInjectionEnum[H, T <: Coproduct]( implicit typeable: Typeable[H] , gen: Generic.Aux[H, HNil], tInjectionEnum: Injection[T, String] ): Injection[H :+: T, String] = { val dataConstructorName = typeable.describe.takeWhile(_ != '.') Injection( { case Inl(_) => dataConstructorName case Inr(t) => tInjectionEnum.apply(t) }, { name => if (name == dataConstructorName) Inl(gen.from(HNil)) else Inr(tInjectionEnum.invert(name)) } ) } implicit def genericInjectionEnum[A, R]( implicit gen: Generic.Aux[A, R], rInjectionEnum: Injection[R, String] ): 
Injection[A, String] = Injection( value => rInjectionEnum(gen.to(value)), name => gen.from(rInjectionEnum.invert(name)) ) } ================================================ FILE: dataset/src/main/scala/frameless/IsValueClass.scala ================================================ package frameless import shapeless._ import shapeless.labelled.FieldType /** Evidence that `T` is a Value class */ @annotation.implicitNotFound(msg = "${T} is not a Value class") final class IsValueClass[T] private() {} object IsValueClass { /** Provides an evidence `A` is a Value class */ implicit def apply[A <: AnyVal, G <: ::[_, HNil], H <: ::[_ <: FieldType[_ <: Symbol, _], HNil]]( implicit i0: LabelledGeneric.Aux[A, G], i1: DropUnitValues.Aux[G, H]): IsValueClass[A] = new IsValueClass[A] } ================================================ FILE: dataset/src/main/scala/frameless/Job.scala ================================================ package frameless import org.apache.spark.sql.SparkSession sealed abstract class Job[A](implicit spark: SparkSession) { self => /** Runs a new Spark job. 
*/ def run(): A def withGroupId(groupId: String): Job[A] = { withLocalProperty("spark.jobGroup.id", groupId) } def withDescription(groupId: String): Job[A] = { withLocalProperty("spark.job.description", groupId) } def withLocalProperty(key: String, value: String): Job[A] = { new Job[A] { def run(): A = { spark.sparkContext.setLocalProperty(key, value) self.run() } } } def map[B](fn: A => B): Job[B] = new Job[B]()(spark) { def run(): B = fn(Job.this.run()) } def flatMap[B](fn: A => Job[B]): Job[B] = new Job[B]()(spark) { def run(): B = fn(Job.this.run()).run() } } object Job { def apply[A](a: => A)(implicit spark: SparkSession): Job[A] = new Job[A] { def run(): A = a } implicit val framelessSparkDelayForJob: SparkDelay[Job] = new SparkDelay[Job] { def delay[A](a: => A)(implicit spark: SparkSession): Job[A] = Job(a) } } ================================================ FILE: dataset/src/main/scala/frameless/RecordEncoder.scala ================================================ package frameless import org.apache.spark.sql.FramelessInternals import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.objects.{ Invoke, NewInstance, UnwrapOption, WrapOption } import org.apache.spark.sql.types._ import shapeless._ import shapeless.labelled.FieldType import shapeless.ops.hlist.IsHCons import shapeless.ops.record.Keys import scala.reflect.ClassTag case class RecordEncoderField( ordinal: Int, name: String, encoder: TypedEncoder[_] ) trait RecordEncoderFields[T <: HList] extends Serializable { def value: List[RecordEncoderField] override def toString: String = s"""RecordEncoderFields${value.mkString("[", ", ", "]")}""" } object RecordEncoderFields { implicit def deriveRecordLast[K <: Symbol, H] (implicit key: Witness.Aux[K], head: RecordFieldEncoder[H] ): RecordEncoderFields[FieldType[K, H] :: HNil] = new RecordEncoderFields[FieldType[K, H] :: HNil] { def value: List[RecordEncoderField] = fieldEncoder[K, H] :: Nil } implicit def 
deriveRecordCons[K <: Symbol, H, T <: HList] (implicit key: Witness.Aux[K], head: RecordFieldEncoder[H], tail: RecordEncoderFields[T] ): RecordEncoderFields[FieldType[K, H] :: T] = new RecordEncoderFields[FieldType[K, H] :: T] { def value: List[RecordEncoderField] = fieldEncoder[K, H] :: tail.value.map(x => x.copy(ordinal = x.ordinal + 1)) } private def fieldEncoder[K <: Symbol, H](implicit key: Witness.Aux[K], e: RecordFieldEncoder[H]): RecordEncoderField = RecordEncoderField(0, key.value.name, e.encoder) } /** * Assists the generation of constructor call parameters from a labelled generic representation. * As Unit typed fields were removed earlier, we need to put back unit literals in the appropriate positions. * * @tparam T labelled generic representation of type fields */ trait NewInstanceExprs[T <: HList] extends Serializable { def from(exprs: List[Expression]): Seq[Expression] } object NewInstanceExprs { implicit def deriveHNil: NewInstanceExprs[HNil] = new NewInstanceExprs[HNil] { def from(exprs: List[Expression]): Seq[Expression] = Nil } implicit def deriveUnit[K <: Symbol, T <: HList] (implicit tail: NewInstanceExprs[T] ): NewInstanceExprs[FieldType[K, Unit] :: T] = new NewInstanceExprs[FieldType[K, Unit] :: T] { def from(exprs: List[Expression]): Seq[Expression] = Literal.fromObject(()) +: tail.from(exprs) } implicit def deriveNonUnit[K <: Symbol, V, T <: HList] (implicit notUnit: V =:!= Unit, tail: NewInstanceExprs[T] ): NewInstanceExprs[FieldType[K, V] :: T] = new NewInstanceExprs[FieldType[K, V] :: T] { def from(exprs: List[Expression]): Seq[Expression] = exprs.head +: tail.from(exprs.tail) } } /** * Drops fields with Unit type from labelled generic representation of types. 
*
  * @tparam L labelled generic representation of type fields
  */
trait DropUnitValues[L <: HList] extends DepFn1[L] with Serializable {
  type Out <: HList
}

object DropUnitValues {
  /** Summons the instance for `L`, keeping its `Out` type visible. */
  def apply[L <: HList](implicit dropUnitValues: DropUnitValues[L]): Aux[L, dropUnitValues.Out] = dropUnitValues

  type Aux[L <: HList, Out0 <: HList] = DropUnitValues[L] { type Out = Out0 }

  /** Base case: nothing to drop from an empty list. */
  implicit def deriveHNil[H]: Aux[HNil, HNil] = new DropUnitValues[HNil] {
    type Out = HNil
    def apply(l: HNil): Out = HNil
  }

  /** Unit-typed head: the head is dropped and only the processed tail is kept. */
  implicit def deriveUnit[K <: Symbol, T <: HList, OutT <: HList]
    (implicit
      dropUnitValues : DropUnitValues.Aux[T, OutT]
    ): Aux[FieldType[K, Unit] :: T, OutT] = new DropUnitValues[FieldType[K, Unit] :: T] {
      type Out = OutT
      def apply(l : FieldType[K, Unit] :: T): Out = dropUnitValues(l.tail)
    }

  /** Non-Unit head: the head is kept in front of the processed tail. */
  implicit def deriveNonUnit[K <: Symbol, V, T <: HList, OutH, OutT <: HList]
    (implicit
      nonUnit: V =:!= Unit,
      dropUnitValues : DropUnitValues.Aux[T, OutT]
    ): Aux[FieldType[K, V] :: T, FieldType[K, V] :: OutT] = new DropUnitValues[FieldType[K, V] :: T] {
      type Out = FieldType[K, V] :: OutT
      def apply(l : FieldType[K, V] :: T): Out = l.head :: dropUnitValues(l.tail)
    }
}

/** Encoder for a record type `F`, encoded as a struct with one entry per non-Unit field.
  *
  * @tparam F the record type
  * @tparam G labelled generic representation of `F`
  * @tparam H `G` without its Unit-typed fields
  */
class RecordEncoder[F, G <: HList, H <: HList]
  (implicit
    i0: LabelledGeneric.Aux[F, G],
    i1: DropUnitValues.Aux[G, H],
    i2: IsHCons[H],
    fields: Lazy[RecordEncoderFields[H]],
    newInstanceExprs: Lazy[NewInstanceExprs[G]],
    classTag: ClassTag[F]
  ) extends TypedEncoder[F] {
  def nullable: Boolean = false

  def jvmRepr: DataType = FramelessInternals.objectTypeFor[F]

  /** A `StructType` holding one `StructField` per field, named and typed after its encoder. */
  def catalystRepr: DataType = {
    val structFields = fields.value.value.map { field =>
      StructField(
        name = field.name,
        dataType = field.encoder.catalystRepr,
        nullable = field.encoder.nullable,
        metadata = Metadata.empty
      )
    }

    StructType(structFields)
  }

  /** Builds the struct for the object at `path`; yields a null struct when `path` is null. */
  def toCatalyst(path: Expression): Expression = {
    // the way exprs are encoded in CreateNamedStruct: name1, value1, name2, value2, ...
    // Built in a single pass over the fields instead of two maps followed by a zip.
    val exprs = fields.value.value.flatMap { field =>
      val fieldPath = Invoke(path, field.name, field.encoder.jvmRepr, Nil)
      Literal(field.name) :: field.encoder.toCatalyst(fieldPath) :: Nil
    }

    val createExpr = CreateNamedStruct(exprs)
    val nullExpr = Literal.create(null, createExpr.dataType)

    If(IsNull(path), nullExpr, createExpr)
  }

  /** Rebuilds an `F` from the struct at `path`; yields null when `path` is null.
    *
    * Fields are read by ordinal, then `newInstanceExprs` re-inserts the unit literals
    * so that the argument list matches the constructor of `F`.
    */
  def fromCatalyst(path: Expression): Expression = {
    val exprs = fields.value.value.map { field =>
      field.encoder.fromCatalyst(
        GetStructField(path, field.ordinal, Some(field.name)))
    }

    val newArgs = newInstanceExprs.value.from(exprs)
    val newExpr = NewInstance(
      classTag.runtimeClass, newArgs, jvmRepr, propagateNull = true)

    val nullExpr = Literal.create(null, jvmRepr)

    If(IsNull(path), nullExpr, newExpr)
  }
}

/** Encoder of a single record field, together with the conversions to apply
  * when the field is read from or written to its enclosing record.
  *
  * @param encoder      the encoder for the field type
  * @param jvmRepr      JVM representation of the field
  * @param fromCatalyst conversion from the Catalyst expression to the JVM value
  * @param toCatalyst   conversion from the JVM value to the Catalyst expression
  */
final class RecordFieldEncoder[T](
  val encoder: TypedEncoder[T],
  private[frameless] val jvmRepr: DataType,
  private[frameless] val fromCatalyst: Expression => Expression,
  private[frameless] val toCatalyst: Expression => Expression
) extends Serializable

object RecordFieldEncoder extends RecordFieldEncoderLowPriority {

  /**
    * Field encoder for an optional value class: the field is stored as the inner value `V`.
    *
    * @tparam F the value class
    * @tparam G the single field of the value class
    * @tparam H the single field of the value class (with guarantee it's not a `Unit` value)
    * @tparam K the key type for the fields
    * @tparam V the inner value type
    * @tparam KS the keys of `H` (a single key)
    */
  implicit def optionValueClass[F : IsValueClass, G <: ::[_, HNil], H <: ::[_ <: FieldType[_ <: Symbol, _], HNil], K <: Symbol, V, KS <: ::[_ <: Symbol, HNil]]
    (implicit
      i0: LabelledGeneric.Aux[F, G],
      i1: DropUnitValues.Aux[G, H],
      i2: IsHCons.Aux[H, _ <: FieldType[K, V], HNil],
      i3: Keys.Aux[H, KS],
      i4: IsHCons.Aux[KS, K, HNil],
      i5: TypedEncoder[V],
      i6: ClassTag[F]
    ): RecordFieldEncoder[Option[F]] = {
    val fieldName = i4.head(i3()).name
    val innerJvmRepr = ObjectType(i6.runtimeClass)

    // Option[F] => catalyst: unwrap the option, read the single field, encode it with `i5`.
    val encode: Expression => Expression = { path =>
      val value = UnwrapOption(innerJvmRepr, path)
      val javaValue = Invoke(value, fieldName, i5.jvmRepr, Nil)

      i5.toCatalyst(javaValue)
    }

    // catalyst => Option[F]: decode the inner value, instantiate `F`, wrap it in an option.
    // Defined once and shared by the encoder below and by the RecordFieldEncoder itself.
    val decode: Expression => Expression = { path =>
      val javaValue = i5.fromCatalyst(path)
      val value = NewInstance(i6.runtimeClass, Seq(javaValue), innerJvmRepr)

      WrapOption(value, innerJvmRepr)
    }

    val jvmr = ObjectType(classOf[Option[F]])

    new RecordFieldEncoder[Option[F]](
      encoder = new TypedEncoder[Option[F]] {
        val nullable = true

        val jvmRepr = jvmr

        @inline def catalystRepr: DataType = i5.catalystRepr

        def fromCatalyst(path: Expression): Expression = decode(path)

        def toCatalyst(path: Expression): Expression = encode(path)

        override def toString: String = s"RecordFieldEncoder.optionValueClass[${i6.runtimeClass.getName}]('${fieldName}', $i5)"
      },
      jvmRepr = jvmr,
      fromCatalyst = decode,
      toCatalyst = encode
    )
  }

  /**
    * Field encoder for a value class: the field is stored as the inner value `V`.
    *
    * @tparam F the value class
    * @tparam G the single field of the value class
    * @tparam H the single field of the value class (with guarantee it's not a `Unit` value)
    * @tparam K the key type for the fields
    * @tparam V the inner value type
    * @tparam KS the keys of `H` (a single key)
    */
  implicit def valueClass[F : IsValueClass, G <: ::[_, HNil], H <: ::[_ <: FieldType[_ <: Symbol, _], HNil], K <: Symbol, V, KS <: ::[_ <: Symbol, HNil]]
    (implicit
      i0: LabelledGeneric.Aux[F, G],
      i1: DropUnitValues.Aux[G, H],
      i2: IsHCons.Aux[H, _ <: FieldType[K, V], HNil],
      i3: Keys.Aux[H, KS],
      i4: IsHCons.Aux[KS, K, HNil],
      i5: TypedEncoder[V],
      i6: ClassTag[F]
    ): RecordFieldEncoder[F] = {
    val cls = i6.runtimeClass
    val jvmr = i5.jvmRepr
    val fieldName = i4.head(i3()).name

    new RecordFieldEncoder[F](
      encoder = new TypedEncoder[F] {
        def nullable: Boolean = i5.nullable

        def jvmRepr: DataType = jvmr

        def catalystRepr: DataType = i5.catalystRepr

        def fromCatalyst(path: Expression): Expression =
          i5.fromCatalyst(path)

        @inline def toCatalyst(path: Expression): Expression =
          i5.toCatalyst(path)

        override def toString: String = s"RecordFieldEncoder.valueClass[${cls.getName}]('${fieldName}', ${i5})"
      },
      jvmRepr = FramelessInternals.objectTypeFor[F],
      // Inside a record the inner value must be re-wrapped into the value class ...
      fromCatalyst = { expr: Expression =>
        NewInstance(
          cls,
          i5.fromCatalyst(expr) :: Nil,
          ObjectType(cls))
      },
      // ... and read back through the value class' single field before being encoded.
      toCatalyst = { expr: Expression =>
        i5.toCatalyst(Invoke(expr, fieldName, jvmr))
      }
    )
  }
}

private[frameless] sealed trait RecordFieldEncoderLowPriority {
  /** Fallback: any type with a [[TypedEncoder]] is used as a field as-is. */
  implicit def apply[T](implicit e: TypedEncoder[T]): RecordFieldEncoder[T] = new RecordFieldEncoder[T](e, e.jvmRepr, e.fromCatalyst, e.toCatalyst)
}

================================================
FILE: dataset/src/main/scala/frameless/SparkDelay.scala
================================================
package frameless

import org.apache.spark.sql.SparkSession

/** Type class for type constructors `F` able to hold a by-name computation that needs a `SparkSession`. */
trait SparkDelay[F[_]] {
  def delay[A](a: => A)(implicit spark: SparkSession): F[A]
}

================================================
FILE: dataset/src/main/scala/frameless/TypedColumn.scala
================================================
package frameless

import frameless.functions.{litAggr, lit => flit}
import frameless.syntax._

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.DecimalType
import org.apache.spark.sql.{Column, FramelessInternals}

import shapeless._
import shapeless.ops.record.Selector

import scala.annotation.implicitNotFound
import scala.reflect.ClassTag

import scala.language.experimental.macros

/** A Catalyst expression paired with an encoder whose value type is not tracked.
  *
  * @tparam T phantom type of the dataset the expression refers to
  */
sealed trait UntypedExpression[T] {
  def expr: Expression
  def uencoder: TypedEncoder[_]
  override def toString: String = expr.toString()
}

/** Expression used in `select`-like constructions. */
sealed class TypedColumn[T, U](expr: Expression)(
  implicit val uenc: TypedEncoder[U]
) extends AbstractTypedColumn[T, U](expr) {

  type ThisType[A, B] = TypedColumn[A, B]

  /** Builds the typed column from an untyped Spark `Column`. */
  def this(column: Column)(implicit uencoder: TypedEncoder[U]) =
    this(FramelessInternals.expr(column))

  override def typed[W, U1: TypedEncoder](c: Column): TypedColumn[W, U1] = c.typedColumn

  override def lit[U1: TypedEncoder](c: U1): TypedColumn[T, U1] = flit(c)
}

/** Expression used in `agg`-like constructions.
*/
sealed class TypedAggregate[T, U](expr: Expression)(
  implicit val uenc: TypedEncoder[U]
) extends AbstractTypedColumn[T, U](expr) {

  type ThisType[A, B] = TypedAggregate[A, B]

  /** Builds the typed aggregate from an untyped Spark `Column`. */
  def this(column: Column)(implicit uencoder: TypedEncoder[U]) = {
    this(FramelessInternals.expr(column))
  }

  override def typed[W, U1: TypedEncoder](c: Column): TypedAggregate[W, U1] = c.typedAggregate

  override def lit[U1: TypedEncoder](c: U1): TypedAggregate[T, U1] = litAggr(c)
}

/** Generic representation of a typed column. A typed column can either be a [[TypedAggregate]] or
  * a [[frameless.TypedColumn]].
  *
  * Documentation marked "apache/spark" is thanks to apache/spark Contributors
  * at https://github.com/apache/spark, licensed under Apache v2.0 available at
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * @tparam T phantom type representing the dataset on which this column is
  *           selected. When `T = A with B` the selection is on either A or B.
  * @tparam U type of column
  */
abstract class AbstractTypedColumn[T, U]
  (val expr: Expression)
  (implicit val uencoder: TypedEncoder[U])
    extends UntypedExpression[T] { self =>

  /** The concrete column kind produced by the operations of this class. */
  type ThisType[A, B] <: AbstractTypedColumn[A, B]

  /** A helper class to simplify working with Optional fields.
    *
    * {{{
    * val x: TypedColumn[Option[Int]] = _
    * x.opt.map(_*2) // This only compiles if the type of x is Option[X] (in this example X is of type Int)
    * }}}
    *
    * @note Known issue: map() will NOT work when the applied function is a udf().
    *       It will compile and then throw a runtime error.
    */
  trait Mapper[X] {
    /** Applies `u` to this column viewed as a column of `X` and re-types the result as `Option[G]`.
      *
      * Both views are unchecked casts: the underlying expression is passed to `u`
      * and returned from it unchanged.
      */
    def map[G, OutputType[_,_]](u: ThisType[T, X] => OutputType[T,G])
      (implicit
        ev: OutputType[T,G] <:< AbstractTypedColumn[T, G]
      ): OutputType[T, Option[G]] = {
      u(self.asInstanceOf[ThisType[T, X]]).asInstanceOf[OutputType[T, Option[G]]]
    }
  }

  /** Makes it easier to work with Optional columns. It returns an instance of `Mapper[X]`
    * where `X` is the type of the unwrapped Optional. E.g., in the case of `Option[Long]`,
    * `X` is of type Long.
*
    * {{{
    * val x: TypedColumn[Option[Int]] = _
    * x.opt.map(_*2)
    * }}}
    *
    */
  def opt[X](implicit x: U <:< Option[X]): Mapper[X] = new Mapper[X] {}

  /** Fall back to an untyped Column */
  def untyped: Column = new Column(expr)

  /** Shared implementation of the equality operators.
    *
    * Uses the null-safe comparison (`EqualNullSafe`) when this column's encoder is nullable,
    * and the plain `EqualTo` otherwise.
    */
  private def equalsTo[TT, W](other: ThisType[TT, U])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] = typed {
    if (uencoder.nullable) EqualNullSafe(self.expr, other.expr)
    else EqualTo(self.expr, other.expr)
  }

  /** Creates a typed column of either TypedColumn or TypedAggregate from an expression. */
  protected def typed[W, U1: TypedEncoder](e: Expression): ThisType[W, U1] =
    typed(new Column(e))

  /** Creates a typed column of either TypedColumn or TypedAggregate. */
  def typed[W, U1: TypedEncoder](c: Column): ThisType[W, U1]

  /** Creates a typed column of either TypedColumn or TypedAggregate from the constant `c`. */
  def lit[U1: TypedEncoder](c: U1): ThisType[T, U1]

  /** Equality test against a constant.
    * {{{
    *   df.filter( df.col('a) === 1 )
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def ===(u: U): ThisType[T, Boolean] =
    equalsTo(lit(u))

  /** Equality test against another column.
    * {{{
    *   df.filter( df.col('a) === df.col('b) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def ===[TT, W](other: ThisType[TT, U])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    equalsTo(other)

  /** Inequality test against another column.
    *
    * {{{
    *   df.filter(df.col('a) =!= df.col('b))
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def =!=[TT, W](other: ThisType[TT, U])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(Not(equalsTo(other).expr))

  /** Inequality test against a constant.
    *
    * {{{
    *   df.filter(df.col('a) =!= "a")
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def =!=(u: U): ThisType[T, Boolean] = typed(Not(equalsTo(lit(u)).expr))

  /** True if the current expression is an Option and it's None.
    *
    * apache/spark
    */
  def isNone(implicit i0: U <:< Option[_]): ThisType[T, Boolean] =
    typed(IsNull(expr))

  /** True if the current expression is an Option and it's not None.
*
    * apache/spark
    */
  def isNotNone(implicit i0: U <:< Option[_]): ThisType[T, Boolean] =
    typed(IsNotNull(expr))

  /** True if the current expression is NaN.
    * Only available for column types with a `CatalystNaN` instance.
    *
    * apache/spark
    */
  def isNaN(implicit n: CatalystNaN[U]): ThisType[T, Boolean] =
    typed(self.untyped.isNaN)

  /**
    * True if the value for this optional column `exists` as expected
    * (see `Option.exists`); false when the result of `exists` is null.
    *
    * {{{
    *   df.col('opt).isSome(_ === someOtherCol)
    * }}}
    */
  def isSome[V](exists: ThisType[T, V] => ThisType[T, Boolean])(implicit i0: U <:< Option[V]): ThisType[T, Boolean] = someOr[V](exists, false)

  /**
    * True if the value for this optional column `exists` as expected,
    * or is `None`. (see `Option.forall`).
    *
    * {{{
    *   df.col('opt).isSomeOrNone(_ === someOtherCol)
    * }}}
    */
  def isSomeOrNone[V](exists: ThisType[T, V] => ThisType[T, Boolean])(implicit i0: U <:< Option[V]): ThisType[T, Boolean] = someOr[V](exists, true)

  /** Shared implementation of `isSome` and `isSomeOrNone`.
    *
    * Applies `exists` to the unwrapped column and coalesces the result with the
    * boolean literal for `default`, so `default` is returned whenever the predicate yields null.
    */
  private def someOr[V](exists: ThisType[T, V] => ThisType[T, Boolean], default: Boolean)(implicit i0: U <:< Option[V]): ThisType[T, Boolean] = {
    val defaultExpr = if (default) Literal.TrueLiteral else Literal.FalseLiteral

    typed(Coalesce(Seq(opt(i0).map(exists).expr, defaultExpr)))
  }

  /** Convert an Optional column by providing a default value.
    *
    * {{{
    *   df(df('opt).getOrElse(df('defaultValue)))
    * }}}
    *
    * @param default the column providing the value used when this column is null;
    *                its encoder is used for the result
    */
  def getOrElse[TT, W, Out](default: ThisType[TT, Out])(implicit i0: U =:= Option[Out], i1: With.Aux[T, TT, W]): ThisType[W, Out] =
    typed(Coalesce(Seq(expr, default.expr)))(default.uencoder)

  /** Convert an Optional column by providing a default value.
    *
    * {{{
    *   df( df('opt).getOrElse(defaultConstant) )
    * }}}
    *
    * @param default the constant used when this column is null
    */
  def getOrElse[Out: TypedEncoder](default: Out)(implicit i0: U =:= Option[Out]): ThisType[T, Out] =
    getOrElse(lit[Out](default))

  /** Sum of this expression and another expression.
    *
    * {{{
    *   // The following selects the sum of a person's height and weight.
*   people.select( people.col('height) plus people.col('weight) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def plus[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    typed(self.untyped.plus(other.untyped))

  /** Sum of this expression and another expression.
    * {{{
    *   // The following selects the sum of a person's height and weight.
    *   people.select( people.col('height) + people.col('weight) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def +[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    plus(other)

  /** Sum of this expression (column) with a constant.
    * {{{
    *   // The following adds 2 to a person's height.
    *   people.select( people('height) + 2 )
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def +(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] = typed(self.untyped.plus(u))

  /**
    * Inversion of boolean expression, i.e. NOT.
    * {{{
    *   // Select rows that are not active (isActive === false)
    *   df.filter( !df('isActive) )
    * }}}
    *
    * apache/spark
    */
  def unary_!(implicit i0: U <:< Boolean): ThisType[T, Boolean] = typed(!untyped)

  /** Unary minus, i.e. negate the expression.
    * {{{
    *   // Select the amount column and negates all values.
    *   df.select( -df('amount) )
    * }}}
    *
    * apache/spark
    */
  def unary_-(implicit n: CatalystNumeric[U]): ThisType[T, U] = typed(-self.untyped)

  /** Subtraction. Subtract the other expression from this expression.
    * {{{
    *   // The following selects the difference between people's height and their weight.
    *   people.select( people.col('height) minus people.col('weight) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def minus[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    typed(self.untyped.minus(other.untyped))

  /** Subtraction. Subtract the other expression from this expression.
    * {{{
    *   // The following selects the difference between people's height and their weight.
*   people.select( people.col('height) - people.col('weight) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def -[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    minus(other)

  /** Subtraction. Subtract a constant from this expression.
    * {{{
    *   // The following subtracts 1 from a person's height.
    *   people.select( people('height) - 1 )
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def -(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] = typed(self.untyped.minus(u))

  /** Multiplication of this expression and another expression.
    * {{{
    *   // The following multiplies a person's height by their weight.
    *   people.select( people.col('height) multiply people.col('weight) )
    * }}}
    *
    * When `U` is `BigDecimal`, both operands are cast to `DecimalType(20, 14)` before
    * being multiplied.
    *
    * @param other another column of the same type
    * apache/spark
    */
  def multiply[TT, W]
    (other: ThisType[TT, U])
    (implicit
      n: CatalystNumeric[U],
      w: With.Aux[T, TT, W],
      t: ClassTag[U]
    ): ThisType[W, U] = typed {
      // Compare against the class literal rather than allocating a BigDecimal just to ask for its class.
      if (t.runtimeClass == classOf[BigDecimal]) {
        // That's apparently the only way to get sound multiplication.
        // See https://issues.apache.org/jira/browse/SPARK-22036
        val dt = DecimalType(20, 14)
        self.untyped.cast(dt).multiply(other.untyped.cast(dt))
      } else {
        self.untyped.multiply(other.untyped)
      }
    }

  /** Multiplication of this expression and another expression.
    * {{{
    *   // The following multiplies a person's height by their weight.
    *   people.select( people.col('height) * people.col('weight) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def *[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W], t: ClassTag[U]): ThisType[W, U] =
    multiply(other)

  /** Multiplication of this expression by a constant.
    * {{{
    *   // The following multiplies a person's height by 2.
    *   people.select( people.col('height) * 2 )
    * }}}
    *
    * Unlike the column overloads, this one applies no `DecimalType` cast for `BigDecimal`.
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def *(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] = typed(self.untyped.multiply(u))

  /** Modulo (a.k.a. remainder) expression.
*
    * @param other another column of the same type
    * apache/spark
    */
  def mod[Out: TypedEncoder, TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, Out] =
    typed(self.untyped.mod(other.untyped))

  /** Modulo (a.k.a. remainder) expression.
    *
    * @param other another column of the same type
    * apache/spark
    */
  def %[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    mod(other)

  /** Modulo (a.k.a. remainder) expression.
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def %(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] = typed(self.untyped.mod(u))

  /** Division of this expression by another expression.
    * {{{
    *   // The following divides a person's height by their weight.
    *   people.select( people('height) / people('weight) )
    * }}}
    *
    * @param other another column of the same type
    * @tparam Out the result type, determined by the `CatalystDivisible[U, Out]` instance
    * apache/spark
    */
  def divide[Out: TypedEncoder, TT, W](other: ThisType[TT, U])(implicit n: CatalystDivisible[U, Out], w: With.Aux[T, TT, W]): ThisType[W, Out] =
    typed(self.untyped.divide(other.untyped))

  /** Division of this expression by another expression.
    * {{{
    *   // The following divides a person's height by their weight.
    *   people.select( people('height) / people('weight) )
    * }}}
    *
    * @param other another column of the same type
    * @tparam Out the result type, determined by the `CatalystDivisible[U, Out]` instance
    * apache/spark
    */
  def /[Out, TT, W](other: ThisType[TT, U])(implicit n: CatalystDivisible[U, Out], e: TypedEncoder[Out], w: With.Aux[T, TT, W]): ThisType[W, Out] =
    divide(other)

  /** Division of this expression by a constant.
    * {{{
    *   // The following divides a person's height by 2.
*   people.select( people('height) / 2 )
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def /(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, Double] = typed(self.untyped.divide(u))

  /** Returns a descending ordering used in sorting
    *
    * apache/spark
    */
  def desc(implicit catalystOrdered: CatalystOrdered[U]): SortedTypedColumn[T, U] =
    new SortedTypedColumn[T, U](untyped.desc)

  /** Returns an ascending ordering used in sorting
    *
    * apache/spark
    */
  def asc(implicit catalystOrdered: CatalystOrdered[U]): SortedTypedColumn[T, U] =
    new SortedTypedColumn[T, U](untyped.asc)

  /** Bitwise AND this expression and a constant.
    * {{{
    *   df.select(df.col('colA).cast[Int].bitwiseAND(-1))
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def bitwiseAND(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
    typed(self.untyped.bitwiseAND(u))

  /** Bitwise AND this expression and another expression.
    * {{{
    *   df.select(df.col('colA) bitwiseAND (df.col('colB)))
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def bitwiseAND[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    typed(self.untyped.bitwiseAND(other.untyped))

  /** Bitwise AND this expression and a constant (of same type).
    * {{{
    *   df.select(df.col('colA).cast[Int] & -1)
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def &(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] = bitwiseAND(u)

  /** Bitwise AND this expression and another expression.
    * {{{
    *   df.select(df.col('colA) & (df.col('colB)))
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def &[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    bitwiseAND(other)

  /** Bitwise OR this expression and another expression.
* {{{
    *   df.select(df.col('colA) bitwiseOR (df.col('colB)))
    * }}}
    *
    * This overload takes a constant; the example above resolves to the column overload.
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def bitwiseOR(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
    typed(self.untyped.bitwiseOR(u))

  /** Bitwise OR this expression and another expression.
    * {{{
    *   df.select(df.col('colA) bitwiseOR (df.col('colB)))
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def bitwiseOR[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    typed(self.untyped.bitwiseOR(other.untyped))

  /** Bitwise OR this expression and a constant (of same type).
    * {{{
    *   df.select(df.col('colA).cast[Long] | 1L)
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def |(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] = bitwiseOR(u)

  /** Bitwise OR this expression and another expression.
    * {{{
    *   df.select(df.col('colA) | (df.col('colB)))
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def |[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    bitwiseOR(other)

  /** Bitwise XOR this expression and a constant.
    * {{{
    *   df.select(df.col('colA).cast[Long].bitwiseXOR(1L))
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def bitwiseXOR(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
    typed(self.untyped.bitwiseXOR(u))

  /** Bitwise XOR this expression and another expression.
    * {{{
    *   df.select(df.col('colA) bitwiseXOR (df.col('colB)))
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def bitwiseXOR[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    typed(self.untyped.bitwiseXOR(other.untyped))

  /** Bitwise XOR this expression and another expression (of same type).
* {{{
    *   df.select(df.col('colA).cast[Long] ^ 1L)
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def ^(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] = bitwiseXOR(u)

  /** Bitwise XOR this expression and another expression.
    * {{{
    *   df.select(df.col('colA) ^ (df.col('colB)))
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def ^[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    bitwiseXOR(other)

  /** Casts the column to a different type.
    * {{{
    *   df.select(df('a).cast[Int])
    * }}}
    *
    * The target Catalyst type is the `catalystRepr` of the encoder of `A`; the cast is only
    * available when a `CatalystCast[U, A]` instance exists.
    */
  def cast[A: TypedEncoder](implicit c: CatalystCast[U, A]): ThisType[T, A] =
    typed(self.untyped.cast(TypedEncoder[A].catalystRepr))

  /**
    * An expression that returns a substring
    * {{{
    *   df.select(df('a).substr(0, 5))
    * }}}
    *
    * @param startPos starting position
    * @param len length of the substring
    */
  def substr(startPos: Int, len: Int)(implicit ev: U =:= String): ThisType[T, String] =
    typed(self.untyped.substr(startPos, len))

  /**
    * An expression that returns a substring
    * {{{
    *   df.select(df('a).substr(df('b), df('c)))
    * }}}
    *
    * @param startPos expression for the starting position
    * @param len expression for the length of the substring
    */
  def substr[TT1, TT2, W1, W2](startPos: ThisType[TT1, Int], len: ThisType[TT2, Int])
                              (implicit
                               ev: U =:= String,
                               w1: With.Aux[T, TT1, W1],
                               w2: With.Aux[W1, TT2, W2]): ThisType[W2, String] =
    typed(self.untyped.substr(startPos.untyped, len.untyped))

  /** SQL like expression. Returns a boolean column based on a SQL LIKE match.
    * {{{
    *   val ds = TypedDataset.create(X2("foo", "bar") :: Nil)
    *   // true
    *   ds.select(ds('a).like("foo"))
    *
    *   // Selected column has value "bar"
    *   ds.select(when(ds('a).like("f"), ds('a)).otherwise(ds('b)))
    * }}}
    *
    * @param literal the SQL LIKE pattern
    * apache/spark
    */
  def like(literal: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
    typed(self.untyped.like(literal))

  /** SQL RLIKE expression (LIKE with Regex). Returns a boolean column based on a regex match.
* {{{
    *   val ds = TypedDataset.create(X1("foo") :: Nil)
    *   // true
    *   ds.select(ds('a).rlike("foo"))
    *
    *   // true
    *   ds.select(ds('a).rlike(".*"))
    * }}}
    *
    * @param literal the regular expression
    * apache/spark
    */
  def rlike(literal: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
    typed(self.untyped.rlike(literal))

  /** String contains another string literal.
    * {{{
    *   df.filter ( df.col('a).contains("foo") )
    * }}}
    *
    * @param other a string that is being tested against.
    * apache/spark
    */
  def contains(other: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
    typed(self.untyped.contains(other))

  /** String contains.
    * {{{
    *   df.filter ( df.col('a).contains(df.col('b)) )
    * }}}
    *
    * @param other a column whose values are used as the string that is being tested against.
    * apache/spark
    */
  def contains[TT, W](other: ThisType[TT, U])(implicit ev: U =:= String, w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped.contains(other.untyped))

  /** String starts with another string literal.
    * {{{
    *   df.filter ( df.col('a).startsWith("foo") )
    * }}}
    *
    * @param other a prefix that is being tested against.
    * apache/spark
    */
  def startsWith(other: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
    typed(self.untyped.startsWith(other))

  /** String starts with.
    * {{{
    *   df.filter ( df.col('a).startsWith(df.col('b)) )
    * }}}
    *
    * @param other a column whose values are used as the prefix that is being tested against.
    * apache/spark
    */
  def startsWith[TT, W](other: ThisType[TT, U])(implicit ev: U =:= String, w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped.startsWith(other.untyped))

  /** String ends with another string literal.
    * {{{
    *   df.filter ( df.col('a).endsWith("foo") )
    * }}}
    *
    * @param other a suffix that is being tested against.
    * apache/spark
    */
  def endsWith(other: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
    typed(self.untyped.endsWith(other))

  /** String ends with.
* {{{
    *   df.filter ( df.col('a).endsWith(df.col('b)) )
    * }}}
    *
    * @param other a column whose values are used as the suffix that is being tested against.
    * apache/spark
    */
  def endsWith[TT, W](other: ThisType[TT, U])(implicit ev: U =:= String, w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped.endsWith(other.untyped))

  /** Boolean AND.
    * {{{
    *   df.filter ( (df.col('a) === 1).and(df.col('b) > 5) )
    * }}}
    *
    * @param other another boolean column
    */
  def and[TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped.and(other.untyped))

  /** Boolean AND.
    * {{{
    *   df.filter ( df.col('a) === 1 && df.col('b) > 5)
    * }}}
    *
    * @param other another boolean column
    */
  def && [TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    and(other)

  /** Boolean OR.
    * {{{
    *   df.filter ( (df.col('a) === 1).or(df.col('b) > 5) )
    * }}}
    *
    * @param other another boolean column
    */
  def or[TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped.or(other.untyped))

  /** Boolean OR.
    * {{{
    *   df.filter ( df.col('a) === 1 || df.col('b) > 5)
    * }}}
    *
    * @param other another boolean column
    */
  def || [TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    or(other)

  /** Less than.
    *
    * {{{
    *   // The following selects people younger than the maxAge column.
    *   df.select( df('age) < df('maxAge) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def <[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped < other.untyped)

  /** Less than or equal to.
    *
    * {{{
    *   // The following selects people younger or equal than the maxAge column.
    *   df.select( df('age) <= df('maxAge) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def <=[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped <= other.untyped)

  /** Greater than.
    * {{{
    *   // The following selects people older than the maxAge column.
*   df.select( df('age) > df('maxAge) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def >[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped > other.untyped)

  /** Greater than or equal.
    * {{{
    *   // The following selects people older or equal than the maxAge column.
    *   df.select( df('age) >= df('maxAge) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def >=[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped >= other.untyped)

  /** Less than.
    * {{{
    *   // The following selects people younger than 21.
    *   df.select( df('age) < 21 )
    * }}}
    *
    * The constant is turned into a literal column using this column's encoder.
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def <(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
    typed(self.untyped < lit(u)(self.uencoder).untyped)

  /** Less than or equal to.
    * {{{
    *   // The following selects people younger than 22.
    *   df.select( df('age) <= 21 )
    * }}}
    *
    * The constant is turned into a literal column using this column's encoder.
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def <=(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
    typed(self.untyped <= lit(u)(self.uencoder).untyped)

  /** Greater than.
    * {{{
    *   // The following selects people older than 21.
    *   df.select( df('age) > 21 )
    * }}}
    *
    * The constant is turned into a literal column using this column's encoder.
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def >(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
    typed(self.untyped > lit(u)(self.uencoder).untyped)

  /** Greater than or equal.
    * {{{
    *   // The following selects people older than 20.
    *   df.select( df('age) >= 21 )
    * }}}
    *
    * The constant is turned into a literal column using this column's encoder.
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def >=(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
    typed(self.untyped >= lit(u)(self.uencoder).untyped)

  /**
    * Returns true if the value of this column is contained in one of the arguments.
    * {{{
    *   // The following selects people with age 15, 20, or 30.
*   df.select( df('age).isin(15, 20, 30) )
    * }}}
    *
    * @param values are constants of the same type
    * apache/spark
    */
  def isin(values: U*)(implicit e: CatalystIsin[U]): ThisType[T, Boolean] =
    typed(self.untyped.isin(values:_*))

  /**
    * True if the current column is between the lower bound and upper bound, inclusive.
    *
    * Both bounds are turned into literal columns using this column's encoder.
    *
    * @param lowerBound a constant of the same type
    * @param upperBound a constant of the same type
    * apache/spark
    */
  def between(lowerBound: U, upperBound: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
    typed(self.untyped.between(lit(lowerBound)(self.uencoder).untyped, lit(upperBound)(self.uencoder).untyped))

  /**
    * True if the current column is between the lower bound and upper bound, inclusive.
    *
    * @param lowerBound another column of the same type
    * @param upperBound another column of the same type
    * apache/spark
    */
  def between[TT1, TT2, W1, W2](lowerBound: ThisType[TT1, U], upperBound: ThisType[TT2, U])
    (implicit
      i0: CatalystOrdered[U],
      w0: With.Aux[T, TT1, W1],
      w1: With.Aux[TT2, W1, W2]
    ): ThisType[W2, Boolean] =
      typed(self.untyped.between(lowerBound.untyped, upperBound.untyped))

  /**
    * Returns a nested column matching the field `symbol`.
*
    * The field is looked up by the name of `symbol`; `TypedColumn.Exists` is the
    * compile-time evidence that `U` has such a field of type `V`.
    *
    * @param symbol the field symbol
    * @tparam V the type of the nested field
    */
  def field[V](symbol: Witness.Lt[Symbol])(implicit
    i0: TypedColumn.Exists[U, symbol.T, V],
    i1: TypedEncoder[V]
  ): ThisType[T, V] =
    typed(self.untyped.getField(symbol.value.name))
}

/** A column expression carrying a sort order, as produced by `asc` and `desc`.
  *
  * @tparam T phantom type of the dataset the column refers to
  * @tparam U type of column
  */
sealed class SortedTypedColumn[T, U](val expr: Expression)(
  implicit
  val uencoder: TypedEncoder[U]
) extends UntypedExpression[T] {

  /** Builds the sorted column from an untyped Spark `Column`. */
  def this(column: Column)(implicit e: TypedEncoder[U]) = {
    this(FramelessInternals.expr(column))
  }

  /** Fall back to an untyped Column */
  def untyped: Column = new Column(expr)
}

object SortedTypedColumn {
  /** Implicitly treats a plain [[TypedColumn]] as sorted in ascending order. */
  implicit def defaultAscending[T, U : CatalystOrdered](typedColumn: TypedColumn[T, U]): SortedTypedColumn[T, U] =
    new SortedTypedColumn[T, U](typedColumn.untyped.asc)(typedColumn.uencoder)

  /** Polymorphic function turning typed columns into ascending sorted columns
    * and leaving already sorted columns untouched.
    */
  object defaultAscendingPoly extends Poly1 {
    implicit def caseTypedColumn[T, U : CatalystOrdered] = at[TypedColumn[T, U]](c => defaultAscending(c))
    implicit def caseTypeSortedColumn[T, U] = at[SortedTypedColumn[T, U]](identity)
  }
}

object TypedColumn {
  /** Evidence that type `T` has column `K` with type `V`.
*/
  @implicitNotFound(msg = "No column ${K} of type ${V} in ${T}")
  trait Exists[T, K, V]

  /** Evidence that following the path of columns `K` from `T` ends on a value of type `V`. */
  @implicitNotFound(msg = "No columns ${K} of type ${V} in ${T}")
  trait ExistsMany[T, K <: HList, V]

  object ExistsMany {
    /** Inductive step: column `KH` exists in `T` with type `V0`, and the rest of the path resolves from `V0`. */
    implicit def deriveCons[T, KH, KT <: HList, V0, V1]
      (implicit
        head: Exists[T, KH, V0],
        tail: ExistsMany[V0, KT, V1]
      ): ExistsMany[T, KH :: KT, V1] =
        new ExistsMany[T, KH :: KT, V1] {}

    /** Base case: a path made of a single column. */
    implicit def deriveHNil[T, K, V](implicit head: Exists[T, K, V]): ExistsMany[T, K :: HNil, V] =
      new ExistsMany[T, K :: HNil, V] {}
  }

  object Exists {
    /** Summons the evidence for the column designated by `column`. */
    def apply[T, V](column: Witness)(implicit e: Exists[T, column.T, V]): Exists[T, column.T, V] = e

    /** Derives the evidence from the labelled generic representation of `T`. */
    implicit def deriveRecord[T, H <: HList, K, V]
      (implicit
        i0: LabelledGeneric.Aux[T, H],
        i1: Selector.Aux[H, K, V]
      ): Exists[T, K, V] = new Exists[T, K, V] {}
  }

  /**
    * {{{
    * import frameless.TypedColumn
    *
    * case class Foo(id: Int, bar: String)
    *
    * val colbar: TypedColumn[Foo, String] = TypedColumn { foo: Foo => foo.bar }
    * val colid = TypedColumn[Foo, Int](_.id)
    * }}}
    */
  def apply[T, U](x: T => U): TypedColumn[T, U] =
    macro TypedColumnMacroImpl.applyImpl[T, U]
}

================================================
FILE: dataset/src/main/scala/frameless/TypedColumnMacroImpl.scala
================================================
package frameless

import scala.reflect.macros.whitebox

private[frameless] object TypedColumnMacroImpl {

  /** Macro implementation turning a field-selecting lambda (`_.a.b`) into a
    * `TypedColumn[T, U]` built on `col("a.b")`.
    *
    * Compilation is aborted when `x` is not a function literal, when its body
    * is not a chain of term selections rooted at the lambda's own parameter,
    * or when one of the selected members is not a stable term.
    *
    * @param c the macro context
    * @param x the tree of the lambda passed by the caller
    */
  def applyImpl[T: c.WeakTypeTag, U: c.WeakTypeTag](c: whitebox.Context)(x: c.Tree): c.Expr[TypedColumn[T, U]] = {
    import c.universe._

    val t = c.weakTypeOf[T]
    val u = c.weakTypeOf[U]

    // Emits `new TypedColumn[T, U](col("<dotted path>").expr)`.
    def buildExpression(path: List[String]): c.Expr[TypedColumn[T, U]] = {
      val columnName = path.mkString(".")

      c.Expr[TypedColumn[T, U]](q"new _root_.frameless.TypedColumn[$t, $u]((org.apache.spark.sql.functions.col($columnName)).expr)")
    }

    def abort(msg: String) = c.abort(c.enclosingPosition, msg)

    // Flattens nested selections `root.a.b` into `List(root, a, b)`.
    @annotation.tailrec
    def path(in: Select, out: List[TermName]): List[TermName] =
      in.qualifier match {
        case sub: Select =>
          path(sub, in.name.toTermName :: out)

        case id: Ident =>
          id.name.toTermName :: in.name.toTermName :: out

        case u =>
          abort(s"Unsupported selection: $u")
      }

    // Walks the member path from `current`, requiring every step to be a stable term.
    @annotation.tailrec
    def check(current: Type, in: List[TermName]): Boolean = in match {
      case next :: tail => {
        val sym = current.decl(next).asTerm

        if (!sym.isStable) {
          abort(s"Stable term expected: ${current}.${next}")
        }

        check(sym.info, tail)
      }

      case _ =>
        true
    }

    x match {
      case fn: Function => fn.body match {
        case select: Select if select.name.isTermName =>
          val param: ValDef = fn.vparams match {
            case List(rt) => rt
            case u => abort(s"Select expression must have a single parameter: ${u mkString ", "}")
          }

          path(select, List.empty) match {
            // The root of the selection must be the lambda parameter itself.
            // The previous check compared a `String` with a `TermName` and, for
            // lambda parameters, never ran at all, so selections rooted at any
            // other identifier were silently accepted.
            case root :: tail if root == param.name && check(t, tail) =>
              buildExpression(tail.map(_.toString))

            case _ =>
              abort(s"Invalid select expression: $select")
          }

        case other =>
          abort(s"Select expression expected: $other")
      }

      case _ =>
        abort(s"Function expected: $x")
    }
  }
}

================================================
FILE: dataset/src/main/scala/frameless/TypedDataset.scala
================================================
package frameless

import java.util

import frameless.functions.CatalystExplodableCollection
import frameless.ops._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Column, DataFrame, Dataset, FramelessInternals, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Literal}
import org.apache.spark.sql.catalyst.plans.logical.{Join, JoinHint}
import org.apache.spark.sql.catalyst.plans.Inner
import org.apache.spark.sql.types.StructType
import shapeless._
import shapeless.labelled.FieldType
import shapeless.ops.hlist.{Diff, IsHCons, Mapper, Prepend, ToTraversable, Tupler}
import shapeless.ops.record.{Keys, Modifier, Remover, Values}
import
scala.language.experimental.macros

/** [[TypedDataset]] is a safer interface for working with `Dataset`.
  *
  * NOTE: Prefer `TypedDataset.create` over `new TypedDataset` unless you
  * know what you are doing.
  *
  * Documentation marked "apache/spark" is thanks to apache/spark Contributors
  * at https://github.com/apache/spark, licensed under Apache v2.0 available at
  * http://www.apache.org/licenses/LICENSE-2.0
  */
class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val encoder: TypedEncoder[T])
    extends TypedDatasetForwarded[T] { self =>

  private implicit val spark: SparkSession = dataset.sparkSession

  /** Aggregates on the entire Dataset without groups.
    *
    * apache/spark
    */
  def agg[A](ca: TypedAggregate[T, A]): TypedDataset[A] = {
    implicit val encA: TypedEncoder[A] = ca.uencoder

    // `aggMany` always produces a tuple, so a single aggregate comes back as `Tuple1[A]`.
    val wrapped: TypedDataset[Tuple1[A]] = aggMany(ca)

    // Unpack `Tuple1[A]` to `A`.
    val unwrapped = TypedEncoder[A].catalystRepr match {
      case StructType(_) =>
        // a struct column is replaced by all of its fields
        wrapped.dataset.selectExpr("_1.*").as[A](TypedExpressionEncoder[A])
      case _ =>
        // for primitive types `Tuple1[A]` has the same schema as `A`
        wrapped.dataset.as[A](TypedExpressionEncoder[A])
    }

    TypedDataset.create(unwrapped)
  }

  /** Aggregates on the entire Dataset without groups.
    *
    * apache/spark
    */
  def agg[A, B](
    ca: TypedAggregate[T, A],
    cb: TypedAggregate[T, B]
  ): TypedDataset[(A, B)] = {
    implicit val encA: TypedEncoder[A] = ca.uencoder
    implicit val encB: TypedEncoder[B] = cb.uencoder

    aggMany(ca, cb)
  }

  /** Aggregates on the entire Dataset without groups.
    *
    * apache/spark
    */
  def agg[A, B, C](
    ca: TypedAggregate[T, A],
    cb: TypedAggregate[T, B],
    cc: TypedAggregate[T, C]
  ): TypedDataset[(A, B, C)] = {
    implicit val encA: TypedEncoder[A] = ca.uencoder
    implicit val encB: TypedEncoder[B] = cb.uencoder
    implicit val encC: TypedEncoder[C] = cc.uencoder

    aggMany(ca, cb, cc)
  }

  /** Aggregates on the entire Dataset without groups.
*
    * apache/spark
    */
  def agg[A, B, C, D](
    ca: TypedAggregate[T, A],
    cb: TypedAggregate[T, B],
    cc: TypedAggregate[T, C],
    cd: TypedAggregate[T, D]
  ): TypedDataset[(A, B, C, D)] = {
    implicit val encA: TypedEncoder[A] = ca.uencoder
    implicit val encB: TypedEncoder[B] = cb.uencoder
    implicit val encC: TypedEncoder[C] = cc.uencoder
    implicit val encD: TypedEncoder[D] = cd.uencoder

    aggMany(ca, cb, cc, cd)
  }

  /** Aggregates on the entire Dataset without groups.
    *
    * apache/spark
    */
  object aggMany extends ProductArgs {
    def applyProduct[U <: HList, Out0 <: HList, Out](columns: U)
      (implicit
        i0: AggregateTypes.Aux[T, U, Out0],
        i1: ToTraversable.Aux[U, List, UntypedExpression[T]],
        i2: Tupler.Aux[Out0, Out],
        i3: TypedEncoder[Out]
      ): TypedDataset[Out] = {
        // Each aggregate paired with its position; positions become the tuple field names `_1`, `_2`, ...
        val indexed = columns.toList[UntypedExpression[T]].zipWithIndex

        val cols: Seq[Column] = indexed.map { case (c, i) => new Column(c.expr).as(s"_${i+1}") }

        // Workaround to SPARK-20346. One alternative is to allow the result to be Vector(null) for empty DataFrames.
        // Another one would be to return an Option.
        val filterStr = indexed
          .collect { case (c, i) if !c.uencoder.nullable => s"_${i+1} is not null" }
          .mkString(" or ")

        val selected = dataset.toDF().agg(cols.head, cols.tail:_*).as[Out](TypedExpressionEncoder[Out])

        TypedDataset.create[Out](if (filterStr.isEmpty) selected else selected.filter(filterStr))
      }
  }

  /** Returns a new [[TypedDataset]] where each record has been mapped on to the specified type. */
  def as[U]()(implicit as: As[T, U]): TypedDataset[U] = {
    implicit val targetEncoder: TypedEncoder[U] = as.encoder

    val converted = dataset.as[U](TypedExpressionEncoder[U])
    TypedDataset.create(converted)
  }

  /** Returns a checkpointed version of this [[TypedDataset]]. Checkpointing can be used to truncate the
    * logical plan of this Dataset, which is especially useful in iterative algorithms where the
    * plan may grow exponentially. It will be saved to files inside the checkpoint
    * directory set with `SparkContext#setCheckpointDir`.
    *
    * Differs from `Dataset#checkpoint` by wrapping its result into an effect-suspending `F[_]`.
*
    * apache/spark
    */
  def checkpoint[F[_]](eager: Boolean)(implicit F: SparkDelay[F]): F[TypedDataset[T]] =
    F.delay {
      val checkpointed = dataset.checkpoint(eager)
      TypedDataset.create[T](checkpointed)
    }

  /** Returns a new [[TypedDataset]] where each record has been mapped on to the specified type.
    * Unlike `as` the projection U may include a subset of the columns of T and the column names and types must agree.
    *
    * {{{
    *   case class Foo(i: Int, j: String)
    *   case class Bar(j: String)
    *
    *   val t: TypedDataset[Foo] = ...
    *   val b: TypedDataset[Bar] = t.project[Bar]
    *
    *   case class BarErr(e: String)
    *   // The following does not compile because `Foo` doesn't have a field with name `e`
    *   val e: TypedDataset[BarErr] = t.project[BarErr]
    * }}}
    */
  def project[U](implicit projector: SmartProject[T,U]): TypedDataset[U] =
    projector(this)

  /** Returns a new [[TypedDataset]] that contains the elements of both this and the `other` [[TypedDataset]]
    * combined.
    *
    * Note that, this function is not a typical set union operation, in that it does not eliminate
    * duplicate items. As such, it is analogous to `UNION ALL` in SQL.
    *
    * Differs from `Dataset#union` by aligning fields if possible.
    * It will not compile if the `Datasets` do not have compatible schemas.
    *
    * Example:
    * {{{
    *   case class Foo(x: Int, y: Long)
    *   case class Bar(y: Long, x: Int)
    *   case class Faz(x: Int, y: Int, z: Int)
    *
    *   foo: TypedDataset[Foo] = ...
    *   bar: TypedDataset[Bar] = ...
    *   faz: TypedDataset[Faz] = ...
    *
    *   foo union bar: TypedDataset[Foo]
    *   foo union faz: TypedDataset[Foo]
    *   // won't compile, you need to reverse order, you can't project from less fields to more
    *   faz union foo
    *
    * }}}
    *
    * apache/spark
    */
  def union[U: TypedEncoder](other: TypedDataset[U])(implicit projector: SmartProject[U, T]): TypedDataset[T] = {
    // Align `other` on this dataset's schema before the positional union.
    val aligned = other.project[T]
    TypedDataset.create(dataset.union(aligned.dataset))
  }

  /** Returns a new [[TypedDataset]] that contains the elements of both this and the `other` [[TypedDataset]]
    * combined.
*
    * Note that, this function is not a typical set union operation, in that it does not eliminate
    * duplicate items. As such, it is analogous to `UNION ALL` in SQL.
    *
    * apache/spark
    */
  def union(other: TypedDataset[T]): TypedDataset[T] = {
    val combined = dataset.union(other.dataset)
    TypedDataset.create(combined)
  }

  /** Returns the number of elements in the [[TypedDataset]].
    *
    * Differs from `Dataset#count` by wrapping its result into an effect-suspending `F[_]`.
    */
  def count[F[_]]()(implicit F: SparkDelay[F]): F[Long] =
    F.delay { dataset.count() }

  /** Returns `TypedColumn` of type `A` given its name (alias for `col`).
    *
    * {{{
    * tf('id)
    * }}}
    *
    * It is statically checked that column with such name exists and has type `A`.
    */
  def apply[A](column: Witness.Lt[Symbol])
    (implicit
      i0: TypedColumn.Exists[T, column.T, A],
      i1: TypedEncoder[A]
    ): TypedColumn[T, A] = col(column)

  /** Returns `TypedColumn` of type `A` given its name.
    *
    * {{{
    * tf.col('id)
    * }}}
    *
    * It is statically checked that column with such name exists and has type `A`.
    */
  def col[A](column: Witness.Lt[Symbol])
    (implicit
      i0: TypedColumn.Exists[T, column.T, A],
      i1: TypedEncoder[A]
    ): TypedColumn[T, A] = {
      // Look the column up by name on the underlying dataset, then attach the encoder of `A`.
      val named = dataset(column.value.name).as[A](TypedExpressionEncoder[A])
      new TypedColumn[T, A](named)
    }

  /** Returns `TypedColumn` of type `A` given a lambda indicating the field.
    *
    * {{{
    * td.col(_.id)
    * }}}
    *
    * It is statically checked that column with such name exists and has type `A`.
    */
  def col[A](x: Function1[T, A]): TypedColumn[T, A] =
    macro TypedColumnMacroImpl.applyImpl[T, A]

  /** Projects the entire `TypedDataset[T]` into a single column of type `TypedColumn[T,T]`.
    * {{{
    *   ts: TypedDataset[Foo] = ...
*   ts.select(ts.asCol, ts.asCol): TypedDataset[(Foo,Foo)]
    * }}}
    */
  def asCol: TypedColumn[T, T] = {
    val wholeRow: Column = encoder.catalystRepr match {
      case StructType(_) =>
        // a struct-encoded `T` is rebuilt from every column of the dataset
        val fields: Array[Column] = dataset.columns.map(name => dataset.col(name))
        org.apache.spark.sql.functions.struct(fields.toSeq: _*)
      case _ =>
        // otherwise the first column is used as-is
        dataset.col(dataset.columns.head)
    }

    new TypedColumn[T,T](wholeRow)
  }

  /** References the entire `TypedDataset[T]` as a single column
    * of type `TypedColumn[T,T]` so it can be used in a join operation.
    *
    * {{{
    *   def nameJoin(ds1: TypedDataset[Person], ds2: TypedDataset[Name]) =
    *     ds1.joinLeftSemi(ds2)(ds1.col('name) === ds2.asJoinColValue)
    * }}}
    */
  def asJoinColValue(implicit i0: IsValueClass[T]): TypedColumn[T, T] = {
    import _root_.frameless.syntax._

    dataset.col("value").typedColumn
  }

  /** Returns the `TypedColumn` reached by following a path of nested field names,
    * statically checked through `TypedColumn.ExistsMany`.
    */
  object colMany extends SingletonProductArgs {
    def applyProduct[U <: HList, Out](columns: U)
      (implicit
        i0: TypedColumn.ExistsMany[T, U, Out],
        i1: TypedEncoder[Out],
        i2: ToTraversable.Aux[U, List, Symbol]
      ): TypedColumn[T, Out] = {
        val fieldPath = columns.toList[Symbol].map(symbol => symbol.name)
        new TypedColumn[T, Out](FramelessInternals.resolveExpr(dataset, fieldPath))
      }
  }

  /** Right hand side disambiguation of `col` for join expressions.
    * To be used when writing self-joins, noop in other circumstances.
    *
    * Note: In vanilla Spark, disambiguation in self-joins is achieved using
    * String based aliases, which is obviously unsafe.
    */
  def colRight[A](column: Witness.Lt[Symbol])
    (implicit
      i0: TypedColumn.Exists[T, column.T, A],
      i1: TypedEncoder[A]
    ): TypedColumn[T, A] = {
      val tagged = FramelessInternals.DisambiguateRight(col(column).expr)
      new TypedColumn[T, A](tagged)
    }

  /** Left hand side disambiguation of `col` for join expressions.
    * To be used when writing self-joins, noop in other circumstances.
    *
    * Note: In vanilla Spark, disambiguation in self-joins is achieved using
    * String based aliases, which is obviously unsafe.
*/
  def colLeft[A](column: Witness.Lt[Symbol])
    (implicit
      i0: TypedColumn.Exists[T, column.T, A],
      i1: TypedEncoder[A]
    ): TypedColumn[T, A] = {
      val tagged = FramelessInternals.DisambiguateLeft(col(column).expr)
      new TypedColumn[T, A](tagged)
    }

  /** Returns a `Seq` that contains all the elements in this [[TypedDataset]].
    *
    * Running this operation requires moving all the data into the application's driver process, and
    * doing so on a very large [[TypedDataset]] can crash the driver process with OutOfMemoryError.
    *
    * Differs from `Dataset#collect` by wrapping its result into an effect-suspending `F[_]`.
    */
  def collect[F[_]]()(implicit F: SparkDelay[F]): F[Seq[T]] =
    F.delay { dataset.collect().toSeq }

  /** Optionally returns the first element in this [[TypedDataset]].
    *
    * Differs from `Dataset#first` by wrapping its result into an `Option` and an effect-suspending `F[_]`.
    */
  def firstOption[F[_]]()(implicit F: SparkDelay[F]): F[Option[T]] =
    F.delay {
      // `Option(...)` maps a `null` first element to `None`; the catch clause
      // maps the "no element" exception to `None` as well.
      try Option(dataset.first())
      catch { case _: NoSuchElementException => None }
    }

  /** Returns the first `num` elements of this [[TypedDataset]] as a `Seq`.
    *
    * Running take requires moving data into the application's driver process, and doing so with
    * a very large `num` can crash the driver process with OutOfMemoryError.
    *
    * Differs from `Dataset#take` by wrapping its result into an effect-suspending `F[_]`.
    *
    * apache/spark
    */
  def take[F[_]](num: Int)(implicit F: SparkDelay[F]): F[Seq[T]] =
    F.delay { dataset.take(num).toSeq }

  /** Return an iterator that contains all rows in this [[TypedDataset]].
    *
    * The iterator will consume as much memory as the largest partition in this [[TypedDataset]].
    *
    * NOTE: this results in multiple Spark jobs, and if the input [[TypedDataset]] is the result
    * of a wide transformation (e.g. join with different partitioners), to avoid
    * recomputing the input [[TypedDataset]] should be cached first.
    *
    * Differs from `Dataset#toLocalIterator()` by wrapping its result into an effect-suspending `F[_]`.
*
    * apache/spark
    */
  def toLocalIterator[F[_]]()(implicit F: SparkDelay[F]): F[util.Iterator[T]] =
    F.delay(dataset.toLocalIterator())

  /** Alias for firstOption().
    */
  def headOption[F[_]]()(implicit F: SparkDelay[F]): F[Option[T]] = firstOption()

  /** Alias for take().
    */
  def head[F[_]](num: Int)(implicit F: SparkDelay[F]): F[Seq[T]] = take(num)

  // $COVERAGE-OFF$
  /** Returns the result of `Dataset#head()` directly: no `Option`, no effect wrapper.
    * Unlike `firstOption()`, this may throw (see the deprecation message).
    */
  @deprecated("Method may throw exception. Use headOption or firstOption instead.", "0.5.0")
  def head: T = dataset.head()

  /** Returns the result of `Dataset#head()` directly: no `Option`, no effect wrapper.
    * Unlike `firstOption()`, this may throw (see the deprecation message).
    */
  @deprecated("Method may throw exception. Use headOption or firstOption instead.", "0.5.0")
  def first: T = dataset.head()
  // $COVERAGE-ON$

  /** Displays the content of this [[TypedDataset]] in a tabular form. Strings more than 20 characters
    * will be truncated, and all cells will be aligned right. For example:
    * {{{
    *   year  month AVG('Adj Close) MAX('Adj Close)
    *   1980  12    0.503218        0.595103
    *   1981  01    0.523289        0.570307
    *   1982  02    0.436504        0.475256
    *   1983  03    0.410516        0.442194
    *   1984  04    0.450090        0.483521
    * }}}
    * @param numRows Number of rows to show
    * @param truncate Whether truncate long strings. If true, strings more than 20 characters will
    *   be truncated and all cells will be aligned right
    *
    * Differs from `Dataset#show` by wrapping its result into an effect-suspending `F[_]`.
    *
    * apache/spark
    */
  def show[F[_]](numRows: Int = 20, truncate: Boolean = true)(implicit F: SparkDelay[F]): F[Unit] =
    F.delay(dataset.show(numRows, truncate))

  /** Returns a new [[frameless.TypedDataset]] that only contains elements where `column` is `true`.
    *
    * Differs from `TypedDatasetForwarded#filter` by taking a `TypedColumn[T, Boolean]` instead of a
    * `T => Boolean`. Using a column expression instead of a regular function saves one Spark → Scala
    * deserialization which leads to better performance.
*/
  def filter(column: TypedColumn[T, Boolean]): TypedDataset[T] = {
    // Filter on the untyped frame, then re-attach the encoder of `T`.
    val kept = dataset.toDF().filter(column.untyped)
    TypedDataset.create[T](kept.as[T](TypedExpressionEncoder[T]))
  }

  /** Runs `func` on each element of this [[TypedDataset]].
    *
    * Differs from `Dataset#foreach` by wrapping its result into an effect-suspending `F[_]`.
    */
  def foreach[F[_]](func: T => Unit)(implicit F: SparkDelay[F]): F[Unit] =
    F.delay { dataset.foreach(func) }

  /** Runs `func` on each partition of this [[TypedDataset]].
    *
    * Differs from `Dataset#foreachPartition` by wrapping its result into an effect-suspending `F[_]`.
    */
  def foreachPartition[F[_]](func: Iterator[T] => Unit)(implicit F: SparkDelay[F]): F[Unit] =
    F.delay { dataset.foreachPartition(func) }

  /**
    * Create a multi-dimensional cube for the current [[TypedDataset]] using the specified column,
    * so we can run aggregation on it.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`.
    *
    * apache/spark
    */
  def cube[K1](c1: TypedColumn[T, K1]): Cube1Ops[K1, T] =
    new Cube1Ops[K1, T](this, c1)

  /**
    * Create a multi-dimensional cube for the current [[TypedDataset]] using the specified columns,
    * so we can run aggregation on them.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`.
    *
    * apache/spark
    */
  def cube[K1, K2](c1: TypedColumn[T, K1], c2: TypedColumn[T, K2]): Cube2Ops[K1, K2, T] =
    new Cube2Ops[K1, K2, T](this, c1, c2)

  /**
    * Create a multi-dimensional cube for the current [[TypedDataset]] using the specified columns,
    * so we can run aggregation on them.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
*
    * {{{
    *   case class MyClass(a: Int, b: Int, c: Int)
    *   val ds: TypedDataset[MyClass]
    *   val cubeDataset: TypedDataset[(Option[Int], Option[Int], Long)] =
    *     ds.cubeMany(ds('a), ds('b)).agg(count[MyClass]())
    *
    *   // original dataset:
    *     a     b    c
    *    10    20    1
    *    15    25    2
    *
    *   // after aggregation:
    *    _1    _2   _3
    *    15  null    1
    *    15    25    1
    *  null  null    2
    *  null    25    1
    *  null    20    1
    *    10  null    1
    *    10    20    1
    *
    * }}}
    *
    * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`.
    *
    * apache/spark
    */
  object cubeMany extends ProductArgs {
    // `groupedBy` is the HList of columns received through `ProductArgs`;
    // it is handed, together with this dataset, to `CubeManyOps`.
    def applyProduct[TK <: HList, K <: HList, KT](groupedBy: TK)
      (implicit
        i0: ColumnTypes.Aux[T, TK, K],
        i1: Tupler.Aux[K, KT],
        i2: ToTraversable.Aux[TK, List, UntypedExpression[T]]
      ): CubeManyOps[T, TK, K, KT] = new CubeManyOps[T, TK, K, KT](self, groupedBy)
  }

  /**
    * Groups the [[TypedDataset]] using the specified column, so that we can run aggregation on it.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * apache/spark
    */
  def groupBy[K1](
    c1: TypedColumn[T, K1]
  ): GroupedBy1Ops[K1, T] = new GroupedBy1Ops[K1, T](this, c1)

  /**
    * Groups the [[TypedDataset]] using the specified columns, so that we can run aggregation on them.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * apache/spark
    */
  def groupBy[K1, K2](
    c1: TypedColumn[T, K1],
    c2: TypedColumn[T, K2]
  ): GroupedBy2Ops[K1, K2, T] = new GroupedBy2Ops[K1, K2, T](this, c1, c2)

  /**
    * Groups the [[TypedDataset]] using the specified columns, so that we can run aggregation on them.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
*
    * {{{
    *   case class MyClass(a: Int, b: Int, c: Int)
    *   val ds: TypedDataset[MyClass]
    *
    *   val cubeDataset: TypedDataset[(Option[A], Option[B], Long)] =
    *     ds.groupByMany(ds('a), ds('b)).agg(count[MyClass]())
    *
    *   // original dataset:
    *     a     b    c
    *    10    20    1
    *    15    25    2
    *
    *   // after aggregation:
    *    _1    _2   _3
    *    10    20    1
    *    15    25    1
    *
    * }}}
    *
    * NOTE(review): the declared type in the example (`Option[A]`, `Option[B]`, value named
    * `cubeDataset`) looks copied from `cubeMany`; confirm the actual result type of a grouped `agg`.
    *
    * apache/spark
    */
  object groupByMany extends ProductArgs {
    // `groupedBy` is the HList of columns received through `ProductArgs`;
    // it is handed, together with this dataset, to `GroupedByManyOps`.
    def applyProduct[TK <: HList, K <: HList, KT](groupedBy: TK)
      (implicit
        i0: ColumnTypes.Aux[T, TK, K],
        i1: Tupler.Aux[K, KT],
        i2: ToTraversable.Aux[TK, List, UntypedExpression[T]]
      ): GroupedByManyOps[T, TK, K, KT] = new GroupedByManyOps[T, TK, K, KT](self, groupedBy)
  }

  /**
    * Create a multi-dimensional rollup for the current [[TypedDataset]] using the specified column,
    * so we can run aggregation on it.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`.
    *
    * apache/spark
    */
  def rollup[K1](
    c1: TypedColumn[T, K1]
  ): Rollup1Ops[K1, T] = new Rollup1Ops[K1, T](this, c1)

  /**
    * Create a multi-dimensional rollup for the current [[TypedDataset]] using the specified columns,
    * so we can run aggregation on them.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`.
    *
    * apache/spark
    */
  def rollup[K1, K2](
    c1: TypedColumn[T, K1],
    c2: TypedColumn[T, K2]
  ): Rollup2Ops[K1, K2, T] = new Rollup2Ops[K1, K2, T](this, c1, c2)

  /**
    * Create a multi-dimensional rollup for the current [[TypedDataset]] using the specified columns,
    * so we can run aggregation on them.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
*
    * {{{
    *   case class MyClass(a: Int, b: Int, c: Int)
    *   val ds: TypedDataset[MyClass]
    *
    *   val rollupDataset: TypedDataset[(Option[Int], Option[Int], Long)] =
    *     ds.rollupMany(ds('a), ds('b)).agg(count[MyClass]())
    *
    *   // original dataset:
    *     a     b    c
    *    10    20    1
    *    15    25    2
    *
    *   // after aggregation:
    *    _1    _2   _3
    *    15  null    1
    *    15    25    1
    *  null  null    2
    *    10  null    1
    *    10    20    1
    *
    * }}}
    *
    * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`.
    *
    * apache/spark
    */
  object rollupMany extends ProductArgs {
    // `groupedBy` is the HList of columns received through `ProductArgs`;
    // it is handed, together with this dataset, to `RollupManyOps`.
    def applyProduct[TK <: HList, K <: HList, KT](groupedBy: TK)
      (implicit
        i0: ColumnTypes.Aux[T, TK, K],
        i1: Tupler.Aux[K, KT],
        i2: ToTraversable.Aux[TK, List, UntypedExpression[T]]
      ): RollupManyOps[T, TK, K, KT] = new RollupManyOps[T, TK, K, KT](self, groupedBy)
  }

  /** Computes the cartesian product of `this` `Dataset` with the `other` `Dataset`.
    *
    * Implemented as a `joinWith` of type "cross" on a condition that is always true.
    */
  def joinCross[U](other: TypedDataset[U])
    (implicit e: TypedEncoder[(T, U)]): TypedDataset[(T, U)] =
      new TypedDataset(self.dataset.joinWith(other.dataset, new Column(Literal(true)), "cross"))

  /** Computes the full outer join of `this` `Dataset` with the `other` `Dataset`,
    * returning a `Tuple2` for each pair where condition evaluates to true.
    *
    * Both sides of the resulting pair are typed as `Option`.
    */
  def joinFull[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean])
    (implicit e: TypedEncoder[(Option[T], Option[U])]): TypedDataset[(Option[T], Option[U])] =
      new TypedDataset(self.dataset.joinWith(other.dataset, condition.untyped, "full")
        .as[(Option[T], Option[U])](TypedExpressionEncoder[(Option[T], Option[U])]))

  /** Computes the inner join of `this` `Dataset` with the `other` `Dataset`,
    * returning a `Tuple2` for each pair where condition evaluates to true.
*/
  def joinInner[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean])
    (implicit e: TypedEncoder[(T, U)]): TypedDataset[(T, U)] = {
      import FramelessInternals._
      // The join is assembled on logical plans rather than through `joinWith`,
      // so that `disambiguate` can rewrite `colLeft` / `colRight` references
      // in the condition before the dataset is created.
      val leftPlan = logicalPlan(dataset)
      val rightPlan = logicalPlan(other.dataset)
      val join = disambiguate(Join(leftPlan, rightPlan, Inner, Some(condition.expr), JoinHint.NONE))
      val joinedPlan = joinPlan(dataset, join, leftPlan, rightPlan)
      val joinedDs = mkDataset(dataset.sqlContext, joinedPlan, TypedExpressionEncoder[(T, U)])
      TypedDataset.create[(T, U)](joinedDs)
    }

  /** Computes the left outer join of `this` `Dataset` with the `other` `Dataset`,
    * returning a `Tuple2` for each pair where condition evaluates to true.
    *
    * The right side of the resulting pair is typed as `Option`.
    */
  def joinLeft[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean])
    (implicit e: TypedEncoder[(T, Option[U])]): TypedDataset[(T, Option[U])] =
      new TypedDataset(self.dataset.joinWith(other.dataset, condition.untyped, "left_outer")
        .as[(T, Option[U])](TypedExpressionEncoder[(T, Option[U])]))

  /** Computes the left semi join of `this` `Dataset` with the `other` `Dataset`.
    *
    * The result keeps the row type `T` of `this`; no `Tuple2` pairing with `other` is produced.
    */
  def joinLeftSemi[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean]): TypedDataset[T] =
    new TypedDataset(self.dataset.join(other.dataset, condition.untyped, "leftsemi")
      .as[T](TypedExpressionEncoder(encoder)))

  /** Computes the left anti join of `this` `Dataset` with the `other` `Dataset`.
    *
    * The result keeps the row type `T` of `this`; no `Tuple2` pairing with `other` is produced.
    */
  def joinLeftAnti[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean]): TypedDataset[T] =
    new TypedDataset(self.dataset.join(other.dataset, condition.untyped, "leftanti")
      .as[T](TypedExpressionEncoder(encoder)))

  /** Computes the right outer join of `this` `Dataset` with the `other` `Dataset`,
    * returning a `Tuple2` for each pair where condition evaluates to true.
*/
  def joinRight[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean])
    (implicit e: TypedEncoder[(Option[T], U)]): TypedDataset[(Option[T], U)] = {
      val joined = self.dataset.joinWith(other.dataset, condition.untyped, "right_outer")
      new TypedDataset(joined.as[(Option[T], U)](TypedExpressionEncoder[(Option[T], U)]))
    }

  /** Rewrites the `DisambiguateLeft` / `DisambiguateRight` markers found in the
    * join condition into attributes resolved by name against the matching side
    * of the analyzed join.
    */
  private def disambiguate(join: Join): Join = {
    val analyzed = FramelessInternals.ofRows(dataset.sparkSession, join).queryExecution.analyzed.asInstanceOf[Join]

    val rewritten = analyzed.condition.map { cond =>
      cond.transform {
        case FramelessInternals.DisambiguateLeft(tagged: AttributeReference) =>
          FramelessInternals.resolveExpr(FramelessInternals.ofRows(spark, analyzed.left), Seq(tagged.name))

        case FramelessInternals.DisambiguateRight(tagged: AttributeReference) =>
          FramelessInternals.resolveExpr(FramelessInternals.ofRows(spark, analyzed.right), Seq(tagged.name))

        case untouched => untouched
      }
    }

    analyzed.copy(condition = rewritten)
  }

  /** Takes a function from A => R and converts it to a UDF for TypedColumn[T, A] => TypedColumn[T, R].
    */
  def makeUDF[A: TypedEncoder, R: TypedEncoder](f: A => R):
    TypedColumn[T, A] => TypedColumn[T, R] =
      functions.udf(f)

  /** Takes a function from (A1, A2) => R and converts it to a UDF for
    * (TypedColumn[T, A1], TypedColumn[T, A2]) => TypedColumn[T, R].
    */
  def makeUDF[A1: TypedEncoder, A2: TypedEncoder, R: TypedEncoder](f: (A1, A2) => R):
    (TypedColumn[T, A1], TypedColumn[T, A2]) => TypedColumn[T, R] =
      functions.udf(f)

  /** Takes a function from (A1, A2, A3) => R and converts it to a UDF for
    * (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3]) => TypedColumn[T, R].
*/
  def makeUDF[A1: TypedEncoder, A2: TypedEncoder, A3: TypedEncoder, R: TypedEncoder](f: (A1, A2, A3) => R):
    (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3]) => TypedColumn[T, R] =
      functions.udf(f)

  /** Takes a function from (A1, A2, A3, A4) => R and converts it to a UDF for
    * (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4]) => TypedColumn[T, R].
    */
  def makeUDF[A1: TypedEncoder, A2: TypedEncoder, A3: TypedEncoder, A4: TypedEncoder, R: TypedEncoder](f: (A1, A2, A3, A4) => R):
    (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4]) => TypedColumn[T, R] =
      functions.udf(f)

  /** Takes a function from (A1, A2, A3, A4, A5) => R and converts it to a UDF for
    * (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4], TypedColumn[T, A5]) => TypedColumn[T, R].
    */
  def makeUDF[A1: TypedEncoder, A2: TypedEncoder, A3: TypedEncoder, A4: TypedEncoder, A5: TypedEncoder, R: TypedEncoder](f: (A1, A2, A3, A4, A5) => R):
    (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4], TypedColumn[T, A5]) => TypedColumn[T, R] =
      functions.udf(f)

  /** Type-safe projection from type T to A (the intermediate `Tuple1[A]` is unpacked).
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A](
    ca: TypedColumn[T, A]
  ): TypedDataset[A] = {
    implicit val encA: TypedEncoder[A] = ca.uencoder

    // `selectMany` always produces a tuple, so a single column comes back as `Tuple1[A]`.
    val wrapped: TypedDataset[Tuple1[A]] = selectMany(ca)

    // Unpack `Tuple1[A]` to `A`.
    val unwrapped = TypedEncoder[A].catalystRepr match {
      case StructType(_) =>
        // a struct column is replaced by all of its fields
        wrapped.dataset.selectExpr("_1.*").as[A](TypedExpressionEncoder[A])
      case _ =>
        // for primitive types `Tuple1[A]` has the same schema as `A`
        wrapped.dataset.as[A](TypedExpressionEncoder[A])
    }

    TypedDataset.create(unwrapped)
  }

  /** Type-safe projection from type T to Tuple2[A,B]
    * {{{
    *   d.select( d('a), d('a)+d('b), ...
)
    * }}}
    */
  def select[A, B](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B]
  ): TypedDataset[(A, B)] = {
    // The column encoders are put in implicit scope so the tuple encoder can be derived.
    implicit val encA: TypedEncoder[A] = ca.uencoder
    implicit val encB: TypedEncoder[B] = cb.uencoder

    selectMany(ca, cb)
  }

  /** Type-safe projection from type T to Tuple3[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C]
  ): TypedDataset[(A, B, C)] = {
    implicit val encA: TypedEncoder[A] = ca.uencoder
    implicit val encB: TypedEncoder[B] = cb.uencoder
    implicit val encC: TypedEncoder[C] = cc.uencoder

    selectMany(ca, cb, cc)
  }

  /** Type-safe projection from type T to Tuple4[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C, D](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D]
  ): TypedDataset[(A, B, C, D)] = {
    implicit val encA: TypedEncoder[A] = ca.uencoder
    implicit val encB: TypedEncoder[B] = cb.uencoder
    implicit val encC: TypedEncoder[C] = cc.uencoder
    implicit val encD: TypedEncoder[D] = cd.uencoder

    selectMany(ca, cb, cc, cd)
  }

  /** Type-safe projection from type T to Tuple5[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C, D, E](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E]
  ): TypedDataset[(A, B, C, D, E)] = {
    implicit val encA: TypedEncoder[A] = ca.uencoder
    implicit val encB: TypedEncoder[B] = cb.uencoder
    implicit val encC: TypedEncoder[C] = cc.uencoder
    implicit val encD: TypedEncoder[D] = cd.uencoder
    implicit val encE: TypedEncoder[E] = ce.uencoder

    selectMany(ca, cb, cc, cd, ce)
  }

  /** Type-safe projection from type T to Tuple6[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C, D, E, F](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E],
    cf: TypedColumn[T, F]
  ): TypedDataset[(A, B, C, D, E, F)] = {
    implicit val encA: TypedEncoder[A] = ca.uencoder
    implicit val encB: TypedEncoder[B] = cb.uencoder
    implicit val encC: TypedEncoder[C] = cc.uencoder
    implicit val encD: TypedEncoder[D] = cd.uencoder
    implicit val encE: TypedEncoder[E] = ce.uencoder
    implicit val encF: TypedEncoder[F] = cf.uencoder

    selectMany(ca, cb, cc, cd, ce, cf)
  }

  /** Type-safe projection from type T to Tuple7[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ...
)
    * }}}
    */
  def select[A, B, C, D, E, F, G](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E],
    cf: TypedColumn[T, F],
    cg: TypedColumn[T, G]
  ): TypedDataset[(A, B, C, D, E, F, G)] = {
    // The column encoders are put in implicit scope so the tuple encoder can be derived.
    implicit val encA: TypedEncoder[A] = ca.uencoder
    implicit val encB: TypedEncoder[B] = cb.uencoder
    implicit val encC: TypedEncoder[C] = cc.uencoder
    implicit val encD: TypedEncoder[D] = cd.uencoder
    implicit val encE: TypedEncoder[E] = ce.uencoder
    implicit val encF: TypedEncoder[F] = cf.uencoder
    implicit val encG: TypedEncoder[G] = cg.uencoder

    selectMany(ca, cb, cc, cd, ce, cf, cg)
  }

  /** Type-safe projection from type T to Tuple8[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C, D, E, F, G, H](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E],
    cf: TypedColumn[T, F],
    cg: TypedColumn[T, G],
    ch: TypedColumn[T, H]
  ): TypedDataset[(A, B, C, D, E, F, G, H)] = {
    implicit val encA: TypedEncoder[A] = ca.uencoder
    implicit val encB: TypedEncoder[B] = cb.uencoder
    implicit val encC: TypedEncoder[C] = cc.uencoder
    implicit val encD: TypedEncoder[D] = cd.uencoder
    implicit val encE: TypedEncoder[E] = ce.uencoder
    implicit val encF: TypedEncoder[F] = cf.uencoder
    implicit val encG: TypedEncoder[G] = cg.uencoder
    implicit val encH: TypedEncoder[H] = ch.uencoder

    selectMany(ca, cb, cc, cd, ce, cf, cg, ch)
  }

  /** Type-safe projection from type T to Tuple9[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C, D, E, F, G, H, I](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E],
    cf: TypedColumn[T, F],
    cg: TypedColumn[T, G],
    ch: TypedColumn[T, H],
    ci: TypedColumn[T, I]
  ): TypedDataset[(A, B, C, D, E, F, G, H, I)] = {
    implicit val encA: TypedEncoder[A] = ca.uencoder
    implicit val encB: TypedEncoder[B] = cb.uencoder
    implicit val encC: TypedEncoder[C] = cc.uencoder
    implicit val encD: TypedEncoder[D] = cd.uencoder
    implicit val encE: TypedEncoder[E] = ce.uencoder
    implicit val encF: TypedEncoder[F] = cf.uencoder
    implicit val encG: TypedEncoder[G] = cg.uencoder
    implicit val encH: TypedEncoder[H] = ch.uencoder
    implicit val encI: TypedEncoder[I] = ci.uencoder

    selectMany(ca, cb, cc, cd, ce, cf, cg, ch, ci)
  }

  /** Type-safe projection from type T to Tuple10[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ...
)
    * }}}
    */
  def select[A, B, C, D, E, F, G, H, I, J](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E],
    cf: TypedColumn[T, F],
    cg: TypedColumn[T, G],
    ch: TypedColumn[T, H],
    ci: TypedColumn[T, I],
    cj: TypedColumn[T, J]
  ): TypedDataset[(A, B, C, D, E, F, G, H, I, J)] = {
    // Declare each column's encoder as a local implicit before delegating to selectMany.
    implicit val (ea, eb, ec, ed, ee, ef, eg, eh, ei, ej) =
      (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder, ce.uencoder, cf.uencoder, cg.uencoder, ch.uencoder, ci.uencoder, cj.uencoder)

    selectMany(ca, cb, cc, cd, ce, cf, cg, ch, ci, cj)
  }

  /** Projection over any number of columns; the fixed-arity `select` overloads
    * all delegate here.
    *
    * The column arguments are received as an HList `U`. Each one is converted
    * to an untyped Spark `Column`, the underlying DataFrame is projected onto
    * them, and the result is re-typed as `Out` using the implicit encoder `i3`.
    */
  object selectMany extends ProductArgs {
    def applyProduct[U <: HList, Out0 <: HList, Out](columns: U)
      (implicit
        i0: ColumnTypes.Aux[T, U, Out0],
        i1: ToTraversable.Aux[U, List, UntypedExpression[T]],
        i2: Tupler.Aux[Out0, Out],
        i3: TypedEncoder[Out]
      ): TypedDataset[Out] = {
        // One Spark Column per requested expression, in argument order.
        val projections = columns
          .toList[UntypedExpression[T]]
          .map(untyped => new Column(untyped.expr))

        val projected = dataset.toDF().select(projections: _*)

        TypedDataset.create[Out](projected.as[Out](TypedExpressionEncoder[Out]))
      }
  }

  /** Sorts every partition of the dataset by one column. */
  def sortWithinPartitions[A: CatalystOrdered](ca: SortedTypedColumn[T, A]): TypedDataset[T] =
    sortWithinPartitionsMany(ca)

  /** Sorts every partition of the dataset by two columns. */
  def sortWithinPartitions[A: CatalystOrdered, B: CatalystOrdered](
    ca: SortedTypedColumn[T, A],
    cb: SortedTypedColumn[T, B]
  ): TypedDataset[T] = sortWithinPartitionsMany(ca, cb)

  /** Sorts every partition of the dataset by three columns. */
  def sortWithinPartitions[A: CatalystOrdered, B: CatalystOrdered, C: CatalystOrdered](
    ca: SortedTypedColumn[T, A],
    cb: SortedTypedColumn[T, B],
    cc: SortedTypedColumn[T, C]
  ): TypedDataset[T] = sortWithinPartitionsMany(ca, cb, cc)

  /** Sort each partition in the dataset by the given column expressions
    * Default sort order is ascending.
* {{{
    *   d.sortWithinPartitionsMany(d('a), d('b).desc, d('c).asc)
    * }}}
    */
  object sortWithinPartitionsMany extends ProductArgs {
    def applyProduct[U <: HList, O <: HList](columns: U)
      (implicit
        i0: Mapper.Aux[SortedTypedColumn.defaultAscendingPoly.type, U, O],
        i1: ToTraversable.Aux[O, List, SortedTypedColumn[T, _]]
      ): TypedDataset[T] = {
        // Map every argument through defaultAscendingPoly, then drop down to untyped Columns.
        val sortKeys = i0(columns)
          .toList[SortedTypedColumn[T, _]]
          .map(sortColumn => sortColumn.untyped)

        val partitionSorted = dataset.toDF().sortWithinPartitions(sortKeys: _*)

        TypedDataset.create[T](partitionSorted.as[T](TypedExpressionEncoder[T]))
      }
  }

  /** Orders the TypedDataset by one column. */
  def orderBy[A: CatalystOrdered](ca: SortedTypedColumn[T, A]): TypedDataset[T] =
    orderByMany(ca)

  /** Orders the TypedDataset by two columns. */
  def orderBy[A: CatalystOrdered, B: CatalystOrdered](
    ca: SortedTypedColumn[T, A],
    cb: SortedTypedColumn[T, B]
  ): TypedDataset[T] = orderByMany(ca, cb)

  /** Orders the TypedDataset by three columns. */
  def orderBy[A: CatalystOrdered, B: CatalystOrdered, C: CatalystOrdered](
    ca: SortedTypedColumn[T, A],
    cb: SortedTypedColumn[T, B],
    cc: SortedTypedColumn[T, C]
  ): TypedDataset[T] = orderByMany(ca, cb, cc)

  /** Sort the dataset by any number of column expressions.
    * Default sort order is ascending.
    * {{{
    *   d.orderByMany(d('a), d('b).desc, d('c).asc)
    * }}}
    */
  object orderByMany extends ProductArgs {
    def applyProduct[U <: HList, O <: HList](columns: U)
      (implicit
        i0: Mapper.Aux[SortedTypedColumn.defaultAscendingPoly.type, U, O],
        i1: ToTraversable.Aux[O, List, SortedTypedColumn[T, _]]
      ): TypedDataset[T] = {
        // Map every argument through defaultAscendingPoly, then drop down to untyped Columns.
        val sortKeys = i0(columns)
          .toList[SortedTypedColumn[T, _]]
          .map(sortColumn => sortColumn.untyped)

        val ordered = dataset.toDF().orderBy(sortKeys: _*)

        TypedDataset.create[T](ordered.as[T](TypedExpressionEncoder[T]))
      }
  }

  /** Returns a new Dataset as a tuple with the specified
    * column dropped.
    * Does not allow for dropping from a single column TypedDataset
    *
    * {{{
    *   val d: TypedDataset[Foo(a: String, b: Int...)] = ???
*   val result: TypedDataset[(Int, ...)] = d.dropTupled('a)
    * }}}
    * @param column column to drop specified as a Symbol
    * @param i0 LabelledGeneric derived for T
    * @param i1 Remover derived for TRep and column
    * @param i2 values of T with column removed
    * @param i3 tupler of values
    * @param i4 evidence of encoder of the tupled values
    * @tparam Out Tupled return type
    * @tparam TRep shapeless' record representation of T
    * @tparam Removed record of T with column removed
    * @tparam ValuesFromRemoved values of T with column removed as an HList
    * @tparam V value type of column in T
    * @return a dataset of `Out` built from the DataFrame with `column` dropped
    */
  def dropTupled[Out, TRep <: HList, Removed <: HList, ValuesFromRemoved <: HList, V]
    (column: Witness.Lt[Symbol])
    (implicit
      i0: LabelledGeneric.Aux[T, TRep],
      i1: Remover.Aux[TRep, column.T, (V, Removed)],
      i2: Values.Aux[Removed, ValuesFromRemoved],
      i3: Tupler.Aux[ValuesFromRemoved, Out],
      i4: TypedEncoder[Out]
    ): TypedDataset[Out] = {
      // The drop itself is a plain by-name DataFrame drop; the implicits only shape `Out`.
      val withoutColumn = dataset.toDF().drop(column.value.name)

      TypedDataset.create[Out](withoutColumn.as[Out](TypedExpressionEncoder[Out]))
    }

  /**
    * Drops columns as necessary to return `U`
    *
    * Implemented as a call to `project[U]`.
    *
    * @example
    * {{{
    *   case class X(i: Int, j: Int, k: Boolean)
    *   case class Y(i: Int, k: Boolean)
    *   val f: TypedDataset[X] = ???
    *   val fNew: TypedDataset[Y] = f.drop[Y]
    * }}}
    *
    * @tparam U the output type
    *
    * @see [[frameless.TypedDataset#project]]
    */
  def drop[U](implicit projector: SmartProject[T,U]): TypedDataset[U] = project[U]

  /** Appends a new column to the Dataset.
*
    * {{{
    *   case class X(i: Int, j: Int)
    *   val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil)
    *   val fNew: TypedDataset[(Int,Int,Boolean)] = f.withColumnTupled(f('j) === 10)
    * }}}
    */
  def withColumnTupled[A: TypedEncoder, H <: HList, FH <: HList, Out]
    (ca: TypedColumn[T, A])
    (implicit
      i0: Generic.Aux[T, H],
      i1: Prepend.Aux[H, A :: HNil, FH],
      i2: Tupler.Aux[FH, Out],
      i3: TypedEncoder[Out]
    ): TypedDataset[Out] = {
      // Throwaway name for the added column (the proper name will be given by the Tuple-based encoder)
      val placeholderName = "I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMI"
      val widened = dataset.toDF().withColumn(placeholderName, ca.untyped)

      TypedDataset.create[Out](widened.as[Out](TypedExpressionEncoder[Out]))
    }

  /** Returns a new [[frameless.TypedDataset]] with the specified column updated with a new value
    * {{{
    *   case class X(i: Int, j: Int)
    *   val f: TypedDataset[X] = TypedDataset.create(X(1,10) :: Nil)
    *   val fNew: TypedDataset[X] = f.withColumnReplaced('j, f('i)) // results in X(1, 1) :: Nil
    * }}}
    * @param column column given as a symbol to replace
    * @param replacement column to replace the value with
    * @param i0 Evidence that a column with the correct type and name exists
    */
  def withColumnReplaced[A](
    column: Witness.Lt[Symbol],
    replacement: TypedColumn[T, A]
  )(implicit
    i0: TypedColumn.Exists[T, column.T, A]
  ): TypedDataset[T] = {
    // Reusing the existing column name makes withColumn overwrite that column.
    val replaced = dataset.toDF().withColumn(column.value.name, replacement.untyped)

    TypedDataset.create[T](replaced.as[T](TypedExpressionEncoder[T]))
  }

  /** Adds a column to a Dataset so long as the specified output type, `U`, has
    * an extra column from `T` that has type `A`.
*
    * @example
    * {{{
    *   case class X(i: Int, j: Int)
    *   case class Y(i: Int, j: Int, k: Boolean)
    *   val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil)
    *   val fNew: TypedDataset[Y] = f.withColumn[Y](f('j) === 10)
    * }}}
    * @param ca The typed column to add
    * @param i0 TypedEncoder for output type U
    * @param i1 TypedEncoder for added column type A
    * @param i2 the LabelledGeneric derived for T
    * @param i3 the LabelledGeneric derived for U
    * @param i4 proof no fields have been removed
    * @param i5 diff from T to U
    * @param i6 keys from newFields
    * @param i7 the one and only new key
    * @param i8 the one and only new field enforcing the type of A exists
    * @param i9 the keys of U
    * @param iA allows for traversing the keys of U
    * @tparam U the output type
    * @tparam A The added column type
    * @tparam TRep shapeless' record representation of T
    * @tparam URep shapeless' record representation of U
    * @tparam UKeys the keys of U as an HList
    * @tparam NewFields the added fields to T to get U
    * @tparam NewKeys the keys of NewFields as an HList
    * @tparam NewKey the first, and only, key in NewKeys
    *
    * @see [[frameless.TypedDataset.WithColumnApply#apply]]
    */
  def withColumn[U] = new WithColumnApply[U]

  /** Second stage of `withColumn[U]`: `U` is fixed by the caller, every other
    * type parameter of `apply` is inferred from the column argument and the implicits.
    */
  class WithColumnApply[U] {
    def apply[A, TRep <: HList, URep <: HList, UKeys <: HList, NewFields <: HList, NewKeys <: HList, NewKey <: Symbol]
      (ca: TypedColumn[T, A])
      (implicit
        i0: TypedEncoder[U],
        i1: TypedEncoder[A],
        i2: LabelledGeneric.Aux[T, TRep],
        i3: LabelledGeneric.Aux[U, URep],
        i4: Diff.Aux[TRep, URep, HNil],
        i5: Diff.Aux[URep, TRep, NewFields],
        i6: Keys.Aux[NewFields, NewKeys],
        i7: IsHCons.Aux[NewKeys, NewKey, HNil],
        i8: IsHCons.Aux[NewFields, FieldType[NewKey, A], HNil],
        i9: Keys.Aux[URep, UKeys],
        iA: ToTraversable.Aux[UKeys, Seq, Symbol]
      ): TypedDataset[U] = {
        // Name of the single field that U has and T lacks.
        val addedName = i7.head(i6()).name
        val widened = dataset.toDF().withColumn(addedName, ca.untyped)

        // Re-select the columns following the key order of U.
        val columnsInUOrder = i9().to[Seq].map(key => widened.col(key.name))
        val projected = widened.select(columnsInUOrder: _*)

        TypedDataset.create[U](projected.as[U](TypedExpressionEncoder[U]))
      }
  }

  /**
    * Explodes a single column at a time. It only compiles if the type of column supports this operation.
    *
    * The named column is overwritten in place with the result of Spark's `explode`.
    *
    * @example
    *
    * {{{
    *   case class X(i: Int, j: Array[Int])
    *   case class Y(i: Int, j: Int)
    *
    *   val f: TypedDataset[X] = ???
    *   val fNew: TypedDataset[Y] = f.explode('j).as[Y]
    * }}}
    * @param column the column we wish to explode
    */
  def explode[A, TRep <: HList, V[_], OutMod <: HList, OutModValues <: HList, Out]
  (column: Witness.Lt[Symbol])
  (implicit
   i0: TypedColumn.Exists[T, column.T, V[A]],
   i1: TypedEncoder[A],
   i2: CatalystExplodableCollection[V],
   i3: LabelledGeneric.Aux[T, TRep],
   i4: Modifier.Aux[TRep, column.T, V[A], A, OutMod],
   i5: Values.Aux[OutMod, OutModValues],
   i6: Tupler.Aux[OutModValues, Out],
   i7: TypedEncoder[Out]
  ): TypedDataset[Out] = {
    import org.apache.spark.sql.functions.{explode => sparkExplode}

    val frame = dataset.toDF()
    val target = column.value.name
    val flattened = frame.withColumn(target, sparkExplode(frame(target)))

    TypedDataset.create[Out](flattened.as[Out](TypedExpressionEncoder[Out]))
  }

  /**
    * Explodes a single column at a time. It only compiles if the type of column supports this operation.
    *
    * @example
    *
    * {{{
    *   case class X(i: Int, j: Map[Int, Int])
    *   case class Y(i: Int, j: (Int, Int))
    *
    *   val f: TypedDataset[X] = ???
*   val fNew: TypedDataset[Y] = f.explodeMap('j).as[Y]
    * }}}
    * @param column the column we wish to explode
    */
  def explodeMap[A, B, V[_, _], TRep <: HList, OutMod <: HList, OutModValues <: HList, Out]
  (column: Witness.Lt[Symbol])
  (implicit
   i0: TypedColumn.Exists[T, column.T, V[A, B]],
   i1: TypedEncoder[A],
   i2: TypedEncoder[B],
   i3: LabelledGeneric.Aux[T, TRep],
   i4: Modifier.Aux[TRep, column.T, V[A,B], (A, B), OutMod],
   i5: Values.Aux[OutMod, OutModValues],
   i6: Tupler.Aux[OutModValues, Out],
   i7: TypedEncoder[Out]
  ): TypedDataset[Out] = {
    import org.apache.spark.sql.functions.{explode => sparkExplode, struct => sparkStruct, col => sparkCol}
    val df = dataset.toDF()

    // Overall plan: keep every original column and replace the map column by a
    // struct built from the "key" and "value" columns that the map explode produces.
    // A plain .withColumn(column.value.name, sparkExplode(df(column.value.name)))
    // would not work here, since the map explode produces two columns.
    val columnNames = df.columns.toSeq
    val columnNamesRenamed = columnNames.map(c => s"frameless_$c")
    // Column references to the renamed columns, in the original column order.
    val columns = columnNamesRenamed.map(sparkCol)

    val columnRenamed = s"frameless_${column.value.name}"
    // Exploding a map adds "key" and "value" columns to the Row, which could
    // collide with columns of the same name already present in the Row.
    // Every original column is therefore renamed with a "frameless_" prefix first.
    val dfr = df.toDF(columnNamesRenamed: _*)
    val exploded = dfr.select(sparkCol("*"), sparkExplode(dfr(columnRenamed)))
    val trans =
      exploded
        // The explode yields separate [key, value] columns;
        // wrapping them in a struct is how they are put back into a single column.
        .withColumn(columnRenamed, sparkStruct(exploded("key"), exploded("value")))
        // Keep only the (renamed) original columns; the leftover [key, value] columns are dropped.
        .select(columns: _*)
        // Restore the original column names and form the result.
        .toDF(columnNames: _*)
        .as[Out](TypedExpressionEncoder[Out])
    TypedDataset.create[Out](trans)
  }

  /**
    * Flattens a column
of type Option[A]. Compiles only if the selected column is of type Option[A].
    *
    * Rows in which the selected column is null are filtered out.
    *
    * @example
    *
    * {{{
    *   case class X(i: Int, j: Option[Int])
    *   case class Y(i: Int, j: Int)
    *
    *   val f: TypedDataset[X] = ???
    *   val fNew: TypedDataset[Y] = f.flattenOption('j).as[Y]
    * }}}
    *
    * @param column the column we wish to flatten
    */
  def flattenOption[A, TRep <: HList, V[_], OutMod <: HList, OutModValues <: HList, Out]
  (column: Witness.Lt[Symbol])
  (implicit
   i0: TypedColumn.Exists[T, column.T, V[A]],
   i1: TypedEncoder[A],
   i2: V[A] =:= Option[A],
   i3: LabelledGeneric.Aux[T, TRep],
   i4: Modifier.Aux[TRep, column.T, V[A], A, OutMod],
   i5: Values.Aux[OutMod, OutModValues],
   i6: Tupler.Aux[OutModValues, Out],
   i7: TypedEncoder[Out]
  ): TypedDataset[Out] = {
    val frame = dataset.toDF()
    val isPresent = frame(column.value.name).isNotNull
    val nonEmptyRows = frame.filter(isPresent)

    TypedDataset.create[Out](nonEmptyRows.as[Out](TypedExpressionEncoder[Out]))
  }
}

object TypedDataset {
  /** Builds a [[TypedDataset]] from a local `Seq`, using the implicit `SparkSession`
    * and the Spark encoder obtained from `TypedExpressionEncoder[A]`.
    */
  def create[A](data: Seq[A])
    (implicit
      encoder: TypedEncoder[A],
      sqlContext: SparkSession
    ): TypedDataset[A] = {
      val fromSeq = sqlContext.createDataset(data)(TypedExpressionEncoder[A])

      TypedDataset.create[A](fromSeq)
    }

  /** Builds a [[TypedDataset]] from an `RDD`, using the implicit `SparkSession`
    * and the Spark encoder obtained from `TypedExpressionEncoder[A]`.
    */
  def create[A](data: RDD[A])
    (implicit
      encoder: TypedEncoder[A],
      sqlContext: SparkSession
    ): TypedDataset[A] = {
      val fromRdd = sqlContext.createDataset(data)(TypedExpressionEncoder[A])

      TypedDataset.create[A](fromRdd)
    }

  /** Builds a [[TypedDataset]] from an existing Spark `Dataset` by converting it
    * to a DataFrame and handing it to `createUnsafe`.
    */
  def create[A: TypedEncoder](dataset: Dataset[A]): TypedDataset[A] =
    createUnsafe(dataset.toDF())

  /**
    * Creates a [[frameless.TypedDataset]] from a Spark [[org.apache.spark.sql.DataFrame]].
    * Note that the names and types need to align!
    *
    * This is an unsafe operation: If the schemas do not align,
    * the error will be captured at runtime (not during compilation).
*
    * Column names are aligned with the target schema as follows: when all target
    * names are present in `df` but at different positions, the columns are
    * selected by name; when names differ otherwise, the columns are renamed
    * positionally; when names already match position by position, `df` is used as is.
    *
    * @param df the DataFrame to wrap
    * @throws IllegalStateException if `df` and the target schema have a different number of columns
    */
  def createUnsafe[A: TypedEncoder](df: DataFrame): TypedDataset[A] = {
    val encoder = TypedEncoder[A]
    val analyzedOutput: Seq[Attribute] = df.queryExecution.analyzed.output

    val expectedFields = TypedExpressionEncoder.targetStructType(encoder)
    val expectedNames: Seq[String] = expectedFields.map(_.name)

    if (analyzedOutput.size != expectedFields.size)
      throw new IllegalStateException(
        s"Unsupported creation of TypedDataset with ${expectedFields.size} column(s) " +
          s"from a DataFrame with ${analyzedOutput.size} columns. " +
          "Try to `select()` the proper columns in the right order before calling `create()`.")

    val actualNames = analyzedOutput.map(_.name)

    // Names are compared position by position (note: types still might not match)
    val namesDiffer = actualNames.zip(expectedNames).exists {
      case (actual, expected) => actual != expected
    }
    val allExpectedPresent = expectedNames.toSet.subsetOf(actualNames.toSet)

    val aligned =
      if (!namesDiffer) df
      else if (allExpectedPresent) df.select(expectedNames.head, expectedNames.tail: _*)
      else df.toDF(expectedNames: _*)

    new TypedDataset[A](aligned.as[A](TypedExpressionEncoder[A]))
  }

  /** Prefer `TypedDataset.create` over `TypedDataset.unsafeCreate` unless you
    * know what you are doing.
*
    * Wraps `dataset` directly in a new [[TypedDataset]]; no column renaming or
    * reordering is applied here.
    */
  @deprecated("Prefer TypedDataset.create over TypedDataset.unsafeCreate", "0.3.0")
  def unsafeCreate[A: TypedEncoder](dataset: Dataset[A]): TypedDataset[A] = {
    new TypedDataset[A](dataset)
  }
}

================================================
FILE: dataset/src/main/scala/frameless/TypedDatasetForwarded.scala
================================================
package frameless

import java.util

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.streaming.DataStreamWriter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, DataFrameWriter, SQLContext, SparkSession}
import org.apache.spark.storage.StorageLevel

import scala.util.Random

/** This trait implements [[TypedDataset]] methods that have the same signature
  * as their `Dataset` equivalent. Each method simply forwards the call to the
  * underlying `Dataset`.
  *
  * Documentation marked "apache/spark" is thanks to apache/spark Contributors
  * at https://github.com/apache/spark, licensed under Apache v2.0 available at
  * http://www.apache.org/licenses/LICENSE-2.0
  */
trait TypedDatasetForwarded[T] { self: TypedDataset[T] =>

  /** Returns the `toString` of the underlying `Dataset`. */
  override def toString: String = dataset.toString

  /**
    * Returns a `SparkSession` from this [[TypedDataset]].
    */
  def sparkSession: SparkSession = dataset.sparkSession

  /**
    * Returns a `SQLContext` from this [[TypedDataset]].
    */
  def sqlContext: SQLContext = dataset.sqlContext

  /**
    * Returns the schema of this Dataset.
    *
    * apache/spark
    */
  def schema: StructType = dataset.schema

  /** Prints the schema of the underlying `Dataset` to the console in a nice tree format.
    *
    * apache/spark
    */
  def printSchema(): Unit = dataset.printSchema()

  /** Prints the plans (logical and physical) to the console for debugging purposes.
    *
    * @param extended forwarded unchanged to `Dataset#explain`; defaults to `false`
    *
    * apache/spark
    */
  def explain(extended: Boolean = false): Unit = dataset.explain(extended)

  /**
    * Returns a `QueryExecution` from this [[TypedDataset]].
*
    * It is the primary workflow for executing relational queries using Spark.  Designed to allow easy
    * access to the intermediate phases of query execution for developers.
    *
    * apache/spark
    */
  def queryExecution: QueryExecution = dataset.queryExecution

  /** Converts this strongly typed collection of data to generic Dataframe.  In contrast to the
    * strongly typed objects that Dataset operations work on, a Dataframe returns generic Row
    * objects that allow fields to be accessed by ordinal or name.
    *
    * apache/spark
    */
  def toDF(): DataFrame = dataset.toDF()

  /** Converts this [[TypedDataset]] to an RDD.
    *
    * apache/spark
    */
  def rdd: RDD[T] = dataset.rdd

  /** Returns a new [[TypedDataset]] that has exactly `numPartitions` partitions.
    *
    * The repartitioned `Dataset` is wrapped again through `TypedDataset.create`.
    *
    * apache/spark
    */
  def repartition(numPartitions: Int): TypedDataset[T] =
    TypedDataset.create(dataset.repartition(numPartitions))

  /**
    * Get the [[TypedDataset]]'s current storage level, or StorageLevel.NONE if not persisted.
    *
    * apache/spark
    */
  def storageLevel(): StorageLevel = dataset.storageLevel

  /**
    * Returns the content of the [[TypedDataset]] as a Dataset of JSON strings.
    *
    * The resulting `Dataset[String]` is wrapped through `TypedDataset.create`.
    *
    * apache/spark
    */
  def toJSON: TypedDataset[String] =
    TypedDataset.create(dataset.toJSON)

  /**
    * Interface for saving the content of the non-streaming [[TypedDataset]] out into external storage.
    *
    * Returns the writer of the underlying `Dataset` as is.
    *
    * apache/spark
    */
  def write: DataFrameWriter[T] = dataset.write

  /**
    * Interface for saving the content of the streaming Dataset out into external storage.
    *
    * Returns the stream writer of the underlying `Dataset` as is.
    *
    * apache/spark
    */
  def writeStream: DataStreamWriter[T] = dataset.writeStream

  /** Returns a new [[TypedDataset]] that has exactly `numPartitions` partitions.
    * Similar to coalesce defined on an RDD, this operation results in a narrow dependency, e.g.
    * if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of
    * the 100 new partitions will claim 10 of the current partitions.
*
    * apache/spark
    */
  def coalesce(numPartitions: Int): TypedDataset[T] =
    TypedDataset.create(dataset.coalesce(numPartitions))

  /**
    * Returns an `Array` that contains all column names in this [[TypedDataset]].
    */
  def columns: Array[String] = dataset.columns

  /** Concise syntax for chaining custom transformations.
    *
    * Simply applies `t` to this [[TypedDataset]] and returns its result.
    *
    * apache/spark
    */
  def transform[U](t: TypedDataset[T] => TypedDataset[U]): TypedDataset[U] = t(this)

  /** Returns a new Dataset by taking the first `n` rows. The difference between this function
    * and `head` is that `head` is an action and returns an array (by triggering query execution)
    * while `limit` returns a new Dataset.
    *
    * apache/spark
    */
  def limit(n: Int): TypedDataset[T] =
    TypedDataset.create(dataset.limit(n))

  /** Returns a new [[TypedDataset]] by sampling a fraction of records.
    *
    * @param withReplacement forwarded unchanged to `Dataset#sample`
    * @param fraction forwarded unchanged to `Dataset#sample`
    * @param seed forwarded unchanged to `Dataset#sample`; when omitted, a new
    *             `Random.nextLong()` is drawn for the call
    *
    * apache/spark
    */
  def sample(withReplacement: Boolean, fraction: Double, seed: Long = Random.nextLong()): TypedDataset[T] =
    TypedDataset.create(dataset.sample(withReplacement, fraction, seed))

  /** Returns a new [[TypedDataset]] that contains only the unique elements of this [[TypedDataset]].
    *
    * Note that, equality checking is performed directly on the encoded representation of the data
    * and thus is not affected by a custom `equals` function defined on `T`.
    *
    * apache/spark
    */
  def distinct: TypedDataset[T] =
    TypedDataset.create(dataset.distinct())

  /**
    * Returns a best-effort snapshot of the files that compose this [[TypedDataset]]. This method simply
    * asks each constituent BaseRelation for its respective files and takes the union of all results.
    * Depending on the source relations, this may not find all input files. Duplicates are removed.
    *
    * apache/spark
    */
  def inputFiles: Array[String] = dataset.inputFiles

  /**
    * Returns true if the `collect` and `take` methods can be run locally
    * (without any Spark executors).
*
    * apache/spark
    */
  def isLocal: Boolean = dataset.isLocal

  /**
    * Returns true if this [[TypedDataset]] contains one or more sources that continuously
    * return data as it arrives. A [[TypedDataset]] that reads data from a streaming source
    * must be executed as a `StreamingQuery` using the `start()` method in
    * `DataStreamWriter`. Methods that return a single answer, e.g. `count()` or
    * `collect()`, will throw an `AnalysisException` when there is a streaming
    * source present.
    *
    * apache/spark
    */
  def isStreaming: Boolean = dataset.isStreaming

  /** Returns a new [[TypedDataset]] that contains only the elements of this [[TypedDataset]] that are also
    * present in `other`.
    *
    * Note that, equality checking is performed directly on the encoded representation of the data
    * and thus is not affected by a custom `equals` function defined on `T`.
    *
    * apache/spark
    */
  def intersect(other: TypedDataset[T]): TypedDataset[T] =
    TypedDataset.create(dataset.intersect(other.dataset))

  /**
    * Randomly splits this [[TypedDataset]] with the provided weights.
    * Weights for splits, will be normalized if they don't sum to 1.
    *
    * Each resulting `Dataset` is wrapped through `TypedDataset.create`.
    *
    * apache/spark
    */
  // $COVERAGE-OFF$ We can not test this method because it is non-deterministic.
  def randomSplit(weights: Array[Double]): Array[TypedDataset[T]] =
    dataset.randomSplit(weights).map(TypedDataset.create[T])
  // $COVERAGE-ON$

  /**
    * Randomly splits this [[TypedDataset]] with the provided weights.
    * Weights for splits, will be normalized if they don't sum to 1.
    *
    * Each resulting `Dataset` is wrapped through `TypedDataset.create`.
    *
    * @param seed forwarded unchanged to `Dataset#randomSplit`
    *
    * apache/spark
    */
  def randomSplit(weights: Array[Double], seed: Long): Array[TypedDataset[T]] =
    dataset.randomSplit(weights, seed).map(TypedDataset.create[T])

  /**
    * Returns a Java list that contains randomly split [[TypedDataset]] with the provided weights.
    * Weights for splits, will be normalized if they don't sum to 1.
*
    * Built by calling `randomSplit(weights, seed)` and passing the resulting
    * array to `java.util.Arrays.asList`.
    *
    * apache/spark
    */
  def randomSplitAsList(weights: Array[Double], seed: Long): util.List[TypedDataset[T]] = {
    val values = randomSplit(weights, seed)
    java.util.Arrays.asList(values: _*)
  }

  /** Returns a new Dataset containing rows in this Dataset but not in another Dataset.
    * This is equivalent to `EXCEPT` in SQL.
    *
    * Note that, equality checking is performed directly on the encoded representation of the data
    * and thus is not affected by a custom `equals` function defined on `T`.
    *
    * apache/spark
    */
  def except(other: TypedDataset[T]): TypedDataset[T] =
    TypedDataset.create(dataset.except(other.dataset))

  /** Persist this [[TypedDataset]] with the default storage level (`MEMORY_AND_DISK`).
    *
    * The `Dataset` returned by `cache()` is wrapped through `TypedDataset.create`.
    *
    * apache/spark
    */
  def cache(): TypedDataset[T] =
    TypedDataset.create(dataset.cache())

  /** Persist this [[TypedDataset]] with the given storage level.
    * @param newLevel One of: `MEMORY_ONLY`, `MEMORY_AND_DISK`, `MEMORY_ONLY_SER`,
    *   `MEMORY_AND_DISK_SER`, `DISK_ONLY`, `MEMORY_ONLY_2`, `MEMORY_AND_DISK_2`, etc.
    *   Defaults to `StorageLevel.MEMORY_AND_DISK`.
    *
    * apache/spark
    */
  def persist(newLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK): TypedDataset[T] =
    TypedDataset.create(dataset.persist(newLevel))

  /** Mark the [[TypedDataset]] as non-persistent, and remove all blocks for it from memory and disk.
    * @param blocking Whether to block until all blocks are deleted. Defaults to `false`.
    *
    * apache/spark
    */
  def unpersist(blocking: Boolean = false): TypedDataset[T] =
    TypedDataset.create(dataset.unpersist(blocking))

  // $COVERAGE-OFF$ We do not test deprecated method since forwarded methods are tested.
/** Deprecated forwarder to `deserialized.map`. */
  @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0")
  def map[U: TypedEncoder](func: T => U): TypedDataset[U] =
    deserialized.map(func)

  /** Deprecated forwarder to `deserialized.mapPartitions`. */
  @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0")
  def mapPartitions[U: TypedEncoder](func: Iterator[T] => Iterator[U]): TypedDataset[U] =
    deserialized.mapPartitions(func)

  /** Deprecated forwarder to `deserialized.flatMap`. */
  @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0")
  def flatMap[U: TypedEncoder](func: T => TraversableOnce[U]): TypedDataset[U] =
    deserialized.flatMap(func)

  /** Deprecated forwarder to `deserialized.filter`. */
  @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0")
  def filter(func: T => Boolean): TypedDataset[T] =
    deserialized.filter(func)

  /** Deprecated forwarder to `deserialized.reduceOption`. */
  @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0")
  def reduceOption[F[_]: SparkDelay](func: (T, T) => T): F[Option[T]] =
    deserialized.reduceOption(func)
  // $COVERAGE-ON$

  /** Methods on `TypedDataset[T]` that go through a full serialization and
    * deserialization of `T`, and execute outside of the Catalyst runtime.
    *
    * @example The correct way to do a projection on a single column is to
    *          use the `select` method as follows:
    *
    *          {{{
    *           ds: TypedDataset[(String, String, String)] -> ds.select(ds('_2)).run()
    *          }}}
    *
    *          Spark provides an alternative way to obtain the same resulting `Dataset`,
    *          using the `map` method:
    *
    *          {{{
    *           ds: TypedDataset[(String, String, String)] -> ds.deserialized.map(_._2).run()
    *          }}}
    *
    *          This second approach is however substantially slower than the first one,
    *          and should be avoided whenever possible. Indeed, under the hood this `map` will
    *          deserialize the entire `Tuple3` to a full JVM object, call the apply
    *          method of the `_._2` closure on it, and serialize the resulting String back
    *          to its Catalyst representation.
*/
  object deserialized {
    /** Returns a new [[TypedDataset]] that contains the result of applying `func` to each element.
      *
      * The Spark encoder for `U` is passed explicitly as `TypedExpressionEncoder[U]`.
      *
      * apache/spark
      */
    def map[U: TypedEncoder](func: T => U): TypedDataset[U] =
      TypedDataset.create(self.dataset.map(func)(TypedExpressionEncoder[U]))

    /** Returns a new [[TypedDataset]] that contains the result of applying `func` to each partition.
      *
      * The Spark encoder for `U` is passed explicitly as `TypedExpressionEncoder[U]`.
      *
      * apache/spark
      */
    def mapPartitions[U: TypedEncoder](func: Iterator[T] => Iterator[U]): TypedDataset[U] =
      TypedDataset.create(self.dataset.mapPartitions(func)(TypedExpressionEncoder[U]))

    /** Returns a new [[TypedDataset]] by first applying a function to all elements of this [[TypedDataset]],
      * and then flattening the results.
      *
      * The Spark encoder for `U` is passed explicitly as `TypedExpressionEncoder[U]`.
      *
      * apache/spark
      */
    def flatMap[U: TypedEncoder](func: T => TraversableOnce[U]): TypedDataset[U] =
      TypedDataset.create(self.dataset.flatMap(func)(TypedExpressionEncoder[U]))

    /** Returns a new [[TypedDataset]] that only contains elements where `func` returns `true`.
      *
      * apache/spark
      */
    def filter(func: T => Boolean): TypedDataset[T] =
      TypedDataset.create(self.dataset.filter(func))

    /** Optionally reduces the elements of this [[TypedDataset]] using the specified binary function. The given
      * `func` must be commutative and associative or the result may be non-deterministic.
      *
      * Differs from `Dataset#reduce` by wrapping its result into an `Option` and an effect-suspending `F`.
*
      * The reduction runs inside `F.delay`, with the session of the underlying
      * `Dataset` supplied as the second argument list.
      *
      * @param func the binary function used to combine two elements
      * @param F the `SparkDelay` instance used to suspend the computation
      */
    def reduceOption[F[_]](func: (T, T) => T)(implicit F: SparkDelay[F]): F[Option[T]] =
      F.delay {
        try {
          Option(self.dataset.reduce(func))
        } catch {
          // NOTE(review): presumably what `reduce` throws on an empty Dataset -- confirm.
          case _: UnsupportedOperationException => None
        }
      }(self.dataset.sparkSession)
  }
}

================================================
FILE: dataset/src/main/scala/frameless/TypedEncoder.scala
================================================
package frameless

import java.math.BigInteger

import java.util.Date

import java.time.{ Duration, Instant, Period, LocalDate }

import java.sql.Timestamp

import scala.reflect.ClassTag

import org.apache.spark.sql.FramelessInternals
import org.apache.spark.sql.FramelessInternals.UserDefinedType
import org.apache.spark.sql.{ reflection => ScalaReflection }
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.objects._
import org.apache.spark.sql.catalyst.util.{
  ArrayBasedMapData,
  DateTimeUtils,
  GenericArrayData
}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

import shapeless._
import shapeless.ops.hlist.IsHCons

/** Describes how values of type `T` are converted to and from Catalyst.
  *
  * An instance carries the `ClassTag` of `T`, two `DataType`s (`jvmRepr` and
  * `catalystRepr`) and a pair of `Expression` transformers, one per direction.
  */
abstract class TypedEncoder[T](
  implicit
  val classTag: ClassTag[T])
    extends Serializable {
  // NOTE(review): presumably whether a value of T may be null in Catalyst -- confirm.
  def nullable: Boolean

  // NOTE(review): presumably the DataType of the JVM-side object -- confirm.
  def jvmRepr: DataType
  // NOTE(review): presumably the DataType of the Catalyst-side value -- confirm.
  def catalystRepr: DataType

  /**
   * From Catalyst representation to T
   */
  def fromCatalyst(path: Expression): Expression

  /**
   * T to Catalyst representation
   */
  def toCatalyst(path: Expression): Expression
}

// Waiting on scala 2.12
// @annotation.implicitAmbiguous(msg =
// """TypedEncoder[${T}] can be obtained from automatic type class derivation, using the implicit Injection[${T}, ?] or using the implicit UserDefinedType[${T}] in scope.
// To disambiguate this resolution you need to either:
//   - Remove the implicit Injection[${T}, ?]
//   from scope
//   - Remove the implicit UserDefinedType[${T}] from scope
//   - import TypedEncoder.usingInjection
//   - import TypedEncoder.usingDerivation
//   - import TypedEncoder.usingUserDefinedType
// """)
object TypedEncoder {

  /** Summons the implicit [[TypedEncoder]] for `T`. */
  def apply[T: TypedEncoder]: TypedEncoder[T] = implicitly[TypedEncoder[T]]

  /** Strings: written with the static `UTF8String.fromString`, read back by invoking `toString`. */
  implicit val stringEncoder: TypedEncoder[String] = new TypedEncoder[String] {
    def nullable: Boolean = false

    def jvmRepr: DataType = FramelessInternals.objectTypeFor[String]
    def catalystRepr: DataType = StringType

    def toCatalyst(path: Expression): Expression =
      StaticInvoke(classOf[UTF8String], catalystRepr, "fromString", path :: Nil)

    def fromCatalyst(path: Expression): Expression =
      Invoke(path, "toString", jvmRepr)

    override val toString = "stringEncoder"
  }

  /** Booleans: same type on both sides, both conversions return `path` unchanged. */
  implicit val booleanEncoder: TypedEncoder[Boolean] =
    new TypedEncoder[Boolean] {
      def nullable: Boolean = false

      def jvmRepr: DataType = BooleanType
      def catalystRepr: DataType = BooleanType

      def toCatalyst(path: Expression): Expression = path
      def fromCatalyst(path: Expression): Expression = path
    }

  /** Ints: same type on both sides, both conversions return `path` unchanged. */
  implicit val intEncoder: TypedEncoder[Int] = new TypedEncoder[Int] {
    def nullable: Boolean = false

    def jvmRepr: DataType = IntegerType
    def catalystRepr: DataType = IntegerType

    def toCatalyst(path: Expression): Expression = path
    def fromCatalyst(path: Expression): Expression = path

    override def toString = "intEncoder"
  }

  /** Longs: same type on both sides, both conversions return `path` unchanged. */
  implicit val longEncoder: TypedEncoder[Long] = new TypedEncoder[Long] {
    def nullable: Boolean = false

    def jvmRepr: DataType = LongType
    def catalystRepr: DataType = LongType

    def toCatalyst(path: Expression): Expression = path
    def fromCatalyst(path: Expression): Expression = path
  }

  /** Shorts: same type on both sides, both conversions return `path` unchanged. */
  implicit val shortEncoder: TypedEncoder[Short] = new TypedEncoder[Short] {
    def nullable: Boolean = false

    def jvmRepr: DataType = ShortType
    def catalystRepr: DataType = ShortType

    def toCatalyst(path: Expression): Expression = path
    def fromCatalyst(path: Expression): Expression = path
  }

  implicit val charEncoder: TypedEncoder[Char] = new
TypedEncoder[Char] {
    // tricky because while Char is primitive type, Spark doesn't support it
    // Chars are therefore stored as one-character strings through this injection.
    implicit val charAsString: Injection[java.lang.Character, String] =
      new Injection[java.lang.Character, String] {
        def apply(a: java.lang.Character): String = String.valueOf(a)

        def invert(b: String): java.lang.Character = {
          // Anything other than exactly one character fails with IllegalArgumentException.
          require(b.length == 1)
          b.charAt(0)
        }
      }

    // Injection-based encoder that performs the actual conversions.
    val underlying = usingInjection[java.lang.Character, String]

    def nullable: Boolean = false

    // this line fixes underlying encoder
    def jvmRepr: DataType =
      FramelessInternals.objectTypeFor[java.lang.Character]

    def catalystRepr: DataType = StringType

    def toCatalyst(path: Expression): Expression =
      underlying.toCatalyst(path)

    def fromCatalyst(path: Expression): Expression =
      underlying.fromCatalyst(path)
  }

  /** Bytes: same type on both sides, both conversions return `path` unchanged. */
  implicit val byteEncoder: TypedEncoder[Byte] = new TypedEncoder[Byte] {
    def nullable: Boolean = false

    def jvmRepr: DataType = ByteType
    def catalystRepr: DataType = ByteType

    def toCatalyst(path: Expression): Expression = path
    def fromCatalyst(path: Expression): Expression = path
  }

  /** Floats: same type on both sides, both conversions return `path` unchanged. */
  implicit val floatEncoder: TypedEncoder[Float] = new TypedEncoder[Float] {
    def nullable: Boolean = false

    def jvmRepr: DataType = FloatType
    def catalystRepr: DataType = FloatType

    def toCatalyst(path: Expression): Expression = path
    def fromCatalyst(path: Expression): Expression = path
  }

  /** Doubles: same type on both sides, both conversions return `path` unchanged. */
  implicit val doubleEncoder: TypedEncoder[Double] = new TypedEncoder[Double] {
    def nullable: Boolean = false

    def jvmRepr: DataType = DoubleType
    def catalystRepr: DataType = DoubleType

    def toCatalyst(path: Expression): Expression = path
    def fromCatalyst(path: Expression): Expression = path
  }

  /** Scala `BigDecimal`: stored as `DecimalType.SYSTEM_DEFAULT`, written through `Decimal.apply`. */
  implicit val bigDecimalEncoder: TypedEncoder[BigDecimal] =
    new TypedEncoder[BigDecimal] {
      def nullable: Boolean = false

      def jvmRepr: DataType = ScalaReflection.dataTypeFor[BigDecimal]
      def catalystRepr: DataType = DecimalType.SYSTEM_DEFAULT

      def toCatalyst(path: Expression): Expression =
        StaticInvoke(
          Decimal.getClass,
          DecimalType.SYSTEM_DEFAULT,
          "apply",
          path :: Nil
        )

      def
fromCatalyst(path: Expression): Expression =
        Invoke(path, "toBigDecimal", jvmRepr)

      override def toString: String = "bigDecimalEncoder"
    }

  /** `java.math.BigDecimal`: stored as `DecimalType.SYSTEM_DEFAULT`; written through `Decimal.apply`,
    * read back by invoking `toJavaBigDecimal`.
    */
  implicit val javaBigDecimalEncoder: TypedEncoder[java.math.BigDecimal] =
    new TypedEncoder[java.math.BigDecimal] {
      def nullable: Boolean = false

      def jvmRepr: DataType = ScalaReflection.dataTypeFor[java.math.BigDecimal]
      def catalystRepr: DataType = DecimalType.SYSTEM_DEFAULT

      def toCatalyst(path: Expression): Expression =
        StaticInvoke(
          Decimal.getClass,
          DecimalType.SYSTEM_DEFAULT,
          "apply",
          path :: Nil
        )

      def fromCatalyst(path: Expression): Expression =
        Invoke(path, "toJavaBigDecimal", jvmRepr)

      override def toString: String = "javaBigDecimalEncoder"
    }

  /** Scala `BigInt`: stored as a decimal of maximum precision and scale 0; written through
    * `Decimal.apply`, read back by invoking `toScalaBigInt`.
    */
  implicit val bigIntEncoder: TypedEncoder[BigInt] = new TypedEncoder[BigInt] {
    def nullable: Boolean = false

    def jvmRepr: DataType = ScalaReflection.dataTypeFor[BigInt]
    def catalystRepr: DataType = DecimalType(DecimalType.MAX_PRECISION, 0)

    def toCatalyst(path: Expression): Expression =
      StaticInvoke(
        Decimal.getClass,
        catalystRepr,
        "apply",
        path :: Nil
      )

    def fromCatalyst(path: Expression): Expression =
      Invoke(path, "toScalaBigInt", jvmRepr)

    override def toString: String = "bigIntEncoder"
  }

  /** `java.math.BigInteger`: stored as a decimal of maximum precision and scale 0; written through
    * `Decimal.apply`, read back by invoking `toJavaBigInteger`.
    */
  implicit val javaBigIntEncoder: TypedEncoder[BigInteger] =
    new TypedEncoder[BigInteger] {
      def nullable: Boolean = false

      def jvmRepr: DataType = ScalaReflection.dataTypeFor[BigInteger]
      def catalystRepr: DataType = DecimalType(DecimalType.MAX_PRECISION, 0)

      def toCatalyst(path: Expression): Expression =
        StaticInvoke(
          Decimal.getClass,
          catalystRepr,
          "apply",
          path :: Nil
        )

      def fromCatalyst(path: Expression): Expression =
        Invoke(path, "toJavaBigInteger", jvmRepr)

      override def toString: String = "javaBigIntEncoder"
    }

  /** [[SQLDate]]: stored as `DateType`; the Catalyst value is obtained by invoking `days`. */
  implicit val sqlDate: TypedEncoder[SQLDate] = new TypedEncoder[SQLDate] {
    def nullable: Boolean = false

    def jvmRepr: DataType = ScalaReflection.dataTypeFor[SQLDate]
    def catalystRepr: DataType = DateType

    def toCatalyst(path: Expression): Expression =
      Invoke(path, "days", DateType)

    def
fromCatalyst(path: Expression): Expression =
      StaticInvoke(
        staticObject = SQLDate.getClass,
        dataType = jvmRepr,
        functionName = "apply",
        arguments = path :: Nil,
        propagateNull = true
      )
  }

  /** `java.sql.Timestamp`: stored as `TimestampType`, converted with
    * `DateTimeUtils.fromJavaTimestamp` / `DateTimeUtils.toJavaTimestamp`.
    */
  implicit val timestampEncoder: TypedEncoder[Timestamp] =
    new TypedEncoder[Timestamp] {
      def nullable: Boolean = false

      def jvmRepr: DataType = ScalaReflection.dataTypeFor[Timestamp]
      def catalystRepr: DataType = TimestampType

      def toCatalyst(path: Expression): Expression =
        StaticInvoke(
          DateTimeUtils.getClass,
          TimestampType,
          "fromJavaTimestamp",
          path :: Nil,
          returnNullable = false
        )

      def fromCatalyst(path: Expression): Expression =
        StaticInvoke(
          staticObject = DateTimeUtils.getClass,
          dataType = jvmRepr,
          functionName = "toJavaTimestamp",
          arguments = path :: Nil,
          propagateNull = true
        )

      override def toString: String = "timestampEncoder"
    }

  /** `java.util.Date`: stored as `TimestampType` by going through `Instant` and delegating to
    * the `timeInstant` encoder declared in this object (`toInstant` on write, `Date.from` on read).
    */
  implicit val dateEncoder: TypedEncoder[Date] =
    new TypedEncoder[Date] {
      def nullable: Boolean = false

      def jvmRepr: DataType = ScalaReflection.dataTypeFor[Date]
      def catalystRepr: DataType = TimestampType

      // Result type of the intermediate `toInstant` invocation.
      private val instantRepr = ScalaReflection.dataTypeFor[Instant]

      def toCatalyst(path: Expression): Expression =
        timeInstant.toCatalyst(Invoke(path, "toInstant", instantRepr))

      def fromCatalyst(path: Expression): Expression =
        StaticInvoke(
          staticObject = classOf[Date],
          dataType = jvmRepr,
          functionName = "from",
          arguments = timeInstant.fromCatalyst(path) :: Nil,
          propagateNull = true
        )

      override def toString: String = "dateEncoder"
    }

  /** `java.sql.Date`: stored as `DateType`; written with `DateTimeUtils.fromJavaDate`. */
  implicit val sqlDateEncoder: TypedEncoder[java.sql.Date] =
    new TypedEncoder[java.sql.Date] {
      def nullable: Boolean = false

      def jvmRepr: DataType = ScalaReflection.dataTypeFor[java.sql.Date]
      def catalystRepr: DataType = DateType

      def toCatalyst(path: Expression): Expression =
        StaticInvoke(
          staticObject = DateTimeUtils.getClass,
          dataType = catalystRepr,
          functionName = "fromJavaDate",
          arguments = path :: Nil,
          propagateNull = true
        )

      // Result type of the intermediate `daysToLocalDate` call used when reading.
      private val localDateRepr = ScalaReflection.dataTypeFor[LocalDate]

      def fromCatalyst(path: Expression):
Expression = {
        // Two steps: days -> LocalDate (DateTimeUtils.daysToLocalDate), then
        // LocalDate -> java.sql.Date (java.sql.Date.valueOf).
        val toLocalDate = StaticInvoke(
          staticObject = DateTimeUtils.getClass,
          dataType = localDateRepr,
          functionName = "daysToLocalDate",
          arguments = path :: Nil,
          propagateNull = true
        )

        StaticInvoke(
          staticObject = classOf[java.sql.Date],
          dataType = jvmRepr,
          functionName = "valueOf",
          arguments = toLocalDate :: Nil,
          propagateNull = true
        )
      }

      override def toString: String = "sqlDateEncoder"
    }

  /** [[SQLTimestamp]]: stored as `TimestampType`; the Catalyst value is obtained by invoking `us`,
    * the JVM value is rebuilt through `SQLTimestamp.apply`.
    */
  implicit val sqlTimestamp: TypedEncoder[SQLTimestamp] =
    new TypedEncoder[SQLTimestamp] {
      def nullable: Boolean = false

      def jvmRepr: DataType = ScalaReflection.dataTypeFor[SQLTimestamp]
      def catalystRepr: DataType = TimestampType

      def toCatalyst(path: Expression): Expression =
        Invoke(path, "us", TimestampType)

      def fromCatalyst(path: Expression): Expression =
        StaticInvoke(
          staticObject = SQLTimestamp.getClass,
          dataType = jvmRepr,
          functionName = "apply",
          arguments = path :: Nil,
          propagateNull = true
        )
    }

  /** java.time Encoders, Spark uses https://github.com/apache/spark/blob/v3.2.0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala for encoding / decoding. */
  // `Instant` is stored as `TimestampType` via `instantToMicros` / `microsToInstant`.
  implicit val timeInstant: TypedEncoder[Instant] = new TypedEncoder[Instant] {
    def nullable: Boolean = false

    def jvmRepr: DataType = ScalaReflection.dataTypeFor[Instant]
    def catalystRepr: DataType = TimestampType

    def toCatalyst(path: Expression): Expression =
      StaticInvoke(
        DateTimeUtils.getClass,
        TimestampType,
        "instantToMicros",
        path :: Nil,
        returnNullable = false
      )

    def fromCatalyst(path: Expression): Expression =
      StaticInvoke(
        staticObject = DateTimeUtils.getClass,
        dataType = jvmRepr,
        functionName = "microsToInstant",
        arguments = path :: Nil,
        propagateNull = true
      )
  }

  /**
   * DayTimeIntervalType and YearMonthIntervalType in Spark 3.2.0.
* We maintain Spark 3.x cross compilation and handle Duration and Period as injections to be compatible with Spark versions < 3.2
   * See
   *  * https://github.com/apache/spark/blob/v3.2.0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala#L1031-L1047
   *  * https://github.com/apache/spark/blob/v3.2.0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala#L1075-L1087
   */
  // DayTimeIntervalType
  // NOTE(review): `toMillis` keeps millisecond precision only, so any finer part of a
  // Duration looks like it is dropped on a round-trip — confirm this is intended.
  implicit val timeDurationInjection: Injection[Duration, Long] =
    Injection(_.toMillis, Duration.ofMillis)

  // YearMonthIntervalType
  // NOTE(review): `getDays` reads only the days component of a Period, so the years and
  // months components look like they are dropped on a round-trip (and the value stored is
  // a day count despite the label above) — confirm this is intended.
  implicit val timePeriodInjection: Injection[Period, Int] =
    Injection(_.getDays, Period.ofDays)

  /** `Period` encoder obtained from `timePeriodInjection`. */
  implicit val timePeriodEncoder: TypedEncoder[Period] =
    TypedEncoder.usingInjection

  /** `Duration` encoder obtained from `timeDurationInjection`. */
  implicit val timeDurationEncoder: TypedEncoder[Duration] =
    TypedEncoder.usingInjection

  /** Encoder for `Array[T]`, driven by the element's `RecordFieldEncoder`.
    *
    *  - `Byte` elements: the array is stored as `BinaryType` and passed through unchanged;
    *  - `Int`, `Long`, `Double`, `Float`, `Short`, `Boolean` elements: written with
    *    `UnsafeArrayData.fromPrimitiveArray`, read with the matching `toXxxArray` method;
    *  - anything else: converted element by element with `MapObjects`.
    */
  implicit def arrayEncoder[T: ClassTag](
      implicit
      i0: Lazy[RecordFieldEncoder[T]]
  ): TypedEncoder[Array[T]] =
    new TypedEncoder[Array[T]] {
      // Element encoder, resolved lazily from the (lazy) implicit.
      private lazy val encodeT = i0.value.encoder

      def nullable: Boolean = false

      lazy val jvmRepr: DataType = i0.value.jvmRepr match {
        case ByteType => BinaryType
        case _        => FramelessInternals.objectTypeFor[Array[T]]
      }

      lazy val catalystRepr: DataType = i0.value.jvmRepr match {
        case ByteType => BinaryType
        case _        => ArrayType(encodeT.catalystRepr, encodeT.nullable)
      }

      def toCatalyst(path: Expression): Expression = {
        val enc = i0.value

        enc.jvmRepr match {
          case IntegerType | LongType | DoubleType | FloatType | ShortType |
              BooleanType =>
            StaticInvoke(
              classOf[UnsafeArrayData],
              catalystRepr,
              "fromPrimitiveArray",
              path :: Nil
            )

          case ByteType => path

          case _ =>
            MapObjects(enc.toCatalyst, path, enc.jvmRepr, encodeT.nullable)
        }
      }

      def fromCatalyst(path: Expression): Expression =
        encodeT.jvmRepr match {
          case IntegerType => Invoke(path, "toIntArray", jvmRepr)
          case LongType    => Invoke(path, "toLongArray", jvmRepr)
          case DoubleType  => Invoke(path, "toDoubleArray", jvmRepr)
          case FloatType   => Invoke(path, "toFloatArray",
jvmRepr)
          case ShortType   => Invoke(path, "toShortArray", jvmRepr)
          case BooleanType => Invoke(path, "toBooleanArray", jvmRepr)

          case ByteType => path

          case _ =>
            // Non-primitive elements: decode each element, then take the backing `array`.
            Invoke(
              MapObjects(
                i0.value.fromCatalyst,
                path,
                encodeT.catalystRepr,
                encodeT.nullable
              ),
              "array",
              jvmRepr
            )
        }

      override def toString: String = s"arrayEncoder($jvmRepr)"
    }

  /** Encoder for any `Seq` subtype `C[T]`, stored as a Catalyst `ArrayType` of the element type.
    *
    * Elements with a native JVM representation are wrapped directly in a `GenericArrayData`;
    * other elements are converted one by one with `MapObjects`.
    */
  implicit def collectionEncoder[C[X] <: Seq[X], T](
      implicit
      i0: Lazy[RecordFieldEncoder[T]],
      i1: ClassTag[C[T]]
  ): TypedEncoder[C[T]] = new TypedEncoder[C[T]] {
    // Element encoder, resolved lazily from the (lazy) implicit.
    private lazy val encodeT = i0.value.encoder

    def nullable: Boolean = false

    def jvmRepr: DataType = FramelessInternals.objectTypeFor[C[T]](i1)

    def catalystRepr: DataType =
      ArrayType(encodeT.catalystRepr, encodeT.nullable)

    def toCatalyst(path: Expression): Expression = {
      val enc = i0.value

      if (ScalaReflection.isNativeType(enc.jvmRepr)) {
        NewInstance(classOf[GenericArrayData], path :: Nil, catalystRepr)
      } else {
        MapObjects(enc.toCatalyst, path, enc.jvmRepr, encodeT.nullable)
      }
    }

    def fromCatalyst(path: Expression): Expression =
      MapObjects(
        i0.value.fromCatalyst,
        path,
        encodeT.catalystRepr,
        encodeT.nullable,
        Some(i1.runtimeClass) // This will cause MapObjects to build a collection of type C[_] directly
      )

    override def toString: String = s"collectionEncoder($jvmRepr)"
  }

  /**
   * @param i1 implicit lazy `RecordFieldEncoder[T]` to encode individual elements of the set.
   * @param i2 implicit `ClassTag[Set[T]]` to provide runtime information about the set type.
   * @tparam T the element type of the set.
   * @return a `TypedEncoder` instance for `Set[T]`.
*/
  implicit def setEncoder[T](
      implicit
      i1: shapeless.Lazy[RecordFieldEncoder[T]],
      i2: ClassTag[Set[T]]
  ): TypedEncoder[Set[T]] = {
    // A Set is encoded as a Seq (`toSeq` on write, `toSet` on read) through `usingInjection`.
    implicit val inj: Injection[Set[T], Seq[T]] = Injection(_.toSeq, _.toSet)

    TypedEncoder.usingInjection
  }

  /**
   * Encoder for `Map[A, B]`, stored as a Catalyst `MapType`.
   *
   * @tparam A the key type
   * @tparam B the value type
   * @param i0 the keys encoder
   * @param i1 the values encoder
   */
  implicit def mapEncoder[A: NotCatalystNullable, B](
      implicit
      i0: Lazy[RecordFieldEncoder[A]],
      i1: Lazy[RecordFieldEncoder[B]]
  ): TypedEncoder[Map[A, B]] = new TypedEncoder[Map[A, B]] {
    def nullable: Boolean = false

    def jvmRepr: DataType = FramelessInternals.objectTypeFor[Map[A, B]]

    // Key / value encoders, resolved lazily from the (lazy) implicits.
    private lazy val encodeA = i0.value.encoder
    private lazy val encodeB = i1.value.encoder

    lazy val catalystRepr: DataType =
      MapType(encodeA.catalystRepr, encodeB.catalystRepr, encodeB.nullable)

    def fromCatalyst(path: Expression): Expression = {
      // Keys and values are decoded separately from `keyArray` / `valueArray`,
      // then recombined with `ArrayBasedMapData.toScalaMap`.
      val keyArrayType = ArrayType(encodeA.catalystRepr, containsNull = false)

      val keyData = Invoke(
        MapObjects(
          i0.value.fromCatalyst,
          Invoke(path, "keyArray", keyArrayType),
          encodeA.catalystRepr
        ),
        "array",
        FramelessInternals.objectTypeFor[Array[Any]]
      )

      val valueArrayType = ArrayType(encodeB.catalystRepr, encodeB.nullable)

      val valueData = Invoke(
        MapObjects(
          i1.value.fromCatalyst,
          Invoke(path, "valueArray", valueArrayType),
          encodeB.catalystRepr
        ),
        "array",
        FramelessInternals.objectTypeFor[Array[Any]]
      )

      StaticInvoke(
        ArrayBasedMapData.getClass,
        jvmRepr,
        "toScalaMap",
        keyData :: valueData :: Nil
      )
    }

    def toCatalyst(path: Expression): Expression = {
      val encA = i0.value
      val encB = i1.value

      // The literal `false` is passed in the key position, `encodeB.nullable` in the value
      // position — presumably the respective "nullable" flags; verify against
      // ExternalMapToCatalyst's signature.
      ExternalMapToCatalyst(
        path,
        encA.jvmRepr,
        encA.toCatalyst,
        false,
        encB.jvmRepr,
        encB.toCatalyst,
        encodeB.nullable
      )
    }

    override def toString = s"mapEncoder($jvmRepr)"
  }

  /** Encoder for `Option[A]`: always nullable, sharing the Catalyst type of the underlying encoder. */
  implicit def optionEncoder[A](
      implicit
      underlying: TypedEncoder[A]
  ): TypedEncoder[Option[A]] =
    new TypedEncoder[Option[A]] {
      def nullable: Boolean = true

      def jvmRepr: DataType =
        FramelessInternals.objectTypeFor[Option[A]](classTag)

      def catalystRepr:
DataType = underlying.catalystRepr

      def toCatalyst(path: Expression): Expression = {
        // for primitive types we must manually unbox the value of the object
        underlying.jvmRepr match {
          case IntegerType =>
            Invoke(
              UnwrapOption(
                ScalaReflection.dataTypeFor[java.lang.Integer],
                path
              ),
              "intValue",
              IntegerType
            )

          case LongType =>
            Invoke(
              UnwrapOption(ScalaReflection.dataTypeFor[java.lang.Long], path),
              "longValue",
              LongType
            )

          case DoubleType =>
            Invoke(
              UnwrapOption(ScalaReflection.dataTypeFor[java.lang.Double], path),
              "doubleValue",
              DoubleType
            )

          case FloatType =>
            Invoke(
              UnwrapOption(ScalaReflection.dataTypeFor[java.lang.Float], path),
              "floatValue",
              FloatType
            )

          case ShortType =>
            Invoke(
              UnwrapOption(ScalaReflection.dataTypeFor[java.lang.Short], path),
              "shortValue",
              ShortType
            )

          case ByteType =>
            Invoke(
              UnwrapOption(ScalaReflection.dataTypeFor[java.lang.Byte], path),
              "byteValue",
              ByteType
            )

          case BooleanType =>
            Invoke(
              UnwrapOption(
                ScalaReflection.dataTypeFor[java.lang.Boolean],
                path
              ),
              "booleanValue",
              BooleanType
            )

          // Everything else: unwrap the Option, then delegate to the underlying encoder.
          case _ =>
            underlying.toCatalyst(UnwrapOption(underlying.jvmRepr, path))
        }
      }

      def fromCatalyst(path: Expression): Expression =
        WrapOption(underlying.fromCatalyst(path), underlying.jvmRepr)
    }

  /** Encodes things using injection if there is one defined */
  // `A` is converted to `B` with `inj.apply` before `B`'s encoder writes it, and rebuilt with
  // `inj.invert` after `B`'s encoder reads it. Nullability and Catalyst type are those of `B`.
  implicit def usingInjection[A: ClassTag, B](
      implicit
      inj: Injection[A, B],
      trb: TypedEncoder[B]
  ): TypedEncoder[A] =
    new TypedEncoder[A] {
      def nullable: Boolean = trb.nullable

      def jvmRepr: DataType = FramelessInternals.objectTypeFor[A](classTag)
      def catalystRepr: DataType = trb.catalystRepr

      def fromCatalyst(path: Expression): Expression = {
        val bexpr = trb.fromCatalyst(path)

        Invoke(Literal.fromObject(inj), "invert", jvmRepr, Seq(bexpr))
      }

      def toCatalyst(path: Expression): Expression =
        trb.toCatalyst(
          Invoke(Literal.fromObject(inj), "apply", trb.jvmRepr, Seq(path))
        )
    }

  /** Encodes things as records if there is no Injection defined */
  implicit def usingDerivation[F, G <: HList, H <: HList](
      implicit
      i0:
LabelledGeneric.Aux[F, G],
      i1: DropUnitValues.Aux[G, H],
      i2: IsHCons[H],
      i3: Lazy[RecordEncoderFields[H]],
      i4: Lazy[NewInstanceExprs[G]],
      i5: ClassTag[F]
  ): TypedEncoder[F] = new RecordEncoder[F, G, H]

  /** Encodes things using a Spark SQL's User Defined Type (UDT) if there is one defined in implicit */
  implicit def usingUserDefinedType[
      A >: Null: UserDefinedType: ClassTag
  ]: TypedEncoder[A] = {
    val udt = implicitly[UserDefinedType[A]]

    // Expression that instantiates the UDT class (no constructor arguments); the
    // `serialize` / `deserialize` calls below are invoked on it.
    val udtInstance =
      NewInstance(udt.getClass, Nil, dataType = ObjectType(udt.getClass))

    new TypedEncoder[A] {
      def nullable: Boolean = false
      def jvmRepr: DataType = ObjectType(udt.userClass)
      def catalystRepr: DataType = udt

      def toCatalyst(path: Expression): Expression =
        Invoke(udtInstance, "serialize", udt, Seq(path))

      def fromCatalyst(path: Expression): Expression =
        Invoke(udtInstance, "deserialize", ObjectType(udt.userClass), Seq(path))
    }
  }

  object injections extends InjectionEnum
}

================================================
FILE: dataset/src/main/scala/frameless/TypedExpressionEncoder.scala
================================================
package frameless

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.analysis.GetColumnByOrdinal
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.{BoundReference, CreateNamedStruct, If}
import org.apache.spark.sql.types.StructType

object TypedExpressionEncoder {

  /** In Spark, DataFrame has always schema of StructType
    *
    * DataFrames of primitive types become records
    * with a single field called "value" set in ExpressionEncoder.
*/
  def targetStructType[A](encoder: TypedEncoder[A]): StructType =
    encoder.catalystRepr match {
      // A nullable struct encoder has every field marked nullable.
      case struct: StructType if encoder.nullable =>
        StructType(struct.fields.map(field => field.copy(nullable = true)))

      case struct: StructType =>
        struct

      // Anything that is not a struct is wrapped in a single-field record named "value".
      case other =>
        new StructType().add("value", other, nullable = encoder.nullable)
    }

  /** Builds a Spark [[Encoder]] from a [[TypedEncoder]].
    *
    * The serializer is `encoder.toCatalyst` applied to a `BoundReference` at ordinal 0; the
    * deserializer is `encoder.fromCatalyst` applied to `GetColumnByOrdinal(0, ...)`.
    */
  def apply[T](implicit encoder: TypedEncoder[T]): Encoder[T] = {
    val boundInput = BoundReference(0, encoder.jvmRepr, encoder.nullable)

    // Both arms return the matched expression unchanged; the first one only singles out
    // the `If(_, _, CreateNamedStruct)` shape.
    val serializer = encoder.toCatalyst(boundInput) match {
      case structExpr @ If(_, _, _: CreateNamedStruct) => structExpr
      case other                                       => other
    }

    val column = GetColumnByOrdinal(0, encoder.catalystRepr)

    new ExpressionEncoder[T](
      objSerializer = serializer,
      objDeserializer = encoder.fromCatalyst(column),
      clsTag = encoder.classTag
    )
  }
}

================================================
FILE: dataset/src/main/scala/frameless/With.scala
================================================
package frameless

/** Compute the intersection of two types:
  *
  * - With[A, A] = A
  * - With[A, B] = A with B (when A != B)
  *
  * This type function is needed to prevent IDEs from inferring large types
  * with shape `A with A with ... with A`. These types could be confusing for
  * both end users and IDE's type checkers.
*/
trait With[A, B] { type Out }

object With extends LowPrioWith {
  // General case: the result type is `A with B`.
  implicit def combine[A, B]: Aux[A, B, A with B] = of[A, B, A with B]
}

private[frameless] sealed trait LowPrioWith {
  type Aux[A, B, W] = With[A, B] { type Out = W }

  // Single shared instance; `With` has no members besides the `Out` type, so the same
  // object is cast to whichever refinement is requested.
  protected[this] val theInstance = new With[Any, Any] {}

  protected[this] def of[A, B, W]: With[A, B] { type Out = W } = theInstance.asInstanceOf[Aux[A, B, W]]

  // Same type on both sides: the result type is that type itself.
  implicit def identity[T]: Aux[T, T, T] = of[T, T, T]
}

================================================
FILE: dataset/src/main/scala/frameless/functions/AggregateFunctions.scala
================================================
package frameless
package functions

import org.apache.spark.sql.FramelessInternals.expr
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.{functions => sparkFunctions}
import frameless.syntax._

import scala.annotation.nowarn

trait AggregateFunctions {
  /** Aggregate function: returns the number of items in a group.
    *
    * apache/spark
    */
  def count[T](): TypedAggregate[T, Long] =
    // Counts a literal 1 per row.
    sparkFunctions.count(sparkFunctions.lit(1)).typedAggregate

  /** Aggregate function: returns the number of items in a group for which the selected column is not null.
    *
    * apache/spark
    */
  def count[T](column: TypedColumn[T, _]): TypedAggregate[T, Long] =
    sparkFunctions.count(column.untyped).typedAggregate

  /** Aggregate function: returns the number of distinct items in a group.
    *
    * apache/spark
    */
  def countDistinct[T](column: TypedColumn[T, _]): TypedAggregate[T, Long] =
    sparkFunctions.countDistinct(column.untyped).typedAggregate

  /** Aggregate function: returns the approximate number of distinct items in a group.
    */
  def approxCountDistinct[T](column: TypedColumn[T, _]): TypedAggregate[T, Long] =
    sparkFunctions.approx_count_distinct(column.untyped).typedAggregate

  /** Aggregate function: returns the approximate number of distinct items in a group.
*
    * @param rsd maximum estimation error allowed (default = 0.05)
    *
    * apache/spark
    */
  def approxCountDistinct[T](column: TypedColumn[T, _], rsd: Double): TypedAggregate[T, Long] =
    sparkFunctions.approx_count_distinct(column.untyped, rsd).typedAggregate

  /** Aggregate function: returns a list of objects with duplicates.
    *
    * apache/spark
    */
  def collectList[T, A: TypedEncoder](column: TypedColumn[T, A]): TypedAggregate[T, Vector[A]] =
    sparkFunctions.collect_list(column.untyped).typedAggregate

  /** Aggregate function: returns a set of objects with duplicate elements eliminated.
    *
    * The result is typed as a `Vector[A]`.
    *
    * apache/spark
    */
  def collectSet[T, A: TypedEncoder](column: TypedColumn[T, A]): TypedAggregate[T, Vector[A]] =
    sparkFunctions.collect_set(column.untyped).typedAggregate

  /** Aggregate function: returns the sum of all values in the given column.
    *
    * The Spark sum is coalesced with `summable.zero`, so a null sum yields zero instead.
    *
    * apache/spark
    */
  def sum[A, T, Out](column: TypedColumn[T, A])(
    implicit
    summable: CatalystSummable[A, Out],
    oencoder: TypedEncoder[Out],
    aencoder: TypedEncoder[A]
  ): TypedAggregate[T, Out] = {
    val zeroExpr = Literal.create(summable.zero, TypedEncoder[A].catalystRepr)
    val sumExpr = expr(sparkFunctions.sum(column.untyped))
    val sumOrZero = Coalesce(Seq(sumExpr, zeroExpr))

    new TypedAggregate[T, Out](sumOrZero)
  }

  /** Aggregate function: returns the sum of distinct values in the column.
    *
    * The Spark sum is coalesced with `summable.zero`, so a null sum yields zero instead.
    *
    * apache/spark
    */
  @nowarn // suppress sparkFunctions.sumDistinct call which is used to maintain Spark 3.1.x backwards compat
  def sumDistinct[A, T, Out](column: TypedColumn[T, A])(
    implicit
    summable: CatalystSummable[A, Out],
    oencoder: TypedEncoder[Out],
    aencoder: TypedEncoder[A]
  ): TypedAggregate[T, Out] = {
    val zeroExpr = Literal.create(summable.zero, TypedEncoder[A].catalystRepr)
    val sumExpr = expr(sparkFunctions.sumDistinct(column.untyped))
    val sumOrZero = Coalesce(Seq(sumExpr, zeroExpr))

    new TypedAggregate[T, Out](sumOrZero)
  }

  /** Aggregate function: returns the average of the values in a group.
*
    * apache/spark
    */
  def avg[A, T, Out](column: TypedColumn[T, A])(
    implicit averageable: CatalystAverageable[A, Out],
    oencoder: TypedEncoder[Out]
  ): TypedAggregate[T, Out] = {
    val averaged = sparkFunctions.avg(column.untyped)
    new TypedAggregate[T, Out](averaged)
  }

  /** Aggregate function: computes the unbiased variance of the values in a group.
    *
    * @note In Spark variance always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#186]]
    *
    * apache/spark
    */
  def variance[A: CatalystVariance, T](column: TypedColumn[T, A]): TypedAggregate[T, Double] = {
    val varianceColumn = sparkFunctions.variance(column.untyped)
    varianceColumn.typedAggregate
  }

  /** Aggregate function: computes the sample standard deviation.
    *
    * @note In Spark stddev always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#155]]
    *
    * apache/spark
    */
  def stddev[A: CatalystVariance, T](column: TypedColumn[T, A]): TypedAggregate[T, Double] = {
    val stddevColumn = sparkFunctions.stddev(column.untyped)
    stddevColumn.typedAggregate
  }

  /**
    * Aggregate function: computes the population standard deviation of a column.
    * The column is cast to Double before Spark's `stddev_pop` is applied.
    *
    * @note In Spark stddev always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#L143]]
    *
    * apache/spark
    */
  def stddevPop[A, T](column: TypedColumn[T, A])(implicit ev: CatalystCast[A, Double]): TypedAggregate[T, Option[Double]] = {
    val asDouble = column.cast[Double].untyped
    new TypedAggregate[T, Option[Double]](sparkFunctions.stddev_pop(asDouble))
  }

  /**
    * Aggregate function: returns the standard deviation of a column by sample.
*
    * @note In Spark stddev always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#L160]]
    *
    * apache/spark
    */
  def stddevSamp[A, T](column: TypedColumn[T, A])(implicit ev: CatalystCast[A, Double]): TypedAggregate[T, Option[Double]] = {
    val asDouble = column.cast[Double].untyped
    new TypedAggregate[T, Option[Double]](sparkFunctions.stddev_samp(asDouble))
  }

  /** Aggregate function: the maximum value of the column in a group.
    * The result reuses the column's own encoder.
    *
    * apache/spark
    */
  def max[A: CatalystOrdered, T](column: TypedColumn[T, A]): TypedAggregate[T, A] =
    sparkFunctions.max(column.untyped).typedAggregate(column.uencoder)

  /** Aggregate function: the minimum value of the column in a group.
    * The result reuses the column's own encoder.
    *
    * apache/spark
    */
  def min[A: CatalystOrdered, T](column: TypedColumn[T, A]): TypedAggregate[T, A] =
    sparkFunctions.min(column.untyped).typedAggregate(column.uencoder)

  /** Aggregate function: the first value in a group.
    *
    * The function by default returns the first values it sees. It will return the first non-null
    * value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
    *
    * apache/spark
    */
  def first[A, T](column: TypedColumn[T, A]): TypedAggregate[T, A] =
    sparkFunctions.first(column.untyped).typedAggregate(column.uencoder)

  /**
    * Aggregate function: the last value in a group.
    *
    * The function by default returns the last values it sees. It will return the last non-null
    * value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
    *
    * apache/spark
    */
  def last[A, T](column: TypedColumn[T, A]): TypedAggregate[T, A] =
    sparkFunctions.last(column.untyped).typedAggregate(column.uencoder)

  /**
    * Aggregate function: returns the Pearson Correlation Coefficient for two columns.
*
    * @note In Spark corr always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Corr.scala#L95]]
    *
    * apache/spark
    */
  def corr[A, B, T](column1: TypedColumn[T, A], column2: TypedColumn[T, B])(
    implicit i0: CatalystCast[A, Double],
    i1: CatalystCast[B, Double]
  ): TypedAggregate[T, Option[Double]] = {
    // Both inputs are cast to Double before being handed to Spark.
    val left = column1.cast[Double].untyped
    val right = column2.cast[Double].untyped
    new TypedAggregate[T, Option[Double]](sparkFunctions.corr(left, right))
  }

  /**
    * Aggregate function: computes the population covariance of two columns
    * (Spark's `covar_pop`), after casting both to Double.
    *
    * @note In Spark covar_pop always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Covariance.scala#L82]]
    *
    * apache/spark
    */
  def covarPop[A, B, T](column1: TypedColumn[T, A], column2: TypedColumn[T, B])(
    implicit i0: CatalystCast[A, Double],
    i1: CatalystCast[B, Double]
  ): TypedAggregate[T, Option[Double]] = {
    val left = column1.cast[Double].untyped
    val right = column2.cast[Double].untyped
    new TypedAggregate[T, Option[Double]](sparkFunctions.covar_pop(left, right))
  }

  /**
    * Aggregate function: computes the sample covariance of two columns
    * (Spark's `covar_samp`), after casting both to Double.
    *
    * @note In Spark covar_samp always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Covariance.scala#L93]]
    *
    * apache/spark
    */
  def covarSamp[A, B, T](column1: TypedColumn[T, A], column2: TypedColumn[T, B])(
    implicit i0: CatalystCast[A, Double],
    i1: CatalystCast[B, Double]
  ): TypedAggregate[T, Option[Double]] = {
    val left = column1.cast[Double].untyped
    val right = column2.cast[Double].untyped
    new TypedAggregate[T, Option[Double]](sparkFunctions.covar_samp(left, right))
  }

  /**
    * Aggregate function: returns the kurtosis of a column.
*
    * @note In Spark kurtosis always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#L220]]
    *
    * apache/spark
    */
  def kurtosis[A, T](column: TypedColumn[T, A])(implicit ev: CatalystCast[A, Double]): TypedAggregate[T, Option[Double]] = {
    // The column is cast to Double before Spark's kurtosis is applied.
    new TypedAggregate[T, Option[Double]](
      sparkFunctions.kurtosis(column.cast[Double].untyped)
    )
  }

  /**
    * Aggregate function: returns the skewness of a column.
    *
    * @note In Spark skewness always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#L200]]
    *
    * apache/spark
    */
  def skewness[A, T](column: TypedColumn[T, A])(implicit ev: CatalystCast[A, Double]): TypedAggregate[T, Option[Double]] = {
    // The column is cast to Double before Spark's skewness is applied.
    new TypedAggregate[T, Option[Double]](
      sparkFunctions.skewness(column.cast[Double].untyped)
    )
  }
}

================================================
FILE: dataset/src/main/scala/frameless/functions/Lit.scala
================================================
package frameless.functions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.catalyst.expressions.{Expression, NonSQLExpression}
import org.apache.spark.sql.types.DataType

/** Leaf expression wrapping `catalystExpr`.
  *
  * Code generation is delegated to `catalystExpr`; interpreted evaluation runs a class
  * compiled from that generated code (see `codegen`).
  */
private[frameless] case class Lit[T <: AnyVal](
    dataType: DataType,
    nullable: Boolean,
    show: () => String,
    catalystExpr: Expression // must be a generated Expression from a literal TypedEncoder's toCatalyst function
) extends Expression with NonSQLExpression {
  override def toString: String = s"FramelessLit(${show()})"

  // Compiled lazily, on first use by `eval`.
  lazy val codegen = {
    val ctx = new CodegenContext()
    val eval = genCode(ctx)

    // The generated Java source below is kept exactly as found, including its whitespace.
    val codeBody = s""" public scala.Function1 generate(Object[] references) { return new LiteralEvalImpl(references); } class LiteralEvalImpl extends
scala.runtime.AbstractFunction1 { private final Object[] references; ${ctx.declareMutableStates()} ${ctx.declareAddedFunctions()} public LiteralEvalImpl(Object[] references) { this.references = references; ${ctx.initMutableStates()} } public java.lang.Object apply(java.lang.Object z) { InternalRow ${ctx.INPUT_ROW} = (InternalRow) z; ${eval.code} return ${eval.isNull} ? ((Object)null) : ((Object)${eval.value}); } } """

    val code = CodeFormatter.stripOverlappingComments(
      new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())
    )

    val (clazz, _) = CodeGenerator.compile(code)
    val codegen = clazz.generate(ctx.references.toArray).asInstanceOf[InternalRow => AnyRef]

    codegen
  }

  // Interpreted evaluation goes through the compiled function above.
  def eval(input: InternalRow): Any = codegen(input)

  def children: Seq[Expression] = Nil

  // Code generation is delegated to the wrapped expression; `ev` is not used.
  protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = catalystExpr.genCode(ctx)

  // No children, so there is nothing to replace.
  protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = this

  override val foldable: Boolean = catalystExpr.foldable
}

================================================
FILE: dataset/src/main/scala/frameless/functions/NonAggregateFunctions.scala
================================================
package frameless
package functions

import org.apache.spark.sql.{Column, functions => sparkFunctions}

import scala.annotation.nowarn
import scala.util.matching.Regex

trait NonAggregateFunctions {
  /** Non-Aggregate function: calculates the SHA-2 family of hash functions of a binary column and
    * returns the value as a hex string. `numBits` selects the digest size and is passed to Spark
    * unchanged.
    *
    * apache/spark
    */
  def sha2[T](column: AbstractTypedColumn[T, Array[Byte]], numBits: Int): column.ThisType[T, String] =
    column.typed(sparkFunctions.sha2(column.untyped, numBits))

  /** Non-Aggregate function: calculates the SHA-1 digest of a binary column and returns the value as a 40 character hex string
    *
    * apache/spark
    */
  def sha1[T](column: AbstractTypedColumn[T, Array[Byte]]): column.ThisType[T, String] =
    column.typed(sparkFunctions.sha1(column.untyped))

  /** Non-Aggregate
function: returns a cyclic redundancy check value of a binary column as long.
    *
    * apache/spark
    */
  def crc32[T](column: AbstractTypedColumn[T, Array[Byte]]): column.ThisType[T, Long] = {
    val checksum = sparkFunctions.crc32(column.untyped)
    column.typed(checksum)
  }

  /** Non-Aggregate function: returns the negated value of column.
    *
    * apache/spark
    */
  def negate[A, B, T](column: AbstractTypedColumn[T, A])(
    implicit
    i0: CatalystNumericWithJavaBigDecimal[A, B],
    i1: TypedEncoder[B]
  ): column.ThisType[T, B] = {
    val negated = sparkFunctions.negate(column.untyped)
    column.typed(negated)
  }

  /** Non-Aggregate function: logical not.
    *
    * apache/spark
    */
  def not[T](column: AbstractTypedColumn[T, Boolean]): column.ThisType[T, Boolean] = {
    val inverted = sparkFunctions.not(column.untyped)
    column.typed(inverted)
  }

  /** Non-Aggregate function: Convert a number in a string column from one base to another.
    *
    * apache/spark
    */
  def conv[T](
    column: AbstractTypedColumn[T, String],
    fromBase: Int,
    toBase: Int
  ): column.ThisType[T, String] = {
    val converted = sparkFunctions.conv(column.untyped, fromBase, toBase)
    column.typed(converted)
  }

  /** Non-Aggregate function: Converts an angle measured in radians to an approximately equivalent
    * angle measured in degrees.
    *
    * apache/spark
    */
  def degrees[A, T](column: AbstractTypedColumn[T, A]): column.ThisType[T, Double] = {
    val inDegrees = sparkFunctions.degrees(column.untyped)
    column.typed(inDegrees)
  }

  /** Non-Aggregate function: returns the ceiling of a numeric column
    *
    * apache/spark
    */
  def ceil[A, B, T](column: AbstractTypedColumn[T, A])(
    implicit
    i0: CatalystRound[A, B],
    i1: TypedEncoder[B]
  ): column.ThisType[T, B] = {
    val ceiling = sparkFunctions.ceil(column.untyped)
    column.typed(ceiling)(i1)
  }

  /** Non-Aggregate function: returns the floor of a numeric column
    *
    * apache/spark
    */
  def floor[A, B, T](column: AbstractTypedColumn[T, A])(
    implicit
    i0: CatalystRound[A, B],
    i1: TypedEncoder[B]
  ): column.ThisType[T, B] = {
    val floored = sparkFunctions.floor(column.untyped)
    column.typed(floored)(i1)
  }

  /** Non-Aggregate function: unsigned shift the given value numBits right. If given long, will return long else it will return an integer.
* * apache/spark */
  @nowarn // suppress the warning on the sparkFunctions.shiftRightUnsigned call, which is used to maintain Spark 3.1.x backwards compat
  def shiftRightUnsigned[A, B, T](column: AbstractTypedColumn[T, A], numBits: Int)(
    implicit
    i0: CatalystBitShift[A, B],
    i1: TypedEncoder[B]
  ): column.ThisType[T, B] = {
    val shifted = sparkFunctions.shiftRightUnsigned(column.untyped, numBits)
    column.typed(shifted)
  }

  /** Non-Aggregate function: shift the given value numBits right. If given long, will return long
    * else it will return an integer.
    *
    * apache/spark
    */
  @nowarn // suppress the warning on the sparkFunctions.shiftRight call, which is used to maintain Spark 3.1.x backwards compat
  def shiftRight[A, B, T](column: AbstractTypedColumn[T, A], numBits: Int)(
    implicit
    i0: CatalystBitShift[A, B],
    i1: TypedEncoder[B]
  ): column.ThisType[T, B] = {
    val shifted = sparkFunctions.shiftRight(column.untyped, numBits)
    column.typed(shifted)
  }

  /** Non-Aggregate function: shift the given value numBits left. If given long, will return long
    * else it will return an integer.
    *
    * apache/spark
    */
  @nowarn // suppress the warning on the sparkFunctions.shiftLeft call, which is used to maintain Spark 3.1.x backwards compat
  def shiftLeft[A, B, T](column: AbstractTypedColumn[T, A], numBits: Int)(
    implicit
    i0: CatalystBitShift[A, B],
    i1: TypedEncoder[B]
  ): column.ThisType[T, B] = {
    val shifted = sparkFunctions.shiftLeft(column.untyped, numBits)
    column.typed(shifted)
  }

  /** Non-Aggregate function: returns the absolute value of a numeric column
    *
    * apache/spark
    */
  def abs[A, B, T](column: AbstractTypedColumn[T, A])(
    implicit
    i0: CatalystNumericWithJavaBigDecimal[A, B],
    i1: TypedEncoder[B]
  ): column.ThisType[T, B] = {
    val absolute = sparkFunctions.abs(column.untyped)
    column.typed(absolute)(i1)
  }

  /** Non-Aggregate function: Computes the cosine of the given value.
    *
    * Spark will expect a Double value for this expression.
See:
    * [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def cos[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] = {
    val asDouble = column.cast[Double].untyped
    column.typed(sparkFunctions.cos(asDouble))
  }

  /** Non-Aggregate function: Computes the hyperbolic cosine of the given value.
    *
    * Spark will expect a Double value for this expression. See:
    * [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def cosh[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] = {
    val asDouble = column.cast[Double].untyped
    column.typed(sparkFunctions.cosh(asDouble))
  }

  /** Non-Aggregate function: Computes the signum of the given value.
    *
    * Spark will expect a Double value for this expression. See:
    * [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def signum[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] = {
    val asDouble = column.cast[Double].untyped
    column.typed(sparkFunctions.signum(asDouble))
  }

  /** Non-Aggregate function: Computes the sine of the given value.
    *
    * Spark will expect a Double value for this expression.
See:
    * [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def sin[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] = {
    val asDouble = column.cast[Double].untyped
    column.typed(sparkFunctions.sin(asDouble))
  }

  /** Non-Aggregate function: Computes the hyperbolic sine of the given value.
    *
    * Spark will expect a Double value for this expression. See:
    * [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def sinh[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] = {
    val asDouble = column.cast[Double].untyped
    column.typed(sparkFunctions.sinh(asDouble))
  }

  /** Non-Aggregate function: Computes the tangent of the given column.
    *
    * Spark will expect a Double value for this expression. See:
    * [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def tan[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] = {
    val asDouble = column.cast[Double].untyped
    column.typed(sparkFunctions.tan(asDouble))
  }

  /** Non-Aggregate function: Computes the hyperbolic tangent of the given value.
    *
    * Spark will expect a Double value for this expression.
See:
    * [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def tanh[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] = {
    val asDouble = column.cast[Double].untyped
    column.typed(sparkFunctions.tanh(asDouble))
  }

  /** Non-Aggregate function: returns the acos of a numeric column
    *
    * Spark will expect a Double value for this expression. See:
    * [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def acos[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] = {
    val asDouble = column.cast[Double].untyped
    column.typed(sparkFunctions.acos(asDouble))
  }

  /** Non-Aggregate function: returns true if value is contained within the array in the specified column
    *
    * apache/spark
    */
  def arrayContains[C[_]: CatalystCollection, A, T](
    column: AbstractTypedColumn[T, C[A]],
    value: A
  ): column.ThisType[T, Boolean] = {
    val contained = sparkFunctions.array_contains(column.untyped, value)
    column.typed(contained)
  }

  /** Non-Aggregate function: returns the atan of a numeric column
    *
    * Spark will expect a Double value for this expression. See:
    * [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def atan[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] = {
    val asDouble = column.cast[Double].untyped
    column.typed(sparkFunctions.atan(asDouble))
  }

  /** Non-Aggregate function: returns the asin of a numeric column
    *
    * Spark will expect a Double value for this expression.
See:
    * [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def asin[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] = {
    val asDouble = column.cast[Double].untyped
    column.typed(sparkFunctions.asin(asDouble))
  }

  /** Non-Aggregate function: returns the angle theta from the conversion of rectangular coordinates (x, y) to
    * polar coordinates (r, theta).
    *
    * Spark will expect a Double value for this expression. See:
    * [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def atan2[A, B, T](l: TypedColumn[T, A], r: TypedColumn[T, B])(
    implicit
    i0: CatalystCast[A, Double],
    i1: CatalystCast[B, Double]
  ): TypedColumn[T, Double] = {
    val left = l.cast[Double].untyped
    val right = r.cast[Double].untyped
    r.typed(sparkFunctions.atan2(left, right))
  }

  /** Non-Aggregate function: returns the angle theta from the conversion of rectangular coordinates (x, y) to
    * polar coordinates (r, theta).
    *
    * Spark will expect a Double value for this expression.
See:
    * [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def atan2[A, B, T](l: TypedAggregate[T, A], r: TypedAggregate[T, B])(
    implicit
    i0: CatalystCast[A, Double],
    i1: CatalystCast[B, Double]
  ): TypedAggregate[T, Double] = {
    val left = l.cast[Double].untyped
    val right = r.cast[Double].untyped
    r.typed(sparkFunctions.atan2(left, right))
  }

  /** `atan2` with a constant first argument, lifted to a literal column through `r`. */
  def atan2[B, T](l: Double, r: TypedColumn[T, B])(
    implicit i0: CatalystCast[B, Double]
  ): TypedColumn[T, Double] =
    atan2(r.lit(l), r)

  /** `atan2` with a constant second argument, lifted to a literal column through `l`. */
  def atan2[A, T](l: TypedColumn[T, A], r: Double)(
    implicit i0: CatalystCast[A, Double]
  ): TypedColumn[T, Double] =
    atan2(l, l.lit(r))

  /** `atan2` on aggregates with a constant first argument, lifted to a literal through `r`. */
  def atan2[B, T](l: Double, r: TypedAggregate[T, B])(
    implicit i0: CatalystCast[B, Double]
  ): TypedAggregate[T, Double] =
    atan2(r.lit(l), r)

  /** `atan2` on aggregates with a constant second argument, lifted to a literal through `l`. */
  def atan2[A, T](l: TypedAggregate[T, A], r: Double)(
    implicit i0: CatalystCast[A, Double]
  ): TypedAggregate[T, Double] =
    atan2(l, l.lit(r))

  /** Non-Aggregate function: returns the square root value of a numeric column.
    *
    * apache/spark
    */
  def sqrt[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] = {
    val asDouble = column.cast[Double].untyped
    column.typed(sparkFunctions.sqrt(asDouble))
  }

  /** Non-Aggregate function: returns the cubic root value of a numeric column.
    *
    * apache/spark
    */
  def cbrt[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] = {
    val asDouble = column.cast[Double].untyped
    column.typed(sparkFunctions.cbrt(asDouble))
  }

  /** Non-Aggregate function: returns the exponential value of a numeric column.
    *
    * apache/spark
    */
  def exp[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] = {
    val asDouble = column.cast[Double].untyped
    column.typed(sparkFunctions.exp(asDouble))
  }

  /** Non-Aggregate function: Returns the value of the column `e` rounded to 0 decimal places with HALF_UP round mode.
* * apache/spark */
  def round[A, B, T](column: AbstractTypedColumn[T, A])(
    implicit
    i0: CatalystNumericWithJavaBigDecimal[A, B],
    i1: TypedEncoder[B]
  ): column.ThisType[T, B] =
    column.typed(sparkFunctions.round(column.untyped))(i1)

  /** Non-Aggregate function: Round the value of `e` to `scale` decimal places with HALF_UP round mode
    * if `scale` is greater than or equal to 0 or at integral part when `scale` is less than 0.
    *
    * apache/spark
    */
  def round[A, B, T](column: AbstractTypedColumn[T, A], scale: Int)(
    implicit
    i0: CatalystNumericWithJavaBigDecimal[A, B],
    i1: TypedEncoder[B]
  ): column.ThisType[T, B] =
    column.typed(sparkFunctions.round(column.untyped, scale))(i1)

  /** Non-Aggregate function: Bankers Rounding - returns the rounded to 0 decimal places value with HALF_EVEN round mode
    * of a numeric column.
    *
    * apache/spark
    */
  def bround[A, B, T](column: AbstractTypedColumn[T, A])(
    implicit
    i0: CatalystNumericWithJavaBigDecimal[A, B],
    i1: TypedEncoder[B]
  ): column.ThisType[T, B] =
    column.typed(sparkFunctions.bround(column.untyped))(i1)

  /** Non-Aggregate function: Bankers Rounding - returns the rounded to `scale` decimal places value with HALF_EVEN round mode
    * of a numeric column. If `scale` is greater than or equal to 0 or at integral part when `scale` is less than 0.
    *
    * apache/spark
    */
  def bround[A, B, T](column: AbstractTypedColumn[T, A], scale: Int)(
    implicit
    i0: CatalystNumericWithJavaBigDecimal[A, B],
    i1: TypedEncoder[B]
  ): column.ThisType[T, B] =
    column.typed(sparkFunctions.bround(column.untyped, scale))(i1)

  /**
    * Computes the natural logarithm of the given value.
    *
    * The column is explicitly cast to Double (using the `CatalystCast[A, Double]` evidence) before it is
    * handed to Spark, in line with the other Double-valued math functions of this trait (`sqrt`, `exp`, `cos`, ...).
    *
    * apache/spark
    */
  def log[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.log(column.cast[Double].untyped))

  /**
    * Returns the first argument-base logarithm of the second argument.
* * apache/spark */
  def log[A, T](base: Double, column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    // Cast explicitly so the CatalystCast evidence is what turns the input into a Double.
    column.typed(sparkFunctions.log(base, column.cast[Double].untyped))

  /**
    * Computes the logarithm of the given column in base 2.
    *
    * The column is cast to Double before being handed to Spark.
    *
    * apache/spark
    */
  def log2[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.log2(column.cast[Double].untyped))

  /**
    * Computes the natural logarithm of the given value plus one.
    *
    * The column is cast to Double before being handed to Spark.
    *
    * apache/spark
    */
  def log1p[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.log1p(column.cast[Double].untyped))

  /**
    * Computes the logarithm of the given column in base 10.
    *
    * The column is cast to Double before being handed to Spark.
    *
    * apache/spark
    */
  def log10[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.log10(column.cast[Double].untyped))

  /**
    * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.
    *
    * Both columns are cast to Double before being handed to Spark.
    *
    * apache/spark
    */
  def hypot[A, T](column: AbstractTypedColumn[T, A], column2: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.hypot(column.cast[Double].untyped, column2.cast[Double].untyped))

  /**
    * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.
    *
    * The column is cast to Double before being handed to Spark; `l` is used as the second operand.
    *
    * apache/spark
    */
  def hypot[A, T](column: AbstractTypedColumn[T, A], l: Double)(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.hypot(column.cast[Double].untyped, l))

  /**
    * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.
    *
    * The column is cast to Double before being handed to Spark; `l` is used as the first operand.
    *
    * apache/spark
    */
  def hypot[A, T](l: Double, column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.hypot(l, column.cast[Double].untyped))

  /**
    * Returns the value of the first argument raised to the power of the second argument.
* * apache/spark */
  def pow[A, T](column: AbstractTypedColumn[T, A], column2: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    // Cast explicitly so the CatalystCast evidence is what turns both inputs into Doubles.
    column.typed(sparkFunctions.pow(column.cast[Double].untyped, column2.cast[Double].untyped))

  /**
    * Returns the value of the first argument raised to the power of the second argument.
    *
    * The column is cast to Double before being handed to Spark; `l` is the exponent.
    *
    * apache/spark
    */
  def pow[A, T](column: AbstractTypedColumn[T, A], l: Double)(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.pow(column.cast[Double].untyped, l))

  /**
    * Returns the value of the first argument raised to the power of the second argument.
    *
    * The column is cast to Double before being handed to Spark; `l` is the base.
    *
    * apache/spark
    */
  def pow[A, T](l: Double, column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.pow(l, column.cast[Double].untyped))

  /**
    * Returns the positive value of dividend mod divisor.
    *
    * apache/spark
    */
  def pmod[A, T](column: AbstractTypedColumn[T, A], column2: AbstractTypedColumn[T, A])(
    implicit i0: TypedEncoder[A]
  ): column.ThisType[T, A] =
    column.typed(sparkFunctions.pmod(column.untyped, column2.untyped))

  /** Non-Aggregate function: Returns the string representation of the binary value of the given long
    * column. For example, bin("12") returns "1100".
    *
    * apache/spark
    */
  def bin[T](column: AbstractTypedColumn[T, Long]): column.ThisType[T, String] =
    column.typed(sparkFunctions.bin(column.untyped))

  /**
    * Calculates the MD5 digest of a binary column and returns the value
    * as a 32 character hex string.
    *
    * apache/spark
    */
  def md5[T, A](column: AbstractTypedColumn[T, A])(
    implicit i0: TypedEncoder[A]
  ): column.ThisType[T, String] =
    column.typed(sparkFunctions.md5(column.untyped))

  /**
    * Computes the factorial of the given value.
    *
    * apache/spark
    */
  def factorial[T](column: AbstractTypedColumn[T, Long])(
    implicit i0: TypedEncoder[Long]
  ): column.ThisType[T, Long] =
    column.typed(sparkFunctions.factorial(column.untyped))

  /** Non-Aggregate function: Computes bitwise NOT.
* * apache/spark */
  @nowarn // suppress the warning on the sparkFunctions.bitwiseNOT call, which is used to maintain Spark 3.1.x backwards compat
  def bitwiseNOT[A: CatalystBitwise, T](column: AbstractTypedColumn[T, A]): column.ThisType[T, A] = {
    val flipped = sparkFunctions.bitwiseNOT(column.untyped)
    column.typed(flipped)(column.uencoder)
  }

  /** Non-Aggregate function: file name of the current Spark task. Empty string if row did not originate from
    * a file
    *
    * apache/spark
    */
  def inputFileName[T](): TypedColumn[T, String] = {
    val fileName: Column = sparkFunctions.input_file_name()
    new TypedColumn[T, String](fileName)
  }

  /** Non-Aggregate function: generates monotonically increasing id
    *
    * apache/spark
    */
  def monotonicallyIncreasingId[T](): TypedColumn[T, Long] = {
    val id: Column = sparkFunctions.monotonically_increasing_id()
    new TypedColumn[T, Long](id)
  }

  /** Non-Aggregate function: Evaluates a list of conditions and returns one of multiple
    * possible result expressions. If none match, otherwise is returned
    * {{{
    *   when(ds('boolField), ds('a))
    *     .when(ds('otherBoolField), lit(123))
    *     .otherwise(ds('b))
    * }}}
    * apache/spark
    */
  def when[T, A](
    condition: AbstractTypedColumn[T, Boolean],
    value: AbstractTypedColumn[T, A]
  ): When[T, A] =
    new When[T, A](condition, value)

  /** Builder for a chain of `when` branches; wraps the untyped Spark column accumulated so far. */
  class When[T, A] private (untypedC: Column) {

    /** Starts a chain from its first condition/value pair. */
    private[functions] def this(
      condition: AbstractTypedColumn[T, Boolean],
      value: AbstractTypedColumn[T, A]
    ) =
      this(sparkFunctions.when(condition.untyped, value.untyped))

    /** Appends another condition/value branch to the chain. */
    def when(
      condition: AbstractTypedColumn[T, Boolean],
      value: AbstractTypedColumn[T, A]
    ): When[T, A] = {
      val extended = untypedC.when(condition.untyped, value.untyped)
      new When[T, A](extended)
    }

    /** Closes the chain with the fallback value; the result is typed using `value`'s encoder. */
    def otherwise(value: AbstractTypedColumn[T, A]): value.ThisType[T, A] = {
      val closed = untypedC.otherwise(value.untyped)
      value.typed(closed)(value.uencoder)
    }
  }

  //////////////////////////////////////////////////////////////////////////////////////////////
  // String functions
  //////////////////////////////////////////////////////////////////////////////////////////////

  /** Non-Aggregate function: takes the first letter of a string column and returns the ascii
int value in a new column
    *
    * apache/spark
    */
  def ascii[T](column: AbstractTypedColumn[T, String]): column.ThisType[T, Int] = {
    val code = sparkFunctions.ascii(column.untyped)
    column.typed(code)
  }

  /** Non-Aggregate function: Computes the BASE64 encoding of a binary column and returns it as a string column.
    * This is the reverse of unbase64.
    *
    * apache/spark
    */
  def base64[T](column: AbstractTypedColumn[T, Array[Byte]]): column.ThisType[T, String] = {
    val encoded = sparkFunctions.base64(column.untyped)
    column.typed(encoded)
  }

  /** Non-Aggregate function: Decodes a BASE64 encoded string column and returns it as a binary column.
    * This is the reverse of base64.
    *
    * apache/spark
    */
  def unbase64[T](column: AbstractTypedColumn[T, String]): column.ThisType[T, Array[Byte]] = {
    val decoded = sparkFunctions.unbase64(column.untyped)
    column.typed(decoded)
  }

  /** Non-Aggregate function: Concatenates multiple input string columns together into a single string column.
    * @note varargs make it harder to generalize so we overload the method for [[TypedColumn]] and [[TypedAggregate]]
    *
    * apache/spark
    */
  def concat[T](columns: TypedColumn[T, String]*): TypedColumn[T, String] = {
    val parts = columns.map(_.untyped)
    new TypedColumn(sparkFunctions.concat(parts: _*))
  }

  /** Non-Aggregate function: Concatenates multiple input string columns together into a single string column.
    * @note varargs make it harder to generalize so we overload the method for [[TypedColumn]] and [[TypedAggregate]]
    *
    * apache/spark
    */
  def concat[T](columns: TypedAggregate[T, String]*): TypedAggregate[T, String] = {
    val parts = columns.map(_.untyped)
    new TypedAggregate(sparkFunctions.concat(parts: _*))
  }

  /** Non-Aggregate function: Concatenates multiple input string columns together into a single string column,
    * using the given separator.
* @note varargs make it harder to generalize so we overload the method for [[TypedColumn]] and [[TypedAggregate]]
    *
    * apache/spark
    */
  def concatWs[T](sep: String, columns: TypedAggregate[T, String]*): TypedAggregate[T, String] = {
    val parts = columns.map(_.untyped)
    new TypedAggregate(sparkFunctions.concat_ws(sep, parts: _*))
  }

  /** Non-Aggregate function: Concatenates multiple input string columns together into a single string column,
    * using the given separator.
    * @note varargs make it harder to generalize so we overload the method for [[TypedColumn]] and [[TypedAggregate]]
    *
    * apache/spark
    */
  def concatWs[T](sep: String, columns: TypedColumn[T, String]*): TypedColumn[T, String] = {
    val parts = columns.map(_.untyped)
    new TypedColumn(sparkFunctions.concat_ws(sep, parts: _*))
  }

  /** Non-Aggregate function: Locates the position of the first occurrence of substring column
    * in given string
    *
    * @note The position is not zero based, but 1 based index. Returns 0 if substr
    * could not be found in str.
    *
    * apache/spark
    */
  def instr[T](str: AbstractTypedColumn[T, String], substring: String): str.ThisType[T, Int] = {
    val position = sparkFunctions.instr(str.untyped, substring)
    str.typed(position)
  }

  /** Non-Aggregate function: Computes the length of a given string.
    *
    * apache/spark
    */
  //TODO: Also for binary
  def length[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Int] = {
    val size = sparkFunctions.length(str.untyped)
    str.typed(size)
  }

  /** Non-Aggregate function: Computes the Levenshtein distance of the two given string columns.
    *
    * apache/spark
    */
  def levenshtein[T](l: TypedColumn[T, String], r: TypedColumn[T, String]): TypedColumn[T, Int] = {
    val distance = sparkFunctions.levenshtein(l.untyped, r.untyped)
    l.typed(distance)
  }

  /** Non-Aggregate function: Computes the Levenshtein distance of the two given string columns.
    *
    * apache/spark
    */
  def levenshtein[T](l: TypedAggregate[T, String], r: TypedAggregate[T, String]): TypedAggregate[T, Int] = {
    val distance = sparkFunctions.levenshtein(l.untyped, r.untyped)
    l.typed(distance)
  }

  /** Non-Aggregate function: Converts a string column to lower case.
* * apache/spark */
  def lower[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, String] = {
    val lowered = sparkFunctions.lower(str.untyped)
    str.typed(lowered)
  }

  /** Non-Aggregate function: Left-pad the string column with pad to a length of len. If the string column is longer
    * than len, the return value is shortened to len characters.
    *
    * apache/spark
    */
  def lpad[T](str: AbstractTypedColumn[T, String], len: Int, pad: String): str.ThisType[T, String] = {
    val padded = sparkFunctions.lpad(str.untyped, len, pad)
    str.typed(padded)
  }

  /** Non-Aggregate function: Trim the spaces from left end for the specified string value.
    *
    * apache/spark
    */
  def ltrim[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, String] = {
    val trimmed = sparkFunctions.ltrim(str.untyped)
    str.typed(trimmed)
  }

  /** Non-Aggregate function: Replace all substrings of the specified string value that match regexp with rep.
    *
    * apache/spark
    */
  def regexpReplace[T](
    str: AbstractTypedColumn[T, String],
    pattern: Regex,
    replacement: String
  ): str.ThisType[T, String] = {
    val replaced = sparkFunctions.regexp_replace(str.untyped, pattern.regex, replacement)
    str.typed(replaced)
  }

  /** Non-Aggregate function: Reverses the string column and returns it as a new string column.
    *
    * apache/spark
    */
  def reverse[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, String] = {
    val reversed = sparkFunctions.reverse(str.untyped)
    str.typed(reversed)
  }

  /** Non-Aggregate function: Right-pad the string column with pad to a length of len.
    * If the string column is longer than len, the return value is shortened to len characters.
    *
    * apache/spark
    */
  def rpad[T](str: AbstractTypedColumn[T, String], len: Int, pad: String): str.ThisType[T, String] = {
    val padded = sparkFunctions.rpad(str.untyped, len, pad)
    str.typed(padded)
  }

  /** Non-Aggregate function: Trim the spaces from right end for the specified string value.
* * apache/spark */
  def rtrim[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, String] = {
    val trimmed = sparkFunctions.rtrim(str.untyped)
    str.typed(trimmed)
  }

  /** Non-Aggregate function: Substring starts at `pos` and is of length `len`
    *
    * apache/spark
    */
  //TODO: Also for byte array
  def substring[T](str: AbstractTypedColumn[T, String], pos: Int, len: Int): str.ThisType[T, String] = {
    val slice = sparkFunctions.substring(str.untyped, pos, len)
    str.typed(slice)
  }

  /** Non-Aggregate function: Trim the spaces from both ends for the specified string column.
    *
    * apache/spark
    */
  def trim[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, String] = {
    val trimmed = sparkFunctions.trim(str.untyped)
    str.typed(trimmed)
  }

  /** Non-Aggregate function: Converts a string column to upper case.
    *
    * apache/spark
    */
  def upper[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, String] = {
    val uppered = sparkFunctions.upper(str.untyped)
    str.typed(uppered)
  }

  //////////////////////////////////////////////////////////////////////////////////////////////
  // DateTime functions
  //////////////////////////////////////////////////////////////////////////////////////////////

  /** Non-Aggregate function: Extracts the year as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#year` by wrapping its result into an `Option`.
    *
    * apache/spark
    */
  def year[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] = {
    val extracted = sparkFunctions.year(str.untyped)
    str.typed(extracted)
  }

  /** Non-Aggregate function: Extracts the quarter as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#quarter` by wrapping its result into an `Option`.
    *
    * apache/spark
    */
  def quarter[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] = {
    val extracted = sparkFunctions.quarter(str.untyped)
    str.typed(extracted)
  }

  /** Non-Aggregate function: Extracts the month as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#month` by wrapping its result into an `Option`.
* * apache/spark */
  def month[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] = {
    val extracted = sparkFunctions.month(str.untyped)
    str.typed(extracted)
  }

  /** Non-Aggregate function: Extracts the day of the week as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#dayofweek` by wrapping its result into an `Option`.
    *
    * apache/spark
    */
  def dayofweek[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] = {
    val extracted = sparkFunctions.dayofweek(str.untyped)
    str.typed(extracted)
  }

  /** Non-Aggregate function: Extracts the day of the month as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#dayofmonth` by wrapping its result into an `Option`.
    *
    * apache/spark
    */
  def dayofmonth[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] = {
    val extracted = sparkFunctions.dayofmonth(str.untyped)
    str.typed(extracted)
  }

  /** Non-Aggregate function: Extracts the day of the year as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#dayofyear` by wrapping its result into an `Option`.
    *
    * apache/spark
    */
  def dayofyear[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] = {
    val extracted = sparkFunctions.dayofyear(str.untyped)
    str.typed(extracted)
  }

  /** Non-Aggregate function: Extracts the hours as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#hour` by wrapping its result into an `Option`.
    *
    * apache/spark
    */
  def hour[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] = {
    val extracted = sparkFunctions.hour(str.untyped)
    str.typed(extracted)
  }

  /** Non-Aggregate function: Extracts the minutes as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#minute` by wrapping its result into an `Option`.
    *
    * apache/spark
    */
  def minute[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] = {
    val extracted = sparkFunctions.minute(str.untyped)
    str.typed(extracted)
  }

  /** Non-Aggregate function: Extracts the seconds as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#second` by wrapping its result into an `Option`.
* * apache/spark */ def second[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] = str.typed(sparkFunctions.second(str.untyped)) /** Non-Aggregate function: Extracts the week number as an integer from a given date/timestamp/string. * * Differs from `Column#weekofyear` by wrapping its result into an `Option`. * * apache/spark */ def weekofyear[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] = str.typed(sparkFunctions.weekofyear(str.untyped)) } ================================================ FILE: dataset/src/main/scala/frameless/functions/Udf.scala ================================================ package frameless package functions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, NonSQLExpression} import org.apache.spark.sql.catalyst.expressions.codegen._ import Block._ import org.apache.spark.sql.types.DataType import shapeless.syntax.std.tuple._ /** Documentation marked "apache/spark" is thanks to apache/spark Contributors * at https://github.com/apache/spark, licensed under Apache v2.0 available at * http://www.apache.org/licenses/LICENSE-2.0 */ trait Udf { /** Defines a user-defined function of 1 argument as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. * * The returned function wraps its argument column in a `FramelessUdf` expression that uses the `TypedEncoder` of `R`. * * apache/spark */ def udf[T, A, R: TypedEncoder](f: A => R): TypedColumn[T, A] => TypedColumn[T, R] = { u => val scalaUdf = FramelessUdf(f, List(u), TypedEncoder[R]) new TypedColumn[T, R](scalaUdf) } /** Defines a user-defined function of 2 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature.
* * The argument columns are gathered into a list (`us.toList`, via the shapeless tuple syntax imported by this file) * and handed to `FramelessUdf` together with the `TypedEncoder` of `R`. * * apache/spark */ def udf[T, A1, A2, R: TypedEncoder](f: (A1,A2) => R): (TypedColumn[T, A1], TypedColumn[T, A2]) => TypedColumn[T, R] = { case us => val scalaUdf = FramelessUdf(f, us.toList[UntypedExpression[T]], TypedEncoder[R]) new TypedColumn[T, R](scalaUdf) } /** Defines a user-defined function of 3 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. * * Built the same way as the 2-argument variant: the argument columns are listed with `us.toList`. * * apache/spark */ def udf[T, A1, A2, A3, R: TypedEncoder](f: (A1,A2,A3) => R): (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3]) => TypedColumn[T, R] = { case us => val scalaUdf = FramelessUdf(f, us.toList[UntypedExpression[T]], TypedEncoder[R]) new TypedColumn[T, R](scalaUdf) } /** Defines a user-defined function of 4 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. * * Built the same way as the 2-argument variant: the argument columns are listed with `us.toList`. * * apache/spark */ def udf[T, A1, A2, A3, A4, R: TypedEncoder](f: (A1,A2,A3,A4) => R): (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4]) => TypedColumn[T, R] = { case us => val scalaUdf = FramelessUdf(f, us.toList[UntypedExpression[T]], TypedEncoder[R]) new TypedColumn[T, R](scalaUdf) } /** Defines a user-defined function of 5 arguments as user-defined function (UDF). * The data types are automatically inferred based on the function's signature. * * Built the same way as the 2-argument variant: the argument columns are listed with `us.toList`. * * apache/spark */ def udf[T, A1, A2, A3, A4, A5, R: TypedEncoder](f: (A1,A2,A3,A4,A5) => R): (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4], TypedColumn[T, A5]) => TypedColumn[T, R] = { case us => val scalaUdf = FramelessUdf(f, us.toList[UntypedExpression[T]], TypedEncoder[R]) new TypedColumn[T, R](scalaUdf) } } /** * NB: Implementation detail, isn't intended to be directly used. * * Our own implementation of `ScalaUDF` from Catalyst compatible with [[TypedEncoder]].
*/
case class FramelessUdf[T, R](
  function: AnyRef,
  encoders: Seq[TypedEncoder[_]],
  children: Seq[Expression],
  rencoder: TypedEncoder[R]
) extends Expression with NonSQLExpression {

  // Nullability and Catalyst data type of the result both come from the result encoder.
  override def nullable: Boolean = rencoder.nullable
  override def toString: String = s"FramelessUdf(${children.mkString(", ")})"

  // Evaluator used by `eval`: generates Java source for this expression via
  // `genCode`, compiles it with `CodeGenerator.compile`, instantiates it with the
  // references collected in `ctx`, and exposes it as `InternalRow => AnyRef`.
  // Being a `lazy val`, the compilation happens on first use only.
  lazy val evalCode = {
    val ctx = new CodegenContext()
    val eval = genCode(ctx)

    // Java class template: `generate` is the entry point invoked below; `apply`
    // casts its argument to `InternalRow`, runs the generated code and returns
    // either null or the boxed result value.
    val codeBody = s""" public scala.Function1 generate(Object[] references) { return new FramelessUdfEvalImpl(references); } class FramelessUdfEvalImpl extends scala.runtime.AbstractFunction1 { private final Object[] references; ${ctx.declareMutableStates()} ${ctx.declareAddedFunctions()} public FramelessUdfEvalImpl(Object[] references) { this.references = references; ${ctx.initMutableStates()} } public java.lang.Object apply(java.lang.Object z) { InternalRow ${ctx.INPUT_ROW} = (InternalRow) z; ${eval.code} return ${eval.isNull} ? ((Object)null) : ((Object)${eval.value}); } } """

    val code = CodeFormatter.stripOverlappingComments(
      new CodeAndComment(codeBody, ctx.getPlaceHolderToComments()))

    val (clazz, _) = CodeGenerator.compile(code)
    val codegen = clazz.generate(ctx.references.toArray).asInstanceOf[InternalRow => AnyRef]

    codegen
  }

  // Interpreted evaluation simply delegates to the compiled evaluator above.
  def eval(input: InternalRow): Any = {
    evalCode(input)
  }

  def dataType: DataType = rencoder.catalystRepr

  // Generates the Java code that (1) evaluates every child, (2) calls `function`
  // with the boxed argument values and (3) converts the returned JVM value to its
  // Catalyst representation through `rencoder.toCatalyst`.
  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    ctx.references += this

    // save reference to `function` field from `FramelessUdf` to call it later
    val framelessUdfClassName = classOf[FramelessUdf[_, _]].getName
    val funcClassName = s"scala.Function${children.size}"
    // `this` was appended just above, so it sits at the last index of `references`.
    val funcExpressionIdx = ctx.references.size - 1
    // Mutable state holding the user function; its initializer casts
    // `references[idx]` back to `FramelessUdf` and reads its `function()` accessor.
    val funcTerm = ctx.addMutableState(funcClassName, ctx.freshName("udf"),
      v => s"$v = ($funcClassName)((($framelessUdfClassName)references" +
        s"[$funcExpressionIdx]).function());")

    // For every (encoder, child) pair: emit the child's code followed by a local
    // variable of the encoder's boxed JVM type that is null when the child is null.
    val (argsCode, funcArguments) = encoders.zip(children).map {
      case (encoder, child) =>
        val eval = child.genCode(ctx)
        val codeTpe = CodeGenerator.boxedType(encoder.jvmRepr)
        val argTerm = ctx.freshName("arg")
        val convert = s"${eval.code}\n$codeTpe $argTerm = ${eval.isNull} ? (($codeTpe)null) : (($codeTpe)(${eval.value}));"

        (convert, argTerm)
    }.unzip

    // Mutable states receiving the raw function result and its null flag.
    val internalTpe = CodeGenerator.boxedType(rencoder.jvmRepr)
    val internalTerm = ctx.addMutableState(internalTpe, ctx.freshName("internal"))
    val internalNullTerm = ctx.addMutableState("boolean", ctx.freshName("internalNull"))
    // CTw - can't inject the term, may have to duplicate old code for parity
    // The two states are exposed to `rencoder.toCatalyst` as an expression that
    // refers to them by name.
    val internalExpr = Spark2_4_LambdaVariable(internalTerm, internalNullTerm, rencoder.jvmRepr, true)

    val resultEval = rencoder.toCatalyst(internalExpr).genCode(ctx)

    // Statement order inside the template matters: arguments first, then the call,
    // then the null flag, then the conversion code that reads both states.
    ev.copy(code = code""" ${argsCode.mkString("\n")} $internalTerm = ($internalTpe)$funcTerm.apply(${funcArguments.mkString(", ")}); $internalNullTerm = $internalTerm == null; ${resultEval.code} """,
      value = resultEval.value,
      isNull = resultEval.isNull
    )
  }

  // NOTE(review): presumably Spark's tree-node hook for replacing children;
  // confirm against the Spark version this module is built for.
  protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = copy(children = newChildren)
}

// Leaf expression that stands for already-declared Java variables in generated
// code: `value` names the variable holding the value, `isNull` the variable
// holding its null flag.
// NOTE(review): judging by the name this mirrors Spark 2.4's `LambdaVariable`;
// not verifiable from this file.
case class Spark2_4_LambdaVariable(
  value: String,
  isNull: String,
  dataType: DataType,
  nullable: Boolean = true) extends LeafExpression with NonSQLExpression {

  private val accessor: (InternalRow, Int) => Any = InternalRow.getAccessor(dataType)

  // Interpreted execution of `LambdaVariable` always get the 0-index element from input row.
  override def eval(input: InternalRow): Any = {
    assert(input.numFields == 1,
      "The input row of interpreted LambdaVariable should have only 1 field.")
    if (nullable && input.isNullAt(0)) {
      null
    } else {
      accessor(input, 0)
    }
  }

  // Emits no code of its own: the resulting `ExprCode` only points at the named
  // variables; a non-nullable variable reports a constant `false` null flag.
  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val isNullValue = if (nullable) {
      JavaCode.isNullVariable(isNull)
    } else {
      FalseLiteral
    }
    ExprCode(value = JavaCode.variable(value, dataType), isNull = isNullValue)
  }
}

object FramelessUdf {
  // Spark needs case class with `children` field to mutate it
  // Builds the expression from typed columns: each column contributes its encoder,
  // and its expression wrapped in that encoder's `fromCatalyst` as a child.
  def apply[T, R](
    function: AnyRef,
    cols: Seq[UntypedExpression[T]],
    rencoder: TypedEncoder[R]
  ): FramelessUdf[T, R] = FramelessUdf(
    function = function,
    encoders = cols.map(_.uencoder).toList,
    children = cols.map(x => x.uencoder.fromCatalyst(x.expr)).toList,
    rencoder = rencoder
  )
}

================================================
FILE: dataset/src/main/scala/frameless/functions/UnaryFunctions.scala
================================================
package frameless
package functions

import org.apache.spark.sql.{Column, functions => sparkFunctions}

import scala.math.Ordering

trait UnaryFunctions {
  /** Returns length of array
    *
    * The size operation is supplied by the `CatalystSizableCollection[V]` instance.
    *
    * apache/spark
    */
  def size[T, A, V[_] : CatalystSizableCollection](column: TypedColumn[T, V[A]]): TypedColumn[T, Int] =
    new TypedColumn[T, Int](implicitly[CatalystSizableCollection[V]].sizeOp(column.untyped))

  /** Returns length of Map
    *
    * apache/spark
    */
  def size[T, A, B](column: TypedColumn[T, Map[A, B]]): TypedColumn[T, Int] =
    new TypedColumn[T, Int](sparkFunctions.size(column.untyped))

  /** Sorts the input array for the given column in ascending order, according to
    * the natural ordering of the array elements.
* * apache/spark */
  def sortAscending[T, A: Ordering, V[_] : CatalystSortableCollection](column: TypedColumn[T, V[A]]): TypedColumn[T, V[A]] = {
    val sorter = implicitly[CatalystSortableCollection[V]]
    val sorted = sorter.sortOp(column.untyped, sortAscending = true)
    // Sorting does not change the column type, so the input encoder is reused.
    new TypedColumn[T, V[A]](sorted)(column.uencoder)
  }

  /** Sorts the input array for the given column in descending order, according to
    * the natural ordering of the array elements.
    *
    * apache/spark
    */
  def sortDescending[T, A: Ordering, V[_] : CatalystSortableCollection](column: TypedColumn[T, V[A]]): TypedColumn[T, V[A]] = {
    val sorter = implicitly[CatalystSortableCollection[V]]
    val sorted = sorter.sortOp(column.untyped, sortAscending = false)
    new TypedColumn[T, V[A]](sorted)(column.uencoder)
  }

  /** Creates a new row for each element in the given collection. The column types
    * eligible for this operation are constrained by CatalystExplodableCollection.
    *
    * apache/spark
    */
  @deprecated("Use explode() from the TypedDataset instead. This method will result in " +
    "runtime error if applied to two columns in the same select statement.", "0.6.2")
  def explode[T, A: TypedEncoder, V[_] : CatalystExplodableCollection](column: TypedColumn[T, V[A]]): TypedColumn[T, A] = {
    val exploded = sparkFunctions.explode(column.untyped)
    new TypedColumn[T, A](exploded)
  }
}

/** Evidence that the size of a collection `V` can be computed on a column. */
trait CatalystSizableCollection[V[_]] {
  def sizeOp(col: Column): Column
}

object CatalystSizableCollection {
  // Every supported collection is measured with Spark's `size` function.
  private def viaSparkSize[V[_]]: CatalystSizableCollection[V] =
    new CatalystSizableCollection[V] {
      def sizeOp(col: Column): Column = sparkFunctions.size(col)
    }

  implicit def sizableVector: CatalystSizableCollection[Vector] = viaSparkSize[Vector]

  implicit def sizableArray: CatalystSizableCollection[Array] = viaSparkSize[Array]

  implicit def sizableList: CatalystSizableCollection[List] = viaSparkSize[List]
}

/** Marker evidence that a collection `V` may be passed to `explode`. */
trait CatalystExplodableCollection[V[_]]

object CatalystExplodableCollection {
  // The type class has no members, so one empty instance shape serves all collections.
  private def marker[V[_]]: CatalystExplodableCollection[V] =
    new CatalystExplodableCollection[V] {}

  implicit def explodableVector: CatalystExplodableCollection[Vector] = marker[Vector]
  implicit def explodableArray: CatalystExplodableCollection[Array] = marker[Array]
  implicit def explodableList: CatalystExplodableCollection[List] = marker[List]
  implicit def explodableSeq: CatalystExplodableCollection[Seq] = marker[Seq]
}

/** Evidence that a collection `V` can be sorted on a column. */
trait CatalystSortableCollection[V[_]] {
  def sortOp(col: Column, sortAscending: Boolean): Column
}

object CatalystSortableCollection {
  // Every supported collection is sorted with Spark's `sort_array` function.
  private def viaSortArray[V[_]]: CatalystSortableCollection[V] =
    new CatalystSortableCollection[V] {
      def sortOp(col: Column, sortAscending: Boolean): Column = sparkFunctions.sort_array(col, sortAscending)
    }

  implicit def sortableVector: CatalystSortableCollection[Vector] = viaSortArray[Vector]

  implicit def sortableArray: CatalystSortableCollection[Array] = viaSortArray[Array]

  implicit def sortableList: CatalystSortableCollection[List] = viaSortArray[List]
}

================================================
FILE: dataset/src/main/scala/frameless/functions/package.scala
================================================
package frameless

import scala.reflect.ClassTag

import shapeless._
import shapeless.labelled.FieldType
import shapeless.ops.hlist.IsHCons
import shapeless.ops.record.{ Keys, Values }

import org.apache.spark.sql.{ reflection => ScalaReflection }
import org.apache.spark.sql.catalyst.expressions.Literal

package object functions extends Udf with UnaryFunctions {

  object aggregate extends AggregateFunctions
  object nonAggregate extends NonAggregateFunctions

  /**
   * Creates a [[frameless.TypedAggregate]] of literal value. If A is to be encoded using an Injection make
   * sure the injection instance is in scope.
* * apache/spark */
  def litAggr[A, T](
    value: A
  )(implicit
    i0: TypedEncoder[A],
    i1: Refute[IsValueClass[A]]
  ): TypedAggregate[T, A] = {
    // Reuse the column literal and re-wrap its expression as an aggregate.
    val literal = lit(value)
    new TypedAggregate[T, A](literal.expr)
  }

  /**
   * Creates a [[frameless.TypedColumn]] of literal value. If A is to be encoded using an Injection make
   * sure the injection instance is in scope.
   *
   * A plain Catalyst `Literal` is used when the encoder's JVM representation is a
   * native type identical to its Catalyst representation; otherwise the value is
   * wrapped in `Lit` and converted through the encoder's `toCatalyst`.
   *
   * apache/spark
   *
   * @tparam A the literal value type
   * @tparam T the row type
   */
  def lit[A, T](
    value: A
  )(implicit
    encoder: TypedEncoder[A]
  ): TypedColumn[T, A] = {
    val isPlainNative =
      ScalaReflection.isNativeType(encoder.jvmRepr) &&
        encoder.catalystRepr == encoder.jvmRepr

    if (!isPlainNative) {
      val jvmLiteral = new Literal(value, encoder.jvmRepr)

      new TypedColumn[T, A](
        Lit(
          dataType = encoder.catalystRepr,
          nullable = encoder.nullable,
          show = () => value.toString,
          catalystExpr = encoder.toCatalyst(jvmLiteral)
        )
      )
    } else {
      val nativeLiteral = Literal(value, encoder.catalystRepr)
      new TypedColumn[T, A](nativeLiteral)
    }
  }

  /**
   * Creates a [[frameless.TypedColumn]] of literal value
   * for a Value class `A`.
   *
   * The single field of the value class is extracted through its labelled
   * generic representation and encoded with the field's own encoder.
   *
   * @tparam A the value class
   * @tparam T the row type
   */
  def litValue[
    A: IsValueClass,
    T,
    G <: ::[_, HNil],
    H <: ::[_ <: FieldType[_ <: Symbol, _], HNil],
    K <: Symbol,
    V,
    KS <: ::[_ <: Symbol, HNil],
    VS <: HList
  ](value: A
  )(implicit
    i0: LabelledGeneric.Aux[A, G],
    i1: DropUnitValues.Aux[G, H],
    i2: IsHCons.Aux[H, _ <: FieldType[K, V], HNil],
    i3: Keys.Aux[H, KS],
    i4: Values.Aux[H, VS],
    i5: IsHCons.Aux[KS, K, HNil],
    i6: IsHCons.Aux[VS, V, HNil],
    i7: TypedEncoder[V],
    i8: ClassTag[A]
  ): TypedColumn[T, A] = {
    val record: H = i1(i0.to(value))
    val underlying: V = i6.head(i4(record))
    val expr = new Literal(underlying, i7.jvmRepr)

    implicit val enc: TypedEncoder[A] =
      RecordFieldEncoder.valueClass[A, G, H, K, V, KS].encoder

    new TypedColumn[T, A](
      Lit(
        dataType = i7.catalystRepr,
        nullable = i7.nullable,
        show = () => value.toString,
        catalystExpr = i7.toCatalyst(expr)
      )
    )
  }

  /**
   * Creates a [[frameless.TypedColumn]] of literal value
   * for an optional Value class `A`.
   *
   * A missing value becomes a null literal of the field's JVM type; the
   * resulting column is always nullable.
   *
   * @tparam A the value class
   * @tparam T the row type
   */
  def litValue[
    A: IsValueClass,
    T,
    G <: ::[_, HNil],
    H <: ::[_ <: FieldType[_ <: Symbol, _], HNil],
    K <: Symbol,
    V,
    KS <: ::[_ <: Symbol, HNil],
    VS <: HList
  ](value: Option[A]
  )(implicit
    i0: LabelledGeneric.Aux[A, G],
    i1: DropUnitValues.Aux[G, H],
    i2: IsHCons.Aux[H, _ <: FieldType[K, V], HNil],
    i3: Keys.Aux[H, KS],
    i4: Values.Aux[H, VS],
    i5: IsHCons.Aux[KS, K, HNil],
    i6: IsHCons.Aux[VS, V, HNil],
    i7: TypedEncoder[V],
    i8: ClassTag[A]
  ): TypedColumn[T, Option[A]] = {
    val expr = value match {
      case Some(present) =>
        val record: H = i1(i0.to(present))
        val underlying: V = i6.head(i4(record))
        new Literal(underlying, i7.jvmRepr)

      case _ =>
        Literal.create(null, i7.jvmRepr)
    }

    implicit val enc: TypedEncoder[A] =
      RecordFieldEncoder.valueClass[A, G, H, K, V, KS].encoder

    new TypedColumn[T, Option[A]](
      Lit(
        dataType = i7.catalystRepr,
        nullable = true,
        show = () => value.toString,
        catalystExpr = i7.toCatalyst(expr)
      )
    )
  }
}

================================================
FILE: dataset/src/main/scala/frameless/ops/AggregateTypes.scala
================================================
package frameless
package ops

import shapeless._

/** A type class to extract the column types out of an HList of [[frameless.TypedAggregate]].
  *
  * @note This type class is mostly a workaround to issue with slow implicit derivation for Comapped.
* @example
  * {{{
  *   type U = TypedAggregate[T,A] :: TypedAggregate[T,B] :: TypedAggregate[T,C] :: HNil
  *   type Out = A :: B :: C :: HNil
  * }}}
  */
trait AggregateTypes[V, U <: HList] {
  type Out <: HList
}

object AggregateTypes {
  type Aux[V, U <: HList, Out0 <: HList] = AggregateTypes[V, U] {type Out = Out0}

  implicit def deriveHNil[T]: AggregateTypes.Aux[T, HNil, HNil] = new AggregateTypes[T, HNil] { type Out = HNil }

  implicit def deriveCons1[T, H, TT <: HList, V <: HList](
    implicit tail: AggregateTypes.Aux[T, TT, V]
  ): AggregateTypes.Aux[T, TypedAggregate[T, H] :: TT, H :: V] =
    new AggregateTypes[T, TypedAggregate[T, H] :: TT] {type Out = H :: V}
}

================================================
FILE: dataset/src/main/scala/frameless/ops/As.scala
================================================
package frameless
package ops

import shapeless.{::, Generic, HList, Lazy}

/** Evidence for correctness of `TypedDataset[T].as[U]` */
class As[T, U] private (implicit val encoder: TypedEncoder[U])

object As extends LowPriorityAs {

  /** Witness that `A` and `B` are structurally equivalent. Instances can only be
    * created inside the `ops` package.
    */
  final class Equiv[A, B] private[ops] ()

  /** Every type is equivalent to itself.
    *
    * The result type is declared explicitly: an implicit without a declared type
    * is only eligible after its definition point and is rejected by Scala 3.
    */
  implicit def equivIdentity[A]: Equiv[A, A] = new Equiv[A, A]

  /** Derives `As[A, B]` from an encoder for `B` and an equivalence between `A` and `B`. */
  implicit def deriveAs[A, B]
    (implicit
      i0: TypedEncoder[B],
      i1: Equiv[A, B]
    ): As[A, B] = new As[A, B]
}

/** Structural equivalences; defined in a parent trait of `As`, so the identity
  * instance declared in `As` itself is preferred when both apply.
  */
trait LowPriorityAs {

  import As.Equiv

  /** HLists are equivalent when their heads and their tails are equivalent. */
  implicit def equivHList[AH, AT <: HList, BH, BT <: HList]
    (implicit
      i0: Lazy[Equiv[AH, BH]],
      i1: Equiv[AT, BT]
    ): Equiv[AH :: AT, BH :: BT] = new Equiv[AH :: AT, BH :: BT]

  /** Two types are equivalent when their generic representations are equivalent. */
  implicit def equivGeneric[A, B, R, S]
    (implicit
      i0: Generic.Aux[A, R],
      i1: Generic.Aux[B, S],
      i2: Lazy[Equiv[R, S]]
    ): Equiv[A, B] = new Equiv[A, B]

}

================================================
FILE: dataset/src/main/scala/frameless/ops/ColumnTypes.scala
================================================
package frameless
package ops

import shapeless._

/** A type class to extract the column types out of an HList of [[frameless.TypedColumn]].
  *
  * @note This type class is mostly a workaround to issue with slow implicit derivation for Comapped.
* @example
  * {{{
  *   type U = TypedColumn[T,A] :: TypedColumn[T,B] :: TypedColumn[T,C] :: HNil
  *   type Out = A :: B :: C :: HNil
  * }}}
  */
trait ColumnTypes[T, U <: HList] {
  type Out <: HList
}

object ColumnTypes {
  type Aux[T, U <: HList, Out0 <: HList] = ColumnTypes[T, U] {type Out = Out0}

  implicit def deriveHNil[T]: ColumnTypes.Aux[T, HNil, HNil] = new ColumnTypes[T, HNil] { type Out = HNil }

  implicit def deriveCons[T, H, TT <: HList, V <: HList](
    implicit tail: ColumnTypes.Aux[T, TT, V]
  ): ColumnTypes.Aux[T, TypedColumn[T, H] :: TT, H :: V] =
    new ColumnTypes[T, TypedColumn[T, H] :: TT] {type Out = H :: V}
}

================================================
FILE: dataset/src/main/scala/frameless/ops/GroupByOps.scala
================================================
package frameless
package ops

import org.apache.spark.sql.catalyst.analysis.UnresolvedAlias
import org.apache.spark.sql.catalyst.plans.logical.Project
import org.apache.spark.sql.{Column, Dataset, FramelessInternals, RelationalGroupedDataset}
import shapeless._
import shapeless.ops.hlist.{Length, Mapped, Prepend, ToList, ToTraversable, Tupler}

class GroupedByManyOps[T, TK <: HList, K <: HList, KT]
  (self: TypedDataset[T], groupedBy: TK)
  (implicit
    i0: ColumnTypes.Aux[T, TK, K],
    i1: ToTraversable.Aux[TK, List, UntypedExpression[T]],
    i3: Tupler.Aux[K, KT]
  ) extends AggregatingOps[T, TK, K, KT](self, groupedBy, (dataset, cols) => dataset.groupBy(cols: _*)) {
  object agg extends ProductArgs {
    def applyProduct[TC <: HList, C <: HList, Out0 <: HList, Out1]
      (columns: TC)
      (implicit
        i3: AggregateTypes.Aux[T, TC, C],
        i4: Prepend.Aux[K, C, Out0],
        i5: Tupler.Aux[Out0, Out1],
        i6: TypedEncoder[Out1],
        i7: ToTraversable.Aux[TC, List, UntypedExpression[T]]
      ): TypedDataset[Out1] = {
        aggregate[TC, Out1](columns)
      }
  }
}

/** Aggregation operations for a dataset grouped by a single column.
  *
  * Every `agg` overload delegates to [[GroupedByManyOps]] and returns a dataset of
  * tuples made of the grouping key followed by the aggregated values.
  *
  * @param self the dataset being grouped
  * @param g1   the grouping column
  */
class GroupedBy1Ops[K1, V](
  self: TypedDataset[V],
  g1: TypedColumn[V, K1]
) {
  private def underlying = new GroupedByManyOps(self, g1 :: HNil)

  // Encoder of the grouping key, needed to derive the encoder of each result tuple.
  // Implicits are given explicit types: untyped implicits are only eligible
  // after their definition point and are rejected by Scala 3.
  private implicit def eg1: TypedEncoder[K1] = g1.uencoder

  def agg[U1](c1: TypedAggregate[V, U1]): TypedDataset[(K1, U1)] = {
    implicit val e1: TypedEncoder[U1] = c1.uencoder
    underlying.agg(c1)
  }

  def agg[U1, U2](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2]): TypedDataset[(K1, U1, U2)] = {
    implicit val e1: TypedEncoder[U1] = c1.uencoder
    implicit val e2: TypedEncoder[U2] = c2.uencoder
    underlying.agg(c1, c2)
  }

  def agg[U1, U2, U3](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3]): TypedDataset[(K1, U1, U2, U3)] = {
    implicit val e1: TypedEncoder[U1] = c1.uencoder
    implicit val e2: TypedEncoder[U2] = c2.uencoder
    implicit val e3: TypedEncoder[U3] = c3.uencoder
    underlying.agg(c1, c2, c3)
  }

  def agg[U1, U2, U3, U4](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4]): TypedDataset[(K1, U1, U2, U3, U4)] = {
    implicit val e1: TypedEncoder[U1] = c1.uencoder
    implicit val e2: TypedEncoder[U2] = c2.uencoder
    implicit val e3: TypedEncoder[U3] = c3.uencoder
    implicit val e4: TypedEncoder[U4] = c4.uencoder
    underlying.agg(c1, c2, c3, c4)
  }

  def agg[U1, U2, U3, U4, U5](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4], c5: TypedAggregate[V, U5]): TypedDataset[(K1, U1, U2, U3, U4, U5)] = {
    implicit val e1: TypedEncoder[U1] = c1.uencoder
    implicit val e2: TypedEncoder[U2] = c2.uencoder
    implicit val e3: TypedEncoder[U3] = c3.uencoder
    implicit val e4: TypedEncoder[U4] = c4.uencoder
    implicit val e5: TypedEncoder[U5] = c5.uencoder
    underlying.agg(c1, c2, c3, c4, c5)
  }

  /** Methods on `TypedDataset[T]` that go through a full serialization and
    * deserialization of `T`, and execute outside of the Catalyst runtime.
*/
  object deserialized {
    def mapGroups[U: TypedEncoder](f: (K1, Iterator[V]) => U): TypedDataset[U] = {
      underlying.deserialized.mapGroups(AggregatingOps.tuple1(f))
    }

    def flatMapGroups[U: TypedEncoder](f: (K1, Iterator[V]) => TraversableOnce[U]): TypedDataset[U] = {
      underlying.deserialized.flatMapGroups(AggregatingOps.tuple1(f))
    }
  }

  def pivot[P: CatalystPivotable](pivotColumn: TypedColumn[V, P]): PivotNotValues[V, TypedColumn[V,K1] :: HNil, P] =
    PivotNotValues(self, g1 :: HNil, pivotColumn)
}

/** Aggregation operations for a dataset grouped by two columns.
  *
  * Every `agg` overload delegates to [[GroupedByManyOps]] and returns a dataset of
  * tuples made of the two grouping keys followed by the aggregated values.
  *
  * @param self the dataset being grouped
  * @param g1   the first grouping column
  * @param g2   the second grouping column
  */
class GroupedBy2Ops[K1, K2, V](
  self: TypedDataset[V],
  g1: TypedColumn[V, K1],
  g2: TypedColumn[V, K2]
) {
  private def underlying = new GroupedByManyOps(self, g1 :: g2 :: HNil)

  // Encoders of the grouping keys, needed to derive the encoder of each result tuple.
  // Implicits are given explicit types: untyped implicits are only eligible
  // after their definition point and are rejected by Scala 3.
  private implicit def eg1: TypedEncoder[K1] = g1.uencoder
  private implicit def eg2: TypedEncoder[K2] = g2.uencoder

  def agg[U1](c1: TypedAggregate[V, U1]): TypedDataset[(K1, K2, U1)] = {
    implicit val e1: TypedEncoder[U1] = c1.uencoder
    underlying.agg(c1)
  }

  def agg[U1, U2](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2]): TypedDataset[(K1, K2, U1, U2)] = {
    implicit val e1: TypedEncoder[U1] = c1.uencoder
    implicit val e2: TypedEncoder[U2] = c2.uencoder
    underlying.agg(c1, c2)
  }

  def agg[U1, U2, U3](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3]): TypedDataset[(K1, K2, U1, U2, U3)] = {
    implicit val e1: TypedEncoder[U1] = c1.uencoder
    implicit val e2: TypedEncoder[U2] = c2.uencoder
    implicit val e3: TypedEncoder[U3] = c3.uencoder
    underlying.agg(c1, c2, c3)
  }

  def agg[U1, U2, U3, U4](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4]): TypedDataset[(K1, K2, U1, U2, U3, U4)] = {
    implicit val e1: TypedEncoder[U1] = c1.uencoder
    implicit val e2: TypedEncoder[U2] = c2.uencoder
    implicit val e3: TypedEncoder[U3] = c3.uencoder
    implicit val e4: TypedEncoder[U4] = c4.uencoder
    underlying.agg(c1, c2, c3, c4)
  }

  def agg[U1, U2, U3, U4, U5](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4], c5: TypedAggregate[V, U5]): TypedDataset[(K1, K2, U1, U2, U3, U4, U5)] = {
    implicit val e1: TypedEncoder[U1] = c1.uencoder
    implicit val e2: TypedEncoder[U2] = c2.uencoder
    implicit val e3: TypedEncoder[U3] = c3.uencoder
    implicit val e4: TypedEncoder[U4] = c4.uencoder
    implicit val e5: TypedEncoder[U5] = c5.uencoder
    underlying.agg(c1, c2, c3, c4, c5)
  }

  /** Methods on `TypedDataset[T]` that go through a full serialization and
    * deserialization of `T`, and execute outside of the Catalyst runtime.
    */
  object deserialized {
    def mapGroups[U: TypedEncoder](f: ((K1, K2), Iterator[V]) => U): TypedDataset[U] = {
      underlying.deserialized.mapGroups(f)
    }

    def flatMapGroups[U: TypedEncoder](f: ((K1, K2), Iterator[V]) => TraversableOnce[U]): TypedDataset[U] = {
      underlying.deserialized.flatMapGroups(f)
    }
  }

  def pivot[P: CatalystPivotable](pivotColumn: TypedColumn[V, P]):
    PivotNotValues[V, TypedColumn[V,K1] :: TypedColumn[V, K2] :: HNil, P] =
      PivotNotValues(self, g1 :: g2 :: HNil, pivotColumn)
}

private[ops] abstract class AggregatingOps[T, TK <: HList, K <: HList, KT]
  (self: TypedDataset[T], groupedBy: TK, groupingFunc: (Dataset[T], Seq[Column]) => RelationalGroupedDataset)
  (implicit
    i0: ColumnTypes.Aux[T, TK, K],
    i1: ToTraversable.Aux[TK, List, UntypedExpression[T]],
    i2: Tupler.Aux[K, KT]
  ) {
  def aggregate[TC <: HList, Out1](columns: TC)
  (implicit
    i7: TypedEncoder[Out1],
    i8: ToTraversable.Aux[TC, List, UntypedExpression[T]]
  ): TypedDataset[Out1] = {
    def expr(c: UntypedExpression[T]): Column = new Column(c.expr)

    val groupByExprs = groupedBy.toList[UntypedExpression[T]].map(expr)
    // The grouping columns are only listed explicitly when the
    // `spark.sql.retainGroupColumns` setting is false.
    val aggregates =
      if (retainGroupColumns) columns.toList[UntypedExpression[T]].map(expr)
      else groupByExprs ++ columns.toList[UntypedExpression[T]].map(expr)

    val aggregated =
      groupingFunc(self.dataset, groupByExprs)
        .agg(aggregates.head, aggregates.tail: _*)
        .as[Out1](TypedExpressionEncoder[Out1])

    TypedDataset.create[Out1](aggregated)
  }

  /** Methods on `TypedDataset[T]` that go through a full serialization and
    * deserialization of `T`, and execute outside of the Catalyst runtime.
*/
  object deserialized {
    /** Maps every group to exactly one result by wrapping `f` for `flatMapGroups`. */
    def mapGroups[U: TypedEncoder](
      f: (KT, Iterator[T]) => U
    )(implicit e: TypedEncoder[KT]): TypedDataset[U] = {
      val singleResult = (key: KT, rows: Iterator[T]) => Iterator(f(key, rows))
      flatMapGroups(singleResult)
    }

    /** Applies `f` to every group: the grouping columns are projected next to the
      * data columns, then a `MapGroups` node is built over the analyzed plan.
      */
    def flatMapGroups[U: TypedEncoder](
      f: (KT, Iterator[T]) => TraversableOnce[U]
    )(implicit e: TypedEncoder[KT]): TypedDataset[U] = {
      implicit val rowEncoder = self.encoder

      val keyCols = groupedBy.toList[UntypedExpression[T]]
      val basePlan = FramelessInternals.logicalPlan(self.dataset)

      // Key columns are appended after the data columns, which is why the
      // attributes are split below from the right-hand side.
      val projection = basePlan.output ++ keyCols.map(_.expr).map(UnresolvedAlias(_))
      val planWithKeys = Project(projection, basePlan)
      val analyzedPlan = FramelessInternals.executePlan(self.dataset, planWithKeys).analyzed

      val keyAttributes = analyzedPlan.output.takeRight(keyCols.size)
      val dataAttributes = analyzedPlan.output.dropRight(keyCols.size)

      val groupsNode = MapGroups(
        f,
        keyAttributes,
        dataAttributes,
        analyzedPlan
      )(TypedExpressionEncoder[KT], TypedExpressionEncoder[T], TypedExpressionEncoder[U])

      val result = FramelessInternals.mkDataset(
        self.dataset.sqlContext,
        groupsNode,
        TypedExpressionEncoder[U]
      )

      TypedDataset.create(result)
    }
  }

  // Reads the `spark.sql.retainGroupColumns` setting, defaulting to "true".
  private def retainGroupColumns: Boolean =
    self.dataset.sqlContext.getConf("spark.sql.retainGroupColumns", "true").toBoolean

  def pivot[P: CatalystPivotable](pivotColumn: TypedColumn[T, P]): PivotNotValues[T, TK, P] =
    PivotNotValues(self, groupedBy, pivotColumn)
}

private[ops] object AggregatingOps {
  /** Utility function to help Spark with serialization of closures */
  def tuple1[K1, V, U](f: (K1, Iterator[V]) => U): (Tuple1[K1], Iterator[V]) => U = {
    (key: Tuple1[K1], rows: Iterator[V]) => f(key._1, rows)
  }
}

/** Represents a typed Pivot operation.
  */
final case class Pivot[T, GroupedColumns <: HList, PivotType, Values <: HList](
  ds: TypedDataset[T],
  groupedBy: GroupedColumns,
  pivotedBy: TypedColumn[T, PivotType],
  values: Values
) {

  object agg extends ProductArgs {
    /** Groups by `groupedBy`, pivots on `pivotedBy` restricted to `values`, then
      * applies the given aggregate columns.
      */
    def applyProduct[AggrColumns <: HList, AggrColumnTypes <: HList, GroupedColumnTypes <: HList, NumValues <: Nat, TypesForPivotedValues <: HList, TypesForPivotedValuesOpt <: HList, OutAsHList <: HList, Out]
      (aggrColumns: AggrColumns)
      (implicit
        i0: AggregateTypes.Aux[T, AggrColumns, AggrColumnTypes],
        i1: ColumnTypes.Aux[T, GroupedColumns, GroupedColumnTypes],
        i2: Length.Aux[Values, NumValues],
        i3: Repeat.Aux[AggrColumnTypes, NumValues, TypesForPivotedValues],
        i4: Mapped.Aux[TypesForPivotedValues, Option, TypesForPivotedValuesOpt],
        i5: Prepend.Aux[GroupedColumnTypes, TypesForPivotedValuesOpt, OutAsHList],
        i6: Tupler.Aux[OutAsHList, Out],
        i7: TypedEncoder[Out]
      ): TypedDataset[Out] = {
        // Walks an HList without static element types, applying `f` to each element.
        def mapAny[X](h: HList)(f: Any => X): List[X] =
          h match {
            case HNil => Nil
            case x :: xs => f(x) :: mapAny(xs)(f)
          }

        val aggCols: Seq[Column] =
          mapAny(aggrColumns)(c => new Column(c.asInstanceOf[TypedAggregate[_,_]].expr))
        val groupCols = mapAny(groupedBy)(_.asInstanceOf[TypedColumn[_, _]].untyped)
        val pivotValues = mapAny[Any](values)(identity)

        val pivoted = ds.dataset.toDF()
          .groupBy(groupCols: _*)
          .pivot(pivotedBy.untyped.toString, pivotValues)

        val aggregated = pivoted
          .agg(aggCols.head, aggCols.tail:_*)
          .as[Out](TypedExpressionEncoder[Out])

        TypedDataset.create(aggregated)
      }
  }
}

/** A pivot whose values have not been chosen yet; `onProduct` fixes them. */
final case class PivotNotValues[T, GroupedColumns <: HList, PivotType](
  ds: TypedDataset[T],
  groupedBy: GroupedColumns,
  pivotedBy: TypedColumn[T, PivotType]
) extends ProductArgs {

  def onProduct[Values <: HList](values: Values)(
    implicit validValues: ToList[Values, PivotType] // validValues: FilterNot.Aux[Values, PivotType, HNil] // did not work
  ): Pivot[T, GroupedColumns, PivotType, Values] = Pivot(ds, groupedBy, pivotedBy, values)
}

================================================
FILE: dataset/src/main/scala/frameless/ops/RelationalGroupsOps.scala
================================================ package frameless package ops import org.apache.spark.sql.{Column, Dataset, RelationalGroupedDataset} import shapeless.ops.hlist.{Mapped, Prepend, ToTraversable, Tupler} import shapeless.{::, HList, HNil, ProductArgs} /** * @param groupingFunc functions used to group elements, can be cube or rollup * @tparam T the original `TypedDataset's` type T * @tparam TK all columns chosen for aggregation * @tparam K individual columns' types as HList * @tparam KT individual columns' types as Tuple */ private[ops] abstract class RelationalGroupsOps[T, TK <: HList, K <: HList, KT] (self: TypedDataset[T], groupedBy: TK, groupingFunc: (Dataset[T], Seq[Column]) => RelationalGroupedDataset) (implicit i0: ColumnTypes.Aux[T, TK, K], i1: ToTraversable.Aux[TK, List, UntypedExpression[T]], i2: Tupler.Aux[K, KT] ) extends AggregatingOps(self, groupedBy, groupingFunc){ object agg extends ProductArgs { /** * @tparam TC resulting columns after aggregation function * @tparam C individual columns' types as HList * @tparam OptK columns' types mapped to Option * @tparam Out0 OptK columns appended to C * @tparam Out1 output type */ def applyProduct[TC <: HList, C <: HList, OptK <: HList, Out0 <: HList, Out1] (columns: TC) (implicit i3: AggregateTypes.Aux[T, TC, C], // shares individual columns' types after agg function as HList i4: Mapped.Aux[K, Option, OptK], // maps all original columns' types to Option i5: Prepend.Aux[OptK, C, Out0], // concatenates Option columns with those resulting from applying agg function i6: Tupler.Aux[Out0, Out1], // converts resulting HList into Tuple for output type i7: TypedEncoder[Out1], // proof that there is `TypedEncoder` for the output type i8: ToTraversable.Aux[TC, List, UntypedExpression[T]] // allows converting this HList to ordinary List ): TypedDataset[Out1] = { aggregate[TC, Out1](columns) } } } private[ops] abstract class RelationalGroups1Ops[K1, V](self: TypedDataset[V], g1: TypedColumn[V, K1]) { 
protected def underlying: RelationalGroupsOps[V, ::[TypedColumn[V, K1], HNil], ::[K1, HNil], Tuple1[K1]]
  // Makes the grouping column's encoder implicitly available to the methods below.
  private implicit def eg1 = g1.uencoder

  // Each `agg` overload below follows the same shape: it puts the `uencoder` of every
  // aggregate column into implicit scope and forwards to `underlying.agg`.
  // NOTE(review): presumably the implicit encoders are what lets the encoder of the
  // result tuple be derived - confirm against TypedEncoder.
  // The grouping key appears as `Option[K1]` in the result.

  /** Aggregates the groups with one aggregate column. */
  def agg[U1](c1: TypedAggregate[V, U1]): TypedDataset[(Option[K1], U1)] = {
    implicit val e1 = c1.uencoder
    underlying.agg(c1)
  }

  /** Aggregates the groups with two aggregate columns. */
  def agg[U1, U2](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2]): TypedDataset[(Option[K1], U1, U2)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder
    underlying.agg(c1, c2)
  }

  /** Aggregates the groups with three aggregate columns. */
  def agg[U1, U2, U3](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3]): TypedDataset[(Option[K1], U1, U2, U3)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder
    underlying.agg(c1, c2, c3)
  }

  /** Aggregates the groups with four aggregate columns. */
  def agg[U1, U2, U3, U4](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4]): TypedDataset[(Option[K1], U1, U2, U3, U4)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder
    underlying.agg(c1, c2, c3, c4)
  }

  /** Aggregates the groups with five aggregate columns. */
  def agg[U1, U2, U3, U4, U5](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4], c5: TypedAggregate[V, U5]): TypedDataset[(Option[K1], U1, U2, U3, U4, U5)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder; implicit val e5 = c5.uencoder
    underlying.agg(c1, c2, c3, c4, c5)
  }

  /** Methods on `TypedDataset[T]` that go through a full serialization and
    * deserialization of `T`, and execute outside of the Catalyst runtime.
*/
  object deserialized {
    // `AggregatingOps.tuple1` adapts `f`, which takes a bare `K1` key, for `underlying`,
    // whose key type is `Tuple1[K1]`.
    def mapGroups[U: TypedEncoder](f: (K1, Iterator[V]) => U): TypedDataset[U] = {
      underlying.deserialized.mapGroups(AggregatingOps.tuple1(f))
    }

    // Same key adaptation as `mapGroups`, for a function producing several results per group.
    def flatMapGroups[U: TypedEncoder](f: (K1, Iterator[V]) => TraversableOnce[U]): TypedDataset[U] = {
      underlying.deserialized.flatMapGroups(AggregatingOps.tuple1(f))
    }
  }

  /** Starts a pivot on `pivotColumn`, keeping `g1` as the only grouping column. */
  def pivot[P: CatalystPivotable](pivotColumn: TypedColumn[V, P]): PivotNotValues[V, TypedColumn[V,K1] :: HNil, P] =
    PivotNotValues(self, g1 :: HNil, pivotColumn)
}

// Fixed-arity facade for two grouping columns `g1` and `g2`.
private[ops] abstract class RelationalGroups2Ops[K1, K2, V](self: TypedDataset[V], g1: TypedColumn[V, K1], g2: TypedColumn[V, K2]) {
  protected def underlying: RelationalGroupsOps[V, ::[TypedColumn[V, K1], ::[TypedColumn[V, K2], HNil]], ::[K1, ::[K2, HNil]], (K1, K2)]
  // Make the grouping columns' encoders implicitly available to the methods below.
  private implicit def eg1 = g1.uencoder
  private implicit def eg2 = g2.uencoder

  // Each `agg` overload puts the `uencoder` of every aggregate column into implicit scope
  // and forwards to `underlying.agg`; both grouping keys appear as `Option` in the result.

  /** Aggregates the groups with one aggregate column. */
  def agg[U1](c1: TypedAggregate[V, U1]): TypedDataset[(Option[K1], Option[K2], U1)] = {
    implicit val e1 = c1.uencoder
    underlying.agg(c1)
  }

  /** Aggregates the groups with two aggregate columns. */
  def agg[U1, U2](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2]): TypedDataset[(Option[K1], Option[K2], U1, U2)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder
    underlying.agg(c1, c2)
  }

  /** Aggregates the groups with three aggregate columns. */
  def agg[U1, U2, U3](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3]): TypedDataset[(Option[K1], Option[K2], U1, U2, U3)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder
    underlying.agg(c1, c2, c3)
  }

  /** Aggregates the groups with four aggregate columns. */
  def agg[U1, U2, U3, U4](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4]): TypedDataset[(Option[K1], Option[K2], U1, U2, U3, U4)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder
    underlying.agg(c1 , c2 , c3 , c4)
  }

  /** Aggregates the groups with five aggregate columns. */
  def agg[U1, U2, U3, U4, U5](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4], c5: TypedAggregate[V, U5]):
TypedDataset[(Option[K1], Option[K2], U1, U2, U3, U4, U5)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder; implicit val e5 = c5.uencoder
    underlying.agg(c1, c2, c3, c4, c5)
  }

  /** Methods on `TypedDataset[T]` that go through a full serialization and
    * deserialization of `T`, and execute outside of the Catalyst runtime.
    */
  object deserialized {
    // The key is already the tuple `(K1, K2)`, so `f` is forwarded unchanged.
    def mapGroups[U: TypedEncoder](f: ((K1, K2), Iterator[V]) => U): TypedDataset[U] = {
      underlying.deserialized.mapGroups(f)
    }

    // Same as `mapGroups`, for a function producing several results per group.
    def flatMapGroups[U: TypedEncoder](f: ((K1, K2), Iterator[V]) => TraversableOnce[U]): TypedDataset[U] = {
      underlying.deserialized.flatMapGroups(f)
    }
  }

  /** Starts a pivot on `pivotColumn`, keeping `g1` and `g2` as the grouping columns. */
  def pivot[P: CatalystPivotable](pivotColumn: TypedColumn[V, P]):
    PivotNotValues[V, TypedColumn[V,K1] :: TypedColumn[V, K2] :: HNil, P] =
      PivotNotValues(self, g1 :: g2 :: HNil, pivotColumn)
}

/** Relational grouping over an HList of columns that groups with `Dataset.rollup`. */
class RollupManyOps[T, TK <: HList, K <: HList, KT](self: TypedDataset[T], groupedBy: TK)
  (implicit
    i0: ColumnTypes.Aux[T, TK, K],
    i1: ToTraversable.Aux[TK, List, UntypedExpression[T]],
    i2: Tupler.Aux[K, KT]
  ) extends RelationalGroupsOps[T, TK, K, KT](self, groupedBy, (dataset, cols) => dataset.rollup(cols: _*))

/** Rollup over one grouping column. Note that `underlying` is a `def`: every access builds a new `RollupManyOps`. */
class Rollup1Ops[K1, V](self: TypedDataset[V], g1: TypedColumn[V, K1]) extends RelationalGroups1Ops(self, g1) {
  override protected def underlying = new RollupManyOps(self, g1 :: HNil)
}

/** Rollup over two grouping columns. Note that `underlying` is a `def`: every access builds a new `RollupManyOps`. */
class Rollup2Ops[K1, K2, V](self: TypedDataset[V], g1: TypedColumn[V, K1], g2: TypedColumn[V, K2]) extends RelationalGroups2Ops(self, g1, g2) {
  override protected def underlying = new RollupManyOps(self, g1 :: g2 :: HNil)
}

/** Relational grouping over an HList of columns that groups with `Dataset.cube`. */
class CubeManyOps[T, TK <: HList, K <: HList, KT](self: TypedDataset[T], groupedBy: TK)
  (implicit
    i0: ColumnTypes.Aux[T, TK, K],
    i1: ToTraversable.Aux[TK, List, UntypedExpression[T]],
    i2: Tupler.Aux[K, KT]
  ) extends RelationalGroupsOps[T, TK, K, KT](self, groupedBy, (dataset, cols) => dataset.cube(cols: _*))

/** Cube over one grouping column. */
class Cube1Ops[K1, V](self: TypedDataset[V], g1: TypedColumn[V, K1])
extends RelationalGroups1Ops(self, g1) {
  // A `def`: every access builds a new `CubeManyOps`.
  override protected def underlying = new CubeManyOps(self, g1 :: HNil)
}

/** Cube over two grouping columns. */
class Cube2Ops[K1, K2, V](self: TypedDataset[V], g1: TypedColumn[V, K1], g2: TypedColumn[V, K2]) extends RelationalGroups2Ops(self, g1, g2) {
  // A `def`: every access builds a new `CubeManyOps`.
  override protected def underlying = new CubeManyOps(self, g1 :: g2 :: HNil)
}

================================================
FILE: dataset/src/main/scala/frameless/ops/Repeat.scala
================================================
package frameless
package ops

import shapeless.{HList, Nat, Succ}
import shapeless.ops.hlist.Prepend

/** Typeclass supporting repeating L-typed HLists N times.
  *
  * Repeat[Int :: String :: HNil, Nat._2].Out =:=
  * Int :: String :: Int :: String :: HNil
  *
  * Instances exist for N >= 1 only; there is no instance for `Nat._0`.
  *
  * By Jeremy Smith. To be replaced by `shapeless.ops.hlist.Repeat`
  * once https://github.com/milessabin/shapeless/pull/730 is published.
  */
trait Repeat[L <: HList, N <: Nat] {
  /** The HList type obtained by repeating `L` `N` times. */
  type Out <: HList
}

object Repeat {
  /** Exposes the `Out` member as a type parameter. */
  type Aux[L <: HList, N <: Nat, Out0 <: HList] = Repeat[L, N] { type Out = Out0 }

  /** Base case: repeating `L` once is `L` itself. */
  implicit def base[L <: HList]: Aux[L, Nat._1, L] = new Repeat[L, Nat._1] {
    type Out = L
  }

  /** Inductive case: repeating `Succ[Prev]` times prepends `L` to the result of repeating `Prev` times. */
  implicit def succ[L <: HList, Prev <: Nat, PrevOut <: HList, P <: HList]
    (implicit
      i0: Aux[L, Prev, PrevOut],
      i1: Prepend.Aux[L, PrevOut, P]
    ): Aux[L, Succ[Prev], P] = new Repeat[L, Succ[Prev]] {
      type Out = P
    }
}

================================================
FILE: dataset/src/main/scala/frameless/ops/SmartProject.scala
================================================
package frameless
package ops

import shapeless.ops.hlist.ToTraversable
import shapeless.ops.record.{Keys, SelectAll, Values}
import shapeless.{HList, LabelledGeneric}

import scala.annotation.implicitNotFound

@implicitNotFound(msg = "Cannot prove that ${T} can be projected to ${U}.
Perhaps not all member names and types of ${U} are the same in ${T}?")
case class SmartProject[T: TypedEncoder, U: TypedEncoder](apply: TypedDataset[T] => TypedDataset[U])

object SmartProject {
  /**
    * Derives the type-safe projection from `T` to `U`, provided that:
    *  (a) a LabelledGeneric can be derived for both `T` and `U` (e.g., both are case classes), and
    *  (b) every member of `U` has a member in `T` with the same name and the same type.
    *
    * The resulting projection selects, by name, the columns of `T` listed in the keys of `U`
    * and re-types the result as `U`.
    *
    * @param i0 the LabelledGeneric derived for T
    * @param i1 the LabelledGeneric derived for U
    * @param i2 the keys of U
    * @param i3 selects all the values from T using the keys of U
    * @param i4 selects all the values of LabelledGeneric[U]
    * @param i5 proof that U and the projection of T have the same type
    * @param i6 allows for traversing the keys of U
    * @tparam T the original type T
    * @tparam U the projected type U
    * @tparam TRec shapeless' Record representation of T
    * @tparam TProj the projection of T using the keys of U
    * @tparam URec shapeless' Record representation of U
    * @tparam UVals the values of U as an HList
    * @tparam UKeys the keys of U as an HList
    * @return a projection if it exists
    */
  implicit def deriveProduct[T: TypedEncoder, U: TypedEncoder, TRec <: HList, TProj <: HList, URec <: HList, UVals <: HList, UKeys <: HList]
    (implicit
      i0: LabelledGeneric.Aux[T, TRec],
      i1: LabelledGeneric.Aux[U, URec],
      i2: Keys.Aux[URec, UKeys],
      i3: SelectAll.Aux[TRec, UKeys, TProj],
      i4: Values.Aux[URec, UVals],
      i5: UVals =:= TProj,
      i6: ToTraversable.Aux[UKeys, Seq, Symbol]
    ): SmartProject[T,U] = {
      // Selects the columns named after U's members and re-encodes the rows as U.
      def project(source: TypedDataset[T]): TypedDataset[U] = {
        val fieldNames = implicitly[Keys.Aux[URec, UKeys]].apply().to[Seq].map(_.name)
        val selected = fieldNames.map(name => source.dataset.col(name))
        val projected = source.dataset.toDF().select(selected: _*).as[U](TypedExpressionEncoder[U])
        TypedDataset.create(projected)
      }

      SmartProject[T, U](project _)
    }
}

================================================
FILE: dataset/src/main/scala/frameless/syntax/package.scala
================================================
package frameless

package
object syntax extends FramelessSyntax {
  implicit val DefaultSparkDelay: SparkDelay[Job] = Job.framelessSparkDelayForJob
}

================================================
FILE: dataset/src/main/scala/org/apache/spark/sql/FramelessInternals.scala
================================================
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.catalyst.expressions.{Alias, CreateStruct}
import org.apache.spark.sql.catalyst.expressions.{Expression, NamedExpression}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.types._
import org.apache.spark.sql.types.ObjectType
import scala.reflect.ClassTag

/** Accessors for Spark SQL members, declared inside the `org.apache.spark.sql` package. */
object FramelessInternals {

  /** The `ObjectType` wrapping the runtime class of `A`. */
  def objectTypeFor[A](implicit classTag: ClassTag[A]): ObjectType = {
    val runtimeClass = classTag.runtimeClass
    ObjectType(runtimeClass)
  }

  /** Resolves `colNames` against the analyzed plan of `ds`.
    *
    * @throws AnalysisException when the name cannot be resolved
    */
  def resolveExpr(ds: Dataset[_], colNames: Seq[String]): NamedExpression = {
    val analyzedPlan = ds.toDF().queryExecution.analyzed
    analyzedPlan.resolve(colNames, ds.sparkSession.sessionState.analyzer.resolver) match {
      case Some(resolved) =>
        resolved
      case None =>
        throw new AnalysisException(
          s"""Cannot resolve column name "$colNames" among (${ds.schema.fieldNames.mkString(", ")})""")
    }
  }

  /** The Catalyst expression behind `column`. */
  def expr(column: Column): Expression = column.expr

  /** The logical plan of `ds`. */
  def logicalPlan(ds: Dataset[_]): LogicalPlan = ds.logicalPlan

  /** Builds a `QueryExecution` for `plan` in the session of `ds`. */
  def executePlan(ds: Dataset[_], plan: LogicalPlan): QueryExecution = {
    val sessionState = ds.sparkSession.sessionState
    sessionState.executePlan(plan)
  }

  /** Projects the analyzed `plan` onto two structs: `_1` holds the first `leftPlan.output.length`
    * output attributes and `_2` holds the last `rightPlan.output.length` ones.
    */
  def joinPlan(ds: Dataset[_], plan: LogicalPlan, leftPlan: LogicalPlan, rightPlan: LogicalPlan): LogicalPlan = {
    val analyzed = executePlan(ds, plan).analyzed
    val leftAttributes = analyzed.output.take(leftPlan.output.length)
    val rightAttributes = analyzed.output.takeRight(rightPlan.output.length)

    val leftStruct = Alias(CreateStruct(leftAttributes), "_1")()
    val rightStruct = Alias(CreateStruct(rightAttributes), "_2")()

    Project(List(leftStruct, rightStruct), analyzed)
  }

  /** Constructs a `Dataset` directly from a logical plan and an encoder. */
  def mkDataset[T](sqlContext: SQLContext, plan: LogicalPlan, encoder: Encoder[T]): Dataset[T] =
    new Dataset(sqlContext, plan, encoder)

  /** Forwards to `Dataset.ofRows`. */
  def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame =
    Dataset.ofRows(sparkSession, logicalPlan)

  // because org.apache.spark.sql.types.UserDefinedType is private[spark]
  type UserDefinedType[A >: Null] = org.apache.spark.sql.types.UserDefinedType[A]

  // below only tested in SelfJoinTests. colLeft and colRight are equivalent to col outside of joins
  // - via files (codegen) forces doGenCode eval.
  /** Expression to tag columns from the left hand side of join expression. */
  case class DisambiguateLeft[T](tagged: Expression) extends Expression with NonSQLExpression {
    // Type, evaluation and generated code all come from the wrapped expression.
    def dataType: DataType = tagged.dataType
    def eval(input: InternalRow): Any = tagged.eval(input)
    // Reported as non-nullable regardless of `tagged`.
    def nullable: Boolean = false
    def children: Seq[Expression] = List(tagged)
    protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = tagged.genCode(ctx)
    protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression =
      copy(newChildren.head)
  }

  /** Expression to tag columns from the right hand side of join expression.
*/
  case class DisambiguateRight[T](tagged: Expression) extends Expression with NonSQLExpression {
    // Evaluation is delegated to the wrapped expression.
    def eval(input: InternalRow): Any = tagged.eval(input)
    // Reported as non-nullable regardless of `tagged`.
    def nullable: Boolean = false
    def children: Seq[Expression] = tagged :: Nil
    def dataType: DataType = tagged.dataType
    // Generated code is the wrapped expression's; `ev` is not used.
    protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = tagged.genCode(ctx)
    protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = copy(newChildren.head)
  }
}

================================================
FILE: dataset/src/main/scala/org/apache/spark/sql/reflection/package.scala
================================================
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.ScalaReflection.{
  cleanUpReflectionObjects,
  getClassFromType,
  localTypeOf
}
import org.apache.spark.sql.types.{
  BinaryType,
  BooleanType,
  ByteType,
  CalendarIntervalType,
  DataType,
  Decimal,
  DecimalType,
  DoubleType,
  FloatType,
  IntegerType,
  LongType,
  NullType,
  ObjectType,
  ShortType
}
import org.apache.spark.unsafe.types.CalendarInterval

/**
 * Copy of spark's pre 3.4 reflection based encoding
 */
package object reflection {

  /**
   * Copy of the pre 3.5.0 `isNativeType`, which https://issues.apache.org/jira/browse/SPARK-44343 removed.
   *
   * @return true for the null, boolean, byte, short, integer, long, float, double, binary and
   *         calendar-interval types; false for every other data type
   */
  def isNativeType(dt: DataType): Boolean = dt match {
    case NullType | BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType | BinaryType | CalendarIntervalType =>
      true
    case _ => false
  }

  // Monitor guarding the subtype checks in `isSubtype`.
  private object ScalaSubtypeLock

  val universe: scala.reflect.runtime.universe.type = scala.reflect.runtime.universe

  import universe._

  /**
   * Returns the Spark SQL DataType for a given scala type. Where this is not an exact mapping
   * to a native type, an ObjectType is returned. Unlike the original Spark code, arrays get no
   * special handling here apart from `Array[Byte]`, which maps to BinaryType.
   *
   * Unlike `schemaFor`, this function doesn't do any massaging of types into the Spark SQL type
   * system.
As a result, ObjectType will be returned for things like boxed Integers
   */
  def dataTypeFor[T: TypeTag]: DataType = dataTypeFor(localTypeOf[T])

  /**
   * Tests `tpe1 <:< tpe2` while holding `ScalaSubtypeLock`.
   *
   * Synchronize to prevent concurrent usage of `<:<` operator.
   * This operator is not thread safe in any current version of scala; i.e.
   * (2.11.12, 2.12.10, 2.13.0-M5).
   *
   * See https://github.com/scala/bug/issues/10766
   */
  private[sql] def isSubtype(tpe1: `Type`, tpe2: `Type`): Boolean = {
    ScalaSubtypeLock.synchronized {
      tpe1 <:< tpe2
    }
  }

  // Maps the dealiased type to a DataType. The cases are tried top to bottom and the first
  // matching subtype check wins; anything unmatched becomes an ObjectType of its class.
  private def dataTypeFor(tpe: `Type`): DataType = cleanUpReflectionObjects {
    tpe.dealias match {
      case t if isSubtype(t, definitions.NullTpe) => NullType
      case t if isSubtype(t, definitions.IntTpe) => IntegerType
      case t if isSubtype(t, definitions.LongTpe) => LongType
      case t if isSubtype(t, definitions.DoubleTpe) => DoubleType
      case t if isSubtype(t, definitions.FloatTpe) => FloatType
      case t if isSubtype(t, definitions.ShortTpe) => ShortType
      case t if isSubtype(t, definitions.ByteTpe) => ByteType
      case t if isSubtype(t, definitions.BooleanTpe) => BooleanType
      case t if isSubtype(t, localTypeOf[Array[Byte]]) => BinaryType
      case t if isSubtype(t, localTypeOf[CalendarInterval]) => CalendarIntervalType
      case t if isSubtype(t, localTypeOf[Decimal]) => DecimalType.SYSTEM_DEFAULT
      case _ =>
        /* original Spark code checked for scala.Array vs ObjectType,
           this (and associated code) isn't needed due to TypedEncoders arrayEncoder */
        // Note: the class is looked up from the original `tpe`, not the dealiased one.
        val clazz = getClassFromType(tpe)
        ObjectType(clazz)
    }
  }

}

================================================
FILE: dataset/src/main/spark-3/frameless/MapGroups.scala
================================================
package frameless

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, MapGroups => SMapGroups}

// Builds Spark's logical `MapGroups` node (imported as `SMapGroups`) for this Spark version.
object MapGroups {
  def apply[K: Encoder, T: Encoder, U: Encoder](
    func: (K, Iterator[T]) => TraversableOnce[U],
    groupingAttributes: Seq[Attribute],
    dataAttributes:
Seq[Attribute],
    child: LogicalPlan
  ): LogicalPlan =
    // All arguments are forwarded unchanged.
    SMapGroups(func, groupingAttributes, dataAttributes, child)
}

================================================
FILE: dataset/src/main/spark-3.4+/frameless/MapGroups.scala
================================================
package frameless

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, MapGroups => SMapGroups}

// Builds Spark's logical `MapGroups` node (imported as `SMapGroups`) for Spark 3.4+, whose
// factory takes one extra argument between `dataAttributes` and `child`.
object MapGroups {
  def apply[K: Encoder, T: Encoder, U: Encoder](
    func: (K, Iterator[T]) => TraversableOnce[U],
    groupingAttributes: Seq[Attribute],
    dataAttributes: Seq[Attribute],
    child: LogicalPlan
  ): LogicalPlan =
    SMapGroups(
      func,
      groupingAttributes,
      dataAttributes,
      Seq(), // #698 - no order given
      child
    )
}

================================================
FILE: dataset/src/test/resources/log4j.properties
================================================
log4j.logger.akka.event.slf4j.Slf4jLogger=ERROR
log4j.logger.akka.event.slf4j=ERROR
log4j.logger.akka.remote.EndpointWriter=ERROR
log4j.logger.akka.remote.RemoteActorRefProvider$RemotingTerminator=ERROR
log4j.logger.com.anjuke.dm=ERROR
log4j.logger.io.netty.bootstrap.ServerBootstrap=ERROR
log4j.logger.io.netty.buffer.ByteBufUtil=ERROR
log4j.logger.io.netty.buffer.PooledByteBufAllocator=ERROR
log4j.logger.io.netty.channel.AbstractChannel=ERROR
log4j.logger.io.netty.channel.ChannelInitializer=ERROR
log4j.logger.io.netty.channel.ChannelOutboundBuffer=ERROR
log4j.logger.io.netty.channel.DefaultChannelPipeline=ERROR
log4j.logger.io.netty.channel.MultithreadEventLoopGroup=ERROR
log4j.logger.io.netty.channel.nio.AbstractNioChannel=ERROR
log4j.logger.io.netty.channel.nio.NioEventLoop=ERROR
log4j.logger.io.netty.channel.socket.nio.NioServerSocketChannel=ERROR
log4j.logger.io.netty.util.concurrent.DefaultPromise.rejectedExecution=ERROR
log4j.logger.io.netty.util.concurrent.DefaultPromise=ERROR
log4j.logger.io.netty.util.concurrent.GlobalEventExecutor=ERROR
log4j.logger.io.netty.util.concurrent.SingleThreadEventExecutor=ERROR log4j.logger.io.netty.util.internal.logging.InternalLoggerFactory=ERROR log4j.logger.io.netty.util.internal.PlatformDependent0=ERROR log4j.logger.io.netty.util.internal.PlatformDependent=ERROR log4j.logger.io.netty.util.internal.SystemPropertyUtil=ERROR log4j.logger.io.netty.util.internal.ThreadLocalRandom=ERROR log4j.logger.io.netty.util.NetUtil=ERROR log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=ERROR log4j.logger.org.apache.hadoop.conf.Configuration=ERROR log4j.logger.org.apache.hadoop.fs.FileSystem=ERROR log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=ERROR log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR log4j.logger.org.apache.hadoop.mapred.JobConf=ERROR log4j.logger.org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedPartitioner=ERROR log4j.logger.org.apache.hadoop.metrics2.impl.MetricsSystemImpl=ERROR log4j.logger.org.apache.hadoop.metrics2.lib.Interns=ERROR log4j.logger.org.apache.hadoop.metrics2.lib.MetricsSourceBuilder=ERROR log4j.logger.org.apache.hadoop.metrics2.lib.MutableMetricsFactory=ERROR log4j.logger.org.apache.hadoop.security.authentication.util.KerberosName=ERROR log4j.logger.org.apache.hadoop.security.Groups=ERROR log4j.logger.org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback=ERROR log4j.logger.org.apache.hadoop.security.SecurityUtil=ERROR log4j.logger.org.apache.hadoop.security.ShellBasedUnixGroupsMapping=ERROR log4j.logger.org.apache.hadoop.security.UserGroupInformation=ERROR log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR log4j.logger.org.apache.hadoop.util.ShutdownHookManager=ERROR log4j.logger.org.apache.spark.broadcast.TorrentBroadcast=ERROR log4j.logger.org.apache.spark.ContextCleaner=ERROR log4j.logger.org.apache.spark.executor.Executor=ERROR log4j.logger.org.apache.spark.HeartbeatReceiver=ERROR log4j.logger.org.apache.spark.HttpFileServer=ERROR 
log4j.logger.org.apache.spark.HttpServer=ERROR log4j.logger.org.apache.spark.MapOutputTrackerMaster=ERROR log4j.logger.org.apache.spark.MapOutputTrackerMasterEndpoint=ERROR log4j.logger.org.apache.spark.metrics.MetricsSystem=ERROR log4j.logger.org.apache.spark.network.client.TransportClientFactory=ERROR log4j.logger.org.apache.spark.network.netty.NettyBlockTransferService=ERROR log4j.logger.org.apache.spark.network.protocol.MessageDecoder=ERROR log4j.logger.org.apache.spark.network.protocol.MessageEncoder=ERROR log4j.logger.org.apache.spark.network.server.OneForOneStreamManager=ERROR log4j.logger.org.apache.spark.network.server.TransportServer=ERROR log4j.logger.org.apache.spark.network.TransportContext=ERROR log4j.logger.org.apache.spark.network.util.JavaUtils=ERROR log4j.logger.org.apache.spark.rdd.CoGroupedRDD=ERROR log4j.logger.org.apache.spark.rdd.SubtractedRDD=ERROR log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR log4j.logger.org.apache.spark.rpc.akka.AkkaRpcEnv$$anonfun$actorRef$lzycompute$1$1$$anon$1=ERROR log4j.logger.org.apache.spark.scheduler.DAGScheduler=ERROR log4j.logger.org.apache.spark.scheduler.OutputCommitCoordinator$OutputCommitCoordinatorEndpoint=ERROR log4j.logger.org.apache.spark.scheduler.TaskSchedulerImpl=ERROR log4j.logger.org.apache.spark.scheduler.TaskSetManager=ERROR log4j.logger.org.apache.spark.SecurityManager=ERROR log4j.logger.org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter=ERROR log4j.logger.org.apache.spark.SparkContext=ERROR log4j.logger.org.apache.spark.SparkEnv=ERROR log4j.logger.org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate=ERROR 
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner=ERROR log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.package$ExpressionCanonicalizer=ERROR log4j.logger.org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer=ERROR log4j.logger.org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys=ERROR log4j.logger.org.apache.spark.sql.execution.aggregate.SortBasedAggregate=ERROR log4j.logger.org.apache.spark.sql.execution.aggregate.TungstenAggregate=ERROR log4j.logger.org.apache.spark.sql.execution.Exchange=ERROR log4j.logger.org.apache.spark.sql.execution.joins.ShuffledHashOuterJoin=ERROR log4j.logger.org.apache.spark.sql.SQLContext$$anon$1=ERROR log4j.logger.org.apache.spark.sql.SQLContext$$anon$2=ERROR log4j.logger.org.apache.spark.SSLOptions=ERROR log4j.logger.org.apache.spark.storage.BlockManager=ERROR log4j.logger.org.apache.spark.storage.BlockManagerInfo=ERROR log4j.logger.org.apache.spark.storage.BlockManagerMaster=ERROR log4j.logger.org.apache.spark.storage.BlockManagerMasterEndpoint=ERROR log4j.logger.org.apache.spark.storage.BlockManagerSlaveEndpoint=ERROR log4j.logger.org.apache.spark.storage.DiskBlockManager=ERROR log4j.logger.org.apache.spark.storage.MemoryStore=ERROR log4j.logger.org.apache.spark.storage.ShuffleBlockFetcherIterator=ERROR log4j.logger.org.apache.spark.ui.SparkUI=ERROR log4j.logger.org.apache.spark.unsafe.map.BytesToBytesMap=ERROR log4j.logger.org.apache.spark.unsafe.memory.TaskMemoryManager=ERROR log4j.logger.org.apache.spark.util.AkkaUtils=ERROR log4j.logger.org.apache.spark.util.ClosureCleaner=ERROR log4j.logger.org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter=ERROR 
log4j.logger.org.apache.spark.util.Utils=ERROR log4j.logger.org.apache.spark=ERROR log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR log4j.logger.org.eclipse.jetty=ERROR log4j.logger.org.spark-project.jetty.http.AbstractGenerator=ERROR log4j.logger.org.spark-project.jetty.http.HttpGenerator=ERROR log4j.logger.org.spark-project.jetty.http.MimeTypes=ERROR log4j.logger.org.spark-project.jetty.io.AbstractBuffer=ERROR log4j.logger.org.spark-project.jetty.io.nio=ERROR log4j.logger.org.spark-project.jetty.server.AbstractConnector=ERROR log4j.logger.org.spark-project.jetty.server.bio.SocketConnector=ERROR log4j.logger.org.spark-project.jetty.server.handler.AbstractHandler=ERROR log4j.logger.org.spark-project.jetty.server.handler.ContextHandler=ERROR log4j.logger.org.spark-project.jetty.server.handler.ContextHandlerCollection=ERROR log4j.logger.org.spark-project.jetty.server.handler.DefaultHandler=ERROR log4j.logger.org.spark-project.jetty.server.handler.ErrorHandler=ERROR log4j.logger.org.spark-project.jetty.server.handler.GzipHandler=ERROR log4j.logger.org.spark-project.jetty.server.handler.ResourceHandler=ERROR log4j.logger.org.spark-project.jetty.server.Server=ERROR log4j.logger.org.spark-project.jetty.server=ERROR log4j.logger.org.spark-project.jetty.servlet.DefaultServlet=ERROR log4j.logger.org.spark-project.jetty.servlet.Holder=ERROR log4j.logger.org.spark-project.jetty.servlet.ServletHandler=ERROR log4j.logger.org.spark-project.jetty.servlet.ServletHolder=ERROR log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR log4j.logger.org.spark-project.jetty.util.component.AggregateLifeCycle=ERROR log4j.logger.org.spark-project.jetty.util.component.Container=ERROR log4j.logger.org.spark-project.jetty.util.IO=ERROR log4j.logger.org.spark-project.jetty.util.log=ERROR log4j.logger.org.spark-project.jetty.util.resource.FileResource=ERROR log4j.logger.org.spark-project.jetty.util.resource.JarFileResource=ERROR 
log4j.logger.org.spark-project.jetty.util.resource.JarResource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.Resource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.URLResource=ERROR
log4j.logger.org.spark-project.jetty.util.StringUtil=ERROR
log4j.logger.org.spark-project.jetty.util.thread.QueuedThreadPool=ERROR
log4j.logger.org.spark-project.jetty.util.thread.Timeout=ERROR
log4j.logger.org.spark-project.jetty=ERROR
log4j.logger.Remoting=ERROR

================================================
FILE: dataset/src/test/resources/log4j2.properties
================================================
# Set to debug or trace if log4j initialization is failing
status = warn

# Name of the configuration
name = ConsoleAppender

# Console appender configuration
appender.console.type = Console
appender.console.name = consoleLogger
appender.console.layout.type = PatternLayout
# Use `yyyy` (calendar year). `YYYY` is the week-based year, which prints the wrong
# year for dates in the first or last days of a calendar year.
appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss} [%t] %-5p %c:%L - %m%n
appender.console.target = SYSTEM_OUT

# Root logger level
rootLogger.level = error

# Root logger referring to console appender
rootLogger.appenderRef.stdout.ref = consoleLogger

logger.spark.name = org.apache.spark
logger.spark.level = warn

logger.hadoop.name = org.apache.hadoop
logger.hadoop.level = warn

# To debug expressions:
#logger.codegen.name = org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator
#logger.codegen.level = debug

================================================
FILE: dataset/src/test/scala/frameless/AsTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

class AsTests extends TypedDatasetSuite {
  test("as[X2[A, B]]") {
    // Converting a dataset of pairs with `as[X2[A, B]]` must give the same rows as
    // building the `X2` values directly.
    def prop[A, B](data: Vector[(A, B)])(
      implicit
      eab: TypedEncoder[(A, B)],
      ex2: TypedEncoder[X2[A, B]]
    ): Prop = {
      val dataset = TypedDataset.create(data)

      val dataset2 = dataset.as[X2[A,B]]().collect().run().toVector
      val data2 = data.map { case (a, b) => X2(a, b) }

      dataset2 ?= data2
    }
check(forAll(prop[Int, Int] _))
    check(forAll(prop[String, String] _))
    check(forAll(prop[String, Int] _))
    check(forAll(prop[Long, Int] _))
    check(forAll(prop[Seq[Seq[Option[Seq[Long]]]], Seq[Int]] _))
    check(forAll(prop[Seq[Option[Seq[String]]], Seq[Int]] _))
  }

  test("as[X2[X2[A, B], C]") {
    // Same round trip as the flat case, with the first field itself a nested pair.
    def prop[A, B, C](data: Vector[(A, B, C)])(
      implicit
      eab: TypedEncoder[((A, B), C)],
      ex2: TypedEncoder[X2[X2[A, B], C]]
    ): Prop = {
      // Re-nest each flat triple as ((a, b), c) and build the expected X2 values from it.
      val nested = data.map { case (a, b, c) => ((a, b), c) }
      val expected = nested.map { case ((a, b), c) => X2(X2(a, b), c) }

      val actual = TypedDataset.create(nested).as[X2[X2[A,B], C]]().collect().run().toVector

      actual ?= expected
    }

    check(forAll(prop[String, Int, Int] _))
    check(forAll(prop[String, Int, String] _))
    check(forAll(prop[String, String, Int] _))
    check(forAll(prop[Long, Int, String] _))
    check(forAll(prop[Seq[Seq[Option[Seq[Long]]]], Seq[Int], Option[Seq[Option[Int]]]] _))
    check(forAll(prop[Seq[Option[Seq[String]]], Seq[Int], Seq[Option[String]]] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/BitwiseTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import org.scalatest.matchers.should.Matchers

class BitwiseTests extends TypedDatasetSuite with Matchers {

  /**
    * Reference implementations of the bitwise operations, used to compute the expected
    * results that the frameless column operators are compared against. Scala provides a
    * `Numeric` typeclass for arithmetic, but there seems to be no equivalent typeclass for
    * bitwise operations on Byte, Short, Int and Long (the types supported in Catalyst),
    * so the tests define their own.
    */
  trait CatalystBitwise4Tests[A]{
    def bitwiseAnd(a1: A, a2: A): A
    def bitwiseOr(a1: A, a2: A): A
    def bitwiseXor(a1: A, a2: A): A
    // Symbolic aliases of the named operations above.
    def &(a1: A, a2: A): A = bitwiseAnd(a1, a2)
    def |(a1: A, a2: A): A = bitwiseOr(a1, a2)
    def ^(a1: A, a2: A): A = bitwiseXor(a1, a2)
  }

  object
CatalystBitwise4Tests {
    // Byte and Short operands are widened to Int by the Scala operators, so the result
    // is narrowed back to the operand type.
    implicit val framelessbyteBitwise : CatalystBitwise4Tests[Byte] = new CatalystBitwise4Tests[Byte] {
      def bitwiseOr(a1: Byte, a2: Byte) : Byte = (a1 | a2).toByte
      def bitwiseAnd(a1: Byte, a2: Byte): Byte = (a1 & a2).toByte
      def bitwiseXor(a1: Byte, a2: Byte): Byte = (a1 ^ a2).toByte
    }
    implicit val framelessshortBitwise : CatalystBitwise4Tests[Short] = new CatalystBitwise4Tests[Short] {
      def bitwiseOr(a1: Short, a2: Short) : Short = (a1 | a2).toShort
      def bitwiseAnd(a1: Short, a2: Short): Short = (a1 & a2).toShort
      def bitwiseXor(a1: Short, a2: Short): Short = (a1 ^ a2).toShort
    }
    implicit val framelessintBitwise : CatalystBitwise4Tests[Int] = new CatalystBitwise4Tests[Int] {
      def bitwiseOr(a1: Int, a2: Int) : Int = a1 | a2
      def bitwiseAnd(a1: Int, a2: Int): Int = a1 & a2
      def bitwiseXor(a1: Int, a2: Int): Int = a1 ^ a2
    }
    implicit val framelesslongBitwise : CatalystBitwise4Tests[Long] = new CatalystBitwise4Tests[Long] {
      def bitwiseOr(a1: Long, a2: Long) : Long = a1 | a2
      def bitwiseAnd(a1: Long, a2: Long): Long = a1 & a2
      def bitwiseXor(a1: Long, a2: Long): Long = a1 ^ a2
    }
  }

  import CatalystBitwise4Tests._

  test("bitwiseAND") {
    // Compares the frameless AND operators (named, symbolic with a literal, symbolic with a
    // column) against the reference implementation for one pair of values.
    def prop[A: TypedEncoder: CatalystBitwise](a: A, b: A)(
      implicit catalystBitwise4Tests: CatalystBitwise4Tests[A]
    ): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)

      val result = implicitly[CatalystBitwise4Tests[A]].bitwiseAnd(a, b)
      val resultSymbolic = implicitly[CatalystBitwise4Tests[A]].&(a, b)
      val got = df.select(df.col('a) bitwiseAND df.col('b)).collect().run()
      val gotSymbolic = df.select(df.col('a) & b).collect().run()
      val symbolicCol2Col = df.select(df.col('a) & df.col('b)).collect().run()
      val canCast = df.select(df.col('a).cast[Long] & 0L).collect().run()
      canCast should contain theSameElementsAs Seq.fill[Long](gotSymbolic.size)(0L)

      // A bare `x ?= y` statement only builds a Prop and throws it away, so previously just
      // the last comparison was checked. Combine them so that every comparison counts.
      Prop.all(
        result ?= resultSymbolic,
        symbolicCol2Col ?= (result :: Nil),
        got ?= (result :: Nil),
        gotSymbolic ?= (resultSymbolic :: Nil)
      )
    }

    check(prop[Byte] _)
    check(prop[Short] _)
    check(prop[Int] _)
check(prop[Long] _) } test("bitwiseOR") { def prop[A: TypedEncoder: CatalystBitwise](a: A, b: A)( implicit catalystBitwise4Tests: CatalystBitwise4Tests[A] ): Prop = { val df = TypedDataset.create(X2(a, b) :: Nil) val result = implicitly[CatalystBitwise4Tests[A]].bitwiseOr(a, b) val resultSymbolic = implicitly[CatalystBitwise4Tests[A]].|(a, b) val got = df.select(df.col('a) bitwiseOR df.col('b)).collect().run() val gotSymbolic = df.select(df.col('a) | b).collect().run() val symbolicCol2Col = df.select(df.col('a) | df.col('b)).collect().run() val canCast = df.select(df.col('a).cast[Long] | -1L).collect().run() canCast should contain theSameElementsAs Seq.fill[Long](gotSymbolic.size)(-1L) result ?= resultSymbolic symbolicCol2Col ?= (result :: Nil) got ?= (result :: Nil) gotSymbolic ?= (resultSymbolic :: Nil) } check(prop[Byte] _) check(prop[Short] _) check(prop[Int] _) check(prop[Long] _) } test("bitwiseXOR") { def prop[A: TypedEncoder: CatalystBitwise](a: A, b: A)( implicit catalystBitwise4Tests: CatalystBitwise4Tests[A] ): Prop = { val df = TypedDataset.create(X2(a, b) :: Nil) val result = implicitly[CatalystBitwise4Tests[A]].bitwiseXor(a, b) val resultSymbolic = implicitly[CatalystBitwise4Tests[A]].^(a, b) result ?= resultSymbolic val got = df.select(df.col('a) bitwiseXOR df.col('b)).collect().run() val gotSymbolic = df.select(df.col('a) ^ b).collect().run() val zeroes = df.select(df.col('a) ^ df.col('a)).collect().run() zeroes should contain theSameElementsAs Seq.fill[Long](gotSymbolic.size)(0L) got ?= (result :: Nil) gotSymbolic ?= (resultSymbolic :: Nil) } check(prop[Byte] _) check(prop[Short] _) check(prop[Int] _) check(prop[Long] _) } } ================================================ FILE: dataset/src/test/scala/frameless/CastTests.scala ================================================ package frameless import org.scalacheck.{Arbitrary, Gen, Prop} import org.scalacheck.Prop._ class CastTests extends TypedDatasetSuite { def prop[A: TypedEncoder, B: 
// Exercises every supported cast, comparing the Catalyst result with the Scala conversion.
test("cast") {
  // numericToDecimal
  check(prop[BigDecimal, BigDecimal](identity) _)
  check(prop[Byte, BigDecimal](x => BigDecimal.valueOf(x.toLong)) _)
  check(prop[Double, BigDecimal](BigDecimal.valueOf) _)
  check(prop[Int, BigDecimal](x => BigDecimal.valueOf(x.toLong)) _)
  check(prop[Long, BigDecimal](BigDecimal.valueOf) _)
  check(prop[Short, BigDecimal](x => BigDecimal.valueOf(x.toLong)) _)

  // numericToByte
  check(prop[BigDecimal, Byte](_.toByte) _)
  check(prop[Byte, Byte](identity) _)
  check(prop[Double, Byte](_.toByte) _)
  check(prop[Int, Byte](_.toByte) _)
  check(prop[Long, Byte](_.toByte) _)
  check(prop[Short, Byte](_.toByte) _)

  // numericToDouble
  check(prop[BigDecimal, Double](_.toDouble) _)
  check(prop[Byte, Double](_.toDouble) _)
  check(prop[Double, Double](identity) _)
  check(prop[Int, Double](_.toDouble) _)
  check(prop[Long, Double](_.toDouble) _)
  check(prop[Short, Double](_.toDouble) _)

  // numericToInt
  check(prop[BigDecimal, Int](_.toInt) _)
  check(prop[Byte, Int](_.toInt) _)
  check(prop[Double, Int](_.toInt) _)
  check(prop[Int, Int](identity) _)
  check(prop[Long, Int](_.toInt) _)
  check(prop[Short, Int](_.toInt) _)

  // numericToLong
  check(prop[BigDecimal, Long](_.toLong) _)
  check(prop[Byte, Long](_.toLong) _)
  check(prop[Double, Long](_.toLong) _)
  check(prop[Int, Long](_.toLong) _)
  check(prop[Long, Long](identity) _)
  check(prop[Short, Long](_.toLong) _)

  // numericToShort
  check(prop[BigDecimal, Short](_.toShort) _)
  check(prop[Byte, Short](_.toShort) _)
  check(prop[Double, Short](_.toShort) _)
  check(prop[Int, Short](_.toShort) _)
  check(prop[Long, Short](_.toShort) _)
  check(prop[Short, Short](identity) _)

  // castToString
  // TODO compare without trailing zeros
  // check(prop[BigDecimal, String](_.toString()) _)
  check(prop[Byte, String](_.toString) _)
  check(prop[Double, String](_.toString) _)
  check(prop[Int, String](_.toString) _)
  check(prop[Long, String](_.toString) _)
  check(prop[Short, String](_.toString) _)

  // stringToBoolean
  val truthy = Set("t", "true", "y", "yes", "1")
  val falsy = Set("f", "false", "n", "no", "0")

  // Expected outcome of the cast: a recognised literal maps to its boolean, anything else to None.
  def parseBoolean(str: String): Option[Boolean] =
    str match {
      case s if truthy(s) => Some(true)
      case s if falsy(s)  => Some(false)
      case _              => None
    }

  // Mixes the recognised literals with arbitrary strings so all three outcomes are generated.
  val booleanLikeStrings = Gen.oneOf(
    Gen.oneOf(truthy.toSeq),
    Gen.oneOf(falsy.toSeq),
    Arbitrary.arbitrary[String]
  )

  check(forAll(booleanLikeStrings)(prop(parseBoolean)))

  // xxxToBoolean
  check(prop[BigDecimal, Boolean](_ != BigDecimal(0)) _)
  check(prop[Byte, Boolean](_ != 0) _)
  check(prop[Double, Boolean](_ != 0) _)
  check(prop[Int, Boolean](_ != 0) _)
  check(prop[Long, Boolean](_ != 0L) _)
  check(prop[Short, Boolean](_ != 0) _)

  // booleanToNumeric
  check(prop[Boolean, BigDecimal](x => if (x) BigDecimal(1) else BigDecimal(0)) _)
  check(prop[Boolean, Byte](x => if (x) 1 else 0) _)
  check(prop[Boolean, Double](x => if (x) 1.0f else 0.0f) _)
  check(prop[Boolean, Int](x => if (x) 1 else 0) _)
  check(prop[Boolean, Long](x => if (x) 1L else 0L) _)
  check(prop[Boolean, Short](x => if (x) 1 else 0) _)
}
// Selecting a nested column through `colMany('a, 'a)` must return the inner-most `a` value.
test("select colMany") {
  def prop[A: TypedEncoder](x: X2[X2[A, A], A]): Prop = {
    val dataset = TypedDataset.create(List(x))
    val nestedColumn = dataset.colMany('a, 'a)
    val selected = dataset.select(nestedColumn).collect().run()

    selected ?= List(x.a.a)
  }

  check(prop[Int] _)
  check(prop[X2[Int, Int]] _)
  check(prop[X2[X2[Int, Int], Int]] _)
}
check(forAll(prop[Char] _)) check(forAll(prop[Byte] _)) check(forAll(prop[Boolean] _)) check(forAll(prop[String] _)) check(forAll(prop[SQLDate] _)) check(forAll(prop[SQLTimestamp] _)) check(forAll(prop[Option[Int]] _)) check(forAll(prop[Option[Long]] _)) check(forAll(prop[Option[Double]] _)) check(forAll(prop[Option[Float]] _)) check(forAll(prop[Option[Short]] _)) check(forAll(prop[Option[Byte]] _)) check(forAll(prop[Option[Boolean]] _)) check(forAll(prop[Option[String]] _)) check(forAll(prop[Option[SQLDate]] _)) check(forAll(prop[Option[SQLTimestamp]] _)) check(forAll(prop[Vector[Int]] _)) check(forAll(prop[List[Int]] _)) check(forAll(prop[Seq[Int]] _)) check(forAll(prop[Vector[Char]] _)) check(forAll(prop[List[Char]] _)) check(forAll(prop[Seq[Char]] _)) check(forAll(prop[Set[Char]] _)) check(forAll(prop[Seq[Seq[Seq[Char]]]] _)) check(forAll(prop[Seq[Option[String]]] _)) check(forAll(prop[Seq[Map[String, Long]]] _)) check(forAll(prop[Seq[Map[String, X2[Option[Long], Vector[Boolean]]]]] _)) check(forAll(prop[Option[Int]] _)) check(forAll(prop[Vector[X2[Int, Int]]] _)) check(forAll(prop[X1[Vector[Food]]] _)) check(forAll(prop[X1[Vector[X1[Food]]]] _)) check(forAll(prop[X1[Vector[X1[Int]]]] _)) // TODO this doesn't work, and never worked... 
/** Shared round-trip properties: create a dataset from `data`, collect it, compare with `data`. */
object CollectTests {
  import frameless.syntax._

  /** Collected rows must equal the input, element by element and in order. */
  def prop[A: TypedEncoder : ClassTag](data: Vector[A])(implicit c: SparkSession): Prop = {
    val collected = TypedDataset.create(data).collect().run().toVector
    collected ?= data
  }

  /**
   * Same round trip for rows wrapping an `Array`. Arrays are compared with `sameElements`
   * because `Array` equality is reference-based.
   */
  def propArray[A: TypedEncoder : ClassTag](data: Vector[X1[Array[A]]])(implicit c: SparkSession): Prop = {
    val collected = TypedDataset.create(data).collect().run().toVector

    // `zip` truncates to the shorter side, so without the size check a result with missing
    // rows would pass vacuously.
    val sameSize = collected.size == data.size
    val sameRows = collected.zip(data).forall {
      case (X1(l), X1(r)) => l.sameElements(r)
    }

    Prop(sameSize && sameRows)
  }
}
// Checks the four ordering operators on typed columns against the Scala `Ordering` of the values.
test("select('a < 'b, 'a <= 'b, 'a > 'b, 'a >= 'b)") {
  import OrderingImplicits._

  def prop[A: TypedEncoder: CatalystOrdered: Ordering](a: A, b: A): Prop = {
    val ds = TypedDataset.create(List(X2(a, b)))
    val left = ds.col('a)
    val right = ds.col('b)

    // Each operator is selected twice: once against a column, once against a literal.
    val selected = ds.selectMany(
      left < right,
      left < b,
      left <= right,
      left <= b,
      left > right,
      left > b,
      left >= right,
      left >= b
    )
    val actual = selected.collect().run().toVector

    val lt = a < b
    val lte = a <= b
    val gt = a > b
    val gte = a >= b

    actual ?= Vector((lt, lt, lte, lte, gt, gt, gte, gte))
  }

  check(forAll(prop[Int] _))
  check(forAll(prop[Boolean] _))
  check(forAll(prop[Byte] _))
  check(forAll(prop[Short] _))
  check(forAll(prop[Long] _))
  check(forAll(prop[Float] _))
  check(forAll(prop[Double] _))
  check(forAll(prop[SQLDate] _))
  check(forAll(prop[SQLTimestamp] _))
  check(forAll(prop[String] _))
  check(forAll(prop[Instant] _))
  check(forAll(prop[Duration] _))
  check(forAll(prop[Period] _))
}
// Builds a comparison between the string rendering of the typed column `_1` and the rendering
// of the same column obtained from the underlying untyped dataset.
// NOTE(review): the `?=` below only constructs a ScalaCheck Prop; it is the last expression of
// the test body and is not passed to `check`, so it looks like the comparison is never
// evaluated. Confirm whether the two renderings are really expected to be equal before
// wrapping it in `check(...)` or switching to a matcher.
test("toString") {
  val t = TypedDataset.create((1, 2) :: Nil)
  t('_1).toString ?= t.dataset.col("_1").toString()
}
// `like` on typed string columns must agree with the untyped Spark `Column.like`,
// and must not compile for non-string columns.
test("like") {
  val spark = session
  import spark.implicits._

  check {
    forAll { (a: String, b: String) =>
      val typedDs = TypedDataset.create(List(X2(a, b)))
      val viaFrameless = typedDs
        .select(typedDs('a).like(a), typedDs('b).like(a))
        .collect()
        .run()
        .toList

      val df = typedDs.toDF()
      val viaSpark = df
        .select(df("a").like(a), df("b").like(a))
        .as[(Boolean, Boolean)]
        .collect()
        .toList

      viaFrameless ?= viaSpark
    }
  }

  // The name `ds` is referenced from inside the illTyped snippets below.
  val ds = TypedDataset.create((1, false, 2.0) :: Nil)

  illTyped("""ds.select(ds('_1).like("foo"))""")
  illTyped("""ds.select(ds('_2).like("foo"))""")
  illTyped("""ds.select(ds('_3).like("foo"))""")
}
// `endsWith` on typed string columns (column and literal argument) must agree with the untyped
// Spark `Column.endsWith`, and must not compile for non-string columns.
test("endsWith") {
  val spark = session
  import spark.implicits._

  check {
    forAll { (a: String, b: String) =>
      val typedDs = TypedDataset.create(List(X2(a, b)))
      val typedEndsWith = typedDs
        .select(typedDs('a).endsWith(typedDs('b)), typedDs('b).endsWith(a))
        .collect()
        .run()
        .toList

      val df = typedDs.toDF()
      val untypedEndsWith = df
        .select(df("a").endsWith(df("b")), df("b").endsWith(a))
        .as[(Boolean, Boolean)]
        .collect()
        .toList

      typedEndsWith ?= untypedEndsWith
    }
  }

  // The name `ds` is referenced from inside the illTyped snippets below.
  val ds = TypedDataset.create((1, false, 2.0) :: Nil)

  illTyped("""ds.select(ds('_1).endsWith("foo"))""")
  illTyped("""ds.select(ds('_2).endsWith("foo"))""")
  illTyped("""ds.select(ds('_3).endsWith("foo"))""")
}
// A row of (Timestamp, java.util.Date, java.sql.Date) must read back unchanged both through
// the untyped DataFrame rows and through the typed collect.
test("Consistency with Spark internal date/time representation") {
  val ts = Timestamp.from(Instant.parse("1990-01-01T01:00:00.000Z"))
  val date = Date.from(Instant.parse("1991-01-01T02:00:00.000Z"))
  val sqlDate = SqlDate.valueOf(LocalDate.parse("1991-02-01"))

  val input = Seq(X3(ts, date, sqlDate))
  val ds: TypedDataset[X3[Timestamp, Date, SqlDate]] = TypedDataset.create(input)

  // Untyped view: the second column is read as a timestamp and converted back to java.util.Date.
  val untypedRows: Seq[(Timestamp, Date, SqlDate)] =
    ds.dataset
      .toDF()
      .collect()
      .map { row =>
        val asTimestamp = row.getTimestamp(0)
        val asDate = Date.from(row.getTimestamp(1).toInstant)
        val asSqlDate = row.getDate(2)
        (asTimestamp, asDate, asSqlDate)
      }
      .toSeq

  untypedRows shouldEqual Seq((ts, date, sqlDate))

  // Typed view: the collected records must equal the input.
  val typedRows: Seq[X3[Timestamp, Date, SqlDate]] = ds.collect().run().toSeq

  typedRows shouldEqual input
}
// `asCol` of a single-column dataset must support numeric operators with literals.
test("asCol with numeric operators") {
  def prop(a: Seq[Long]) = {
    val ds: TypedDataset[Long] = TypedDataset.create(a)
    val addend = 2L
    val factor = 5L

    val actual: Seq[(Long, Long, Long)] =
      ds.select(ds.asCol, ds.asCol + addend, ds.asCol * factor)
        .collect()
        .run()

    val expected: Seq[(Long, Long, Long)] =
      a.map(x => (x, x + addend, x * factor))

    expected ?= actual
  }

  check(forAll(prop _))
}
// `opt.map` applies a column operation inside an optional column, leaving None untouched.
test("opt") {
  val data = (Option(1L), Option(2L)) :: (None, None) :: Nil
  val ds = TypedDataset.create(data)

  // The second projection reads `_2` with `+ 1`, matching the expectation below. Selecting
  // `_1` with `+ 2` would only pass by coincidence (1 + 2 == 2 + 1) and leave `_2` untested.
  val rs = ds
    .select(ds('_1).opt.map(_ * 2), ds('_2).opt.map(_ + 1))
    .collect()
    .run()

  val expected = data.map { case (x, y) => (x.map(_ * 2), y.map(_ + 1)) }

  rs shouldEqual expected
}
"ds.col(_.a.toInt)" shouldNot typeCheck } } ================================================ FILE: dataset/src/test/scala/frameless/ColumnViaLambdaTests.scala ================================================ package frameless import org.scalatest.matchers.should.Matchers import shapeless.test.illTyped case class MyClass1(a: Int, b: String, c: MyClass2, g: Option[MyClass4]) case class MyClass2(d: Long, e: MyClass3) case class MyClass3(f: Double) case class MyClass4(h: Boolean) final class ColumnViaLambdaTests extends TypedDatasetSuite with Matchers { def ds = { TypedDataset.create(Seq( MyClass1(1, "2", MyClass2(3L, MyClass3(7.0D)), Some(MyClass4(true))), MyClass1(4, "5", MyClass2(6L, MyClass3(8.0D)), None))) } test("col(_.a)") { val col = TypedColumn[MyClass1, Int](_.a) ds.select(col).collect().run() shouldEqual Seq(1, 4) } test("col(x => x.a") { val col = TypedColumn[MyClass1, Int](x => x.a) ds.select(col).collect().run() shouldEqual Seq(1, 4) } test("col((x: MyClass1) => x.a") { val col = TypedColumn { (x: MyClass1) => x.a } ds.select(col).collect().run() shouldEqual Seq(1, 4) } test("col((x: MyClass1) => x.c.e.f") { val col = TypedColumn { (x: MyClass1) => x.c.e.f } ds.select(col).collect().run() shouldEqual Seq(7.0D, 8.0D) } test("col(_.c.d)") { val col = TypedColumn[MyClass1, Long](_.c.d) ds.select(col).collect().run() shouldEqual Seq(3L, 6L) } test("col(_.c.e.f)") { val col = TypedColumn[MyClass1, Double](_.c.e.f) ds.select(col).collect().run() shouldEqual Seq(7.0D, 8.0D) } test("col(_.c.d) as int does not compile (is long)") { illTyped("TypedColumn[MyClass1, Int](_.c.d)") } test("col(_.g.h does not compile") { val col = ds.col(_.g) // the path "ends" at .g (can't access h) illTyped("""ds.col(_.g.h)""") } test("col(_.a.toString) does not compile") { illTyped("""ds.col(_.a.toString)""") } test("col(_.a.toString.size) does not compile") { illTyped("""ds.col(_.a.toString.size)""") } test("col((x: MyClass1) => x.toString.size) does not compile") { 
// A lambda that is not a plain field path must be rejected by `ds.col`.
test("col(x => java.lang.Math.abs(x.a)) does not compile") {
  // Call `col` on `ds`: a bare `col(...)` is ill-typed simply because no `col` is in scope,
  // which would make the test pass without exercising the lambda restriction at all.
  illTyped("""ds.col(x => java.lang.Math.abs(x.a))""")
}
// Round-trips datasets whose single row is a Vector (plain, optional, wrapped) or wraps a Vector.
test("vector fields") {
  // A one-row dataset built from `value` must collect back to `value`.
  def roundTrips[A: TypedEncoder](value: A): Prop =
    TypedDataset.create(Seq(value)).collect().run().head ?= value

  def prop[T: Arbitrary: TypedEncoder] = forAll {
    (d1: Vector[T], d2: Vector[Option[T]], d3: Vector[X1[T]], d4: Vector[X1[Option[T]]], d5: X1[Vector[T]]) =>
      roundTrips(d1) &&
        roundTrips(d2) &&
        roundTrips(d3) &&
        roundTrips(d4) &&
        roundTrips(d5)
  }

  check(prop[Boolean])
  check(prop[Byte])
  check(prop[Char])
  check(prop[Short])
  check(prop[Int])
  check(prop[Long])
  check(prop[Float])
  check(prop[Double])
  check(prop[String])
}
// Reading a two-column DataFrame as a one-field record must fail with IllegalStateException.
test("not aligned columns should throw an exception") {
  val twoColumns = Vector(X2(1, 2))
  val df = TypedDataset.create(twoColumns).dataset.toDF()

  assertThrows[IllegalStateException] {
    TypedDataset.createUnsafe[X1[Int]](df).show().run()
  }
}
f.drop[XDifferentColumnName]('j)""" } } test("fail to compile on added column name") { val f: TypedDataset[X] = TypedDataset.create(X(1, 1, false) :: X(1, 1, false) :: X(1, 10, false) :: Nil) illTyped { """val fNew: TypedDataset[XAdded] = f.drop[XAdded]('j)""" } } test("remove column in the middle") { val f: TypedDataset[X] = TypedDataset.create(X(1, 1, false) :: X(1, 1, false) :: X(1, 10, false) :: Nil) val fNew: TypedDataset[XGood] = f.drop[XGood] fNew.collect().run().foreach(xg => assert(xg === XGood(1, false))) } test("drop four columns") { def prop[A: TypedEncoder](value: A): Prop = { val d5 = TypedDataset.create(X5(value, value, value, value, value) :: Nil) val d4 = d5.drop[X4[A, A, A, A]] val d3 = d4.drop[X3[A, A, A]] val d2 = d3.drop[X2[A, A]] val d1 = d2.drop[X1[A]] X1(value) ?= d1.collect().run().head } check(prop[Int] _) check(prop[Long] _) check(prop[String] _) check(prop[SQLDate] _) check(prop[Option[X1[Boolean]]] _) } } object DropTest { case class X(i: Int, j: Int, k: Boolean) case class XMissing(i: Int) case class XDifferentColumnName(ij: Int, k: Boolean) case class XAdded(i: Int, j: Int, k: Boolean, l: Int) case class XGood(i: Int, k: Boolean) } ================================================ FILE: dataset/src/test/scala/frameless/DropTupledTest.scala ================================================ package frameless import org.scalacheck.Prop import org.scalacheck.Prop._ class DropTupledTest extends TypedDatasetSuite { test("drop five columns") { def prop[A: TypedEncoder](value: A): Prop = { val d5 = TypedDataset.create(X5(value, value, value, value, value) :: Nil) val d4 = d5.dropTupled('a) //drops first column val d3 = d4.dropTupled('_4) //drops last column val d2 = d3.dropTupled('_2) //drops middle column val d1 = d2.dropTupled('_2) Tuple1(value) ?= d1.collect().run().head } check(prop[Int] _) check(prop[Long] _) check(prop[String] _) check(prop[SQLDate] _) check(prop[Option[X1[Boolean]]] _) } test("drop first column") { def prop[A: 
TypedEncoder](value: A): Prop = { val d3 = TypedDataset.create(X3(value, value, value) :: Nil) val d2 = d3.dropTupled('a) (value, value) ?= d2.collect().run().head } check(prop[Int] _) check(prop[Long] _) check(prop[String] _) check(prop[SQLDate] _) check(prop[Option[X1[Boolean]]] _) } test("drop middle column") { def prop[A: TypedEncoder](value: A): Prop = { val d3 = TypedDataset.create(X3(value, value, value) :: Nil) val d2 = d3.dropTupled('b) (value, value) ?= d2.collect().run().head } check(prop[Int] _) check(prop[Long] _) check(prop[String] _) check(prop[SQLDate] _) check(prop[Option[X1[Boolean]]] _) } test("drop last column") { def prop[A: TypedEncoder](value: A): Prop = { val d3 = TypedDataset.create(X3(value, value, value) :: Nil) val d2 = d3.dropTupled('c) (value, value) ?= d2.collect().run().head } check(prop[Int] _) check(prop[Long] _) check(prop[String] _) check(prop[SQLDate] _) check(prop[Option[X1[Boolean]]] _) } } ================================================ FILE: dataset/src/test/scala/frameless/EncoderTests.scala ================================================ package frameless import scala.collection.immutable.Set import org.scalatest.matchers.should.Matchers object EncoderTests { case class Foo(s: Seq[(Int, Int)]) case class Bar(s: Set[(Int, Int)]) case class InstantRow(i: java.time.Instant) case class DurationRow(d: java.time.Duration) case class PeriodRow(p: java.time.Period) } class EncoderTests extends TypedDatasetSuite with Matchers { import EncoderTests._ test("It should encode deeply nested collections") { implicitly[TypedEncoder[Seq[Foo]]] implicitly[TypedEncoder[Seq[Bar]]] implicitly[TypedEncoder[Set[Foo]]] } test("It should encode java.time.Instant") { implicitly[TypedEncoder[InstantRow]] } test("It should encode java.time.Duration") { implicitly[TypedEncoder[DurationRow]] } test("It should encode java.time.Period") { implicitly[TypedEncoder[PeriodRow]] } } ================================================ FILE: 
dataset/src/test/scala/frameless/ExplodeTests.scala
================================================
package frameless

import frameless.functions.CatalystExplodableCollection
import org.scalacheck.{Arbitrary, Prop}
import org.scalacheck.Prop.forAll
import org.scalacheck.Prop._

import scala.reflect.ClassTag

/**
 * Checks `explode` / `explodeMap` on typed datasets against the same
 * flattening performed with plain Scala collections.
 */
class ExplodeTests extends TypedDatasetSuite {

  // Type-level check only: exploding the array column of an (Int, Array[Int])
  // dataset has to be accepted as a TypedDataset[(Int, Int)].
  test("simple explode test") {
    val pairs = TypedDataset.create(Seq((1, Array(1, 2))))
    pairs.explode('_2): TypedDataset[(Int, Int)]
  }

  test("explode on vectors/list/seq") {
    // Exploding column 'a is compared, as a Vector, with flattening the
    // collections locally and wrapping every element in a Tuple1.
    def prop[F[X] <: Traversable[X] : CatalystExplodableCollection, A: TypedEncoder](xs: List[X1[F[A]]])(implicit arb: Arbitrary[F[A]], enc: TypedEncoder[F[A]]): Prop = {
      val input = TypedDataset.create(xs)
      val expected = xs.flatMap(_.a).map(Tuple1(_)).toVector
      val actual = input.explode('a).collect().run().toVector

      actual ?= expected
    }

    check(forAll(prop[Vector, Long] _))
    check(forAll(prop[Seq, Int] _))
    check(forAll(prop[Vector, Char] _))
    check(forAll(prop[Vector, String] _))
    check(forAll(prop[List, Long] _))
    check(forAll(prop[List, Int] _))
    check(forAll(prop[List, Char] _))
    check(forAll(prop[List, String] _))
  }

  test("explode on arrays") {
    // Same comparison as for the collection test above, with Array as the column type.
    def prop[A: TypedEncoder: ClassTag](xs: List[X1[Array[A]]]): Prop = {
      val input = TypedDataset.create(xs)
      val expected = xs.flatMap(_.a).map(Tuple1(_)).toVector
      val actual = input.explode('a).collect().run().toVector

      actual ?= expected
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }

  test("explode on maps") {
    // Each map entry is expected as a (key, value) pair wrapped in a Tuple1.
    def prop[A: TypedEncoder: ClassTag, B: TypedEncoder: ClassTag](xs: List[X1[Map[A, B]]]): Prop = {
      val input = TypedDataset.create(xs)
      val expected = xs
        .flatMap(_.a.toList)
        .map { case (key, value) => Tuple1((key, value)) }
        .toVector
      val actual = input.explodeMap('a).collect().run().toVector

      actual ?= expected
    }

    check(forAll(prop[Long, String] _))
    check(forAll(prop[Int, Long] _))
    check(forAll(prop[String, Int] _))
  }
test("explode on maps preserving other columns") {
    // Every entry of the map in column 'b is paired with the 'a value of its own row.
    def prop[K: TypedEncoder: ClassTag, A: TypedEncoder: ClassTag, B: TypedEncoder: ClassTag](xs: List[X2[K, Map[A, B]]]): Prop = {
      val input = TypedDataset.create(xs)
      val expected = (for {
        row <- xs
        entry <- row.b.toList
      } yield (row.a, entry)).toVector
      val actual = input.explodeMap('b).collect().run().toVector

      actual ?= expected
    }

    check(forAll(prop[Int, Long, String] _))
    check(forAll(prop[String, Int, Long] _))
    check(forAll(prop[Long, String, Int] _))
  }

  test("explode on maps making sure no key / value naming collision happens") {
    // The record already has fields called `key` and `value`; the exploded map
    // entry is expected as a third component next to them.
    def prop[K: TypedEncoder: ClassTag, V: TypedEncoder: ClassTag, A: TypedEncoder: ClassTag, B: TypedEncoder: ClassTag](xs: List[X3KV[K, V, Map[A, B]]]): Prop = {
      val input = TypedDataset.create(xs)
      val expected = (for {
        row <- xs
        entry <- row.c.toList
      } yield (row.key, row.value, entry)).toVector
      val actual = input.explodeMap('c).collect().run().toVector

      actual ?= expected
    }

    check(forAll(prop[String, Int, Long, String] _))
    check(forAll(prop[Long, String, Int, Long] _))
    check(forAll(prop[Int, Long, String, Int] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/FilterTests.scala
================================================
package frameless

import org.scalatest.matchers.should.Matchers
import org.scalacheck.Prop
import org.scalacheck.Prop._

/** Compares `TypedDataset.filter` with the equivalent filter on the source Vector. */
final class FilterTests extends TypedDatasetSuite with Matchers {

  test("filter('a == lit(b))") {
    // Rows whose 'a column equals `elem` must be the rows kept by a local `==` filter.
    def prop[A: TypedEncoder](elem: A, data: Vector[X1[A]])(implicit ex1: TypedEncoder[X1[A]]): Prop = {
      val typed = TypedDataset.create(data)
      val colA = typed.col('a)
      val expected = data.filter(row => row.a == elem)
      val actual = typed.filter(colA === elem).collect().run().toVector

      actual ?= expected
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }

  test("filter('a =!= lit(b))") {
    def prop[A: TypedEncoder](elem: A, data: Vector[X1[A]])(implicit ex1: TypedEncoder[X1[A]]): Prop = {
      val dataset =
TypedDataset.create(data) val A = dataset.col('a) val dataset2 = dataset.filter(A =!= elem).collect().run().toVector val data2 = data.filter(_.a != elem) dataset2 ?= data2 } check(forAll(prop[Int] _)) check(forAll(prop[String] _)) check(forAll(prop[Char] _)) check(forAll(prop[Boolean] _)) check(forAll(prop[SQLTimestamp] _)) check(forAll(prop[Vector[SQLTimestamp]] _)) } test("filter('a =!= 'b)") { def prop[A: TypedEncoder](data: Vector[X2[A, A]]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col('a) val B = dataset.col('b) val dataset2 = dataset.filter(A =!= B).collect().run().toVector val data2 = data.filter(x => x.a != x.b) dataset2 ?= data2 } check(forAll(prop[Int] _)) check(forAll(prop[String] _)) check(forAll(prop[Char] _)) check(forAll(prop[Boolean] _)) check(forAll(prop[SQLTimestamp] _)) check(forAll(prop[Vector[SQLTimestamp]] _)) } test("filter('a =!= 'b") { def prop[A: TypedEncoder](elem: A, data: Vector[X2[A,A]]): Prop = { val dataset = TypedDataset.create(data) val cA = dataset.col('a) val cB = dataset.col('b) val dataset2 = dataset.filter(cA =!= cB).collect().run().toVector val data2 = data.filter(x => x.a != x.b ) (dataset2 ?= data2).&&(dataset.filter(cA =!= cA).count().run() ?= 0) } check(forAll(prop[Int] _)) check(forAll(prop[String] _)) check(forAll(prop[Char] _)) check(forAll(prop[SQLTimestamp] _)) check(forAll(prop[Vector[SQLTimestamp]] _)) } test("filter with arithmetic expressions: addition") { check(forAll { (data: Vector[X1[Int]]) => val ds = TypedDataset.create(data) val res = ds.filter((ds('a) + 1) === (ds('a) + 1)).collect().run().toVector res ?= data }) } test("filter with values (not columns): addition") { check(forAll { (data: Vector[X1[Int]], const: Int) => val ds = TypedDataset.create(data) val res = ds.filter(ds('a) > const).collect().run().toVector res ?= data.filter(_.a > const) }) } test("filter with arithmetic expressions: multiplication") { val t = X1(1) :: X1(2) :: X1(3) :: Nil val tds: TypedDataset[X1[Int]] = 
TypedDataset.create(t) assert(tds.filter(tds('a) * 2 === 2).collect().run().toVector === Vector(X1(1))) assert(tds.filter(tds('a) * 3 === 3).collect().run().toVector === Vector(X1(1))) } test("Option equality/inequality for columns") { def prop[A <: Option[_] : TypedEncoder](a: A, b: A): Prop = { val data = X2(a, b) :: X2(a, a) :: Nil val dataset = TypedDataset.create(data) val A = dataset.col('a) val B = dataset.col('b) (data.filter(x => x.a == x.b).toSet ?= dataset.filter(A === B).collect().run().toSet). &&(data.filter(x => x.a != x.b).toSet ?= dataset.filter(A =!= B).collect().run().toSet). &&(data.filter(x => x.a == None).toSet ?= dataset.filter(A.isNone).collect().run().toSet). &&(data.filter(x => x.a == None).toSet ?= dataset.filter(A.isNotNone === false).collect().run().toSet) } check(forAll(prop[Option[Int]] _)) check(forAll(prop[Option[Boolean]] _)) check(forAll(prop[Option[SQLDate]] _)) check(forAll(prop[Option[SQLTimestamp]] _)) check(forAll(prop[Option[X1[String]]] _)) check(forAll(prop[Option[X1[X1[String]]]] _)) check(forAll(prop[Option[X1[X1[Vector[Option[Int]]]]]] _)) } test("Option equality/inequality for lit") { def prop[A <: Option[_] : TypedEncoder](a: A, b: A, cLit: A): Prop = { val data = X2(a, b) :: X2(a, cLit) :: Nil val dataset = TypedDataset.create(data) val colA = dataset.col('a) (data.filter(x => x.a == cLit).toSet ?= dataset.filter(colA === cLit).collect().run().toSet). &&(data.filter(x => x.a != cLit).toSet ?= dataset.filter(colA =!= cLit).collect().run().toSet). &&(data.filter(x => x.a == None).toSet ?= dataset.filter(colA.isNone).collect().run().toSet). 
&&(data.filter(x => x.a == None).toSet ?= dataset.filter(colA.isNotNone === false).collect().run().toSet) } check(forAll(prop[Option[Int]] _)) check(forAll(prop[Option[Boolean]] _)) check(forAll(prop[Option[SQLDate]] _)) check(forAll(prop[Option[SQLTimestamp]] _)) check(forAll(prop[Option[String]] _)) check(forAll(prop[Option[X1[String]]] _)) check(forAll(prop[Option[X1[X1[String]]]] _)) check(forAll(prop[Option[X1[X1[Vector[Option[Int]]]]]] _)) } test("Option content filter") { val data = (Option(1L), Option(2L)) :: (Option(0L), Option(1L)) :: (None, None) :: Nil val ds = TypedDataset.create(data) val l = functions.lit[Long, (Option[Long], Option[Long])](0L) val exists = ds('_1).isSome[Long](_ <= l) val forall = ds('_1).isSomeOrNone[Long](_ <= l) ds.select(exists).collect().run() shouldEqual Seq(false, true, false) ds.select(forall).collect().run() shouldEqual Seq(false, true, true) ds.filter(exists).collect().run() shouldEqual Seq(Option(0L) -> Option(1L)) ds.filter(forall).collect().run() shouldEqual Seq( Option(0L) -> Option(1L), (None -> None)) } test("filter with isin values") { def prop[A: TypedEncoder](data: Vector[X1[A]], values: Vector[A])(implicit a : CatalystIsin[A]): Prop = { val ds = TypedDataset.create(data) val res = ds.filter(ds('a).isin(values:_*)).collect().run().toVector res ?= data.filter(d => values.contains(d.a)) } check(forAll(prop[BigDecimal] _)) check(forAll(prop[Byte] _)) check(forAll(prop[Double] _)) check(forAll(prop[Float] _)) check(forAll(prop[Int] _)) check(forAll(prop[Long] _)) check(forAll(prop[Short] _)) check(forAll(prop[String] _)) } } ================================================ FILE: dataset/src/test/scala/frameless/FlattenTests.scala ================================================ package frameless import org.scalacheck.Prop import org.scalacheck.Prop.forAll import org.scalacheck.Prop._ class FlattenTests extends TypedDatasetSuite { test("simple flatten test") { val ds: TypedDataset[(Int,Option[Int])] = 
TypedDataset.create(Seq((1,Option(1)))) ds.flattenOption('_2): TypedDataset[(Int,Int)] } test("different Optional types") { def prop[A: TypedEncoder](xs: List[X1[Option[A]]]): Prop = { val tds: TypedDataset[X1[Option[A]]] = TypedDataset.create(xs) val framelessResults: Seq[Tuple1[A]] = tds.flattenOption('a).collect().run().toVector val scalaResults = xs.flatMap(_.a).map(Tuple1(_)).toVector framelessResults ?= scalaResults } check(forAll(prop[Long] _)) check(forAll(prop[Int] _)) check(forAll(prop[Char] _)) check(forAll(prop[String] _)) } } ================================================ FILE: dataset/src/test/scala/frameless/GroupByTests.scala ================================================ package frameless import frameless.functions.aggregate._ import org.scalacheck.Prop import org.scalacheck.Prop._ class GroupByTests extends TypedDatasetSuite { test("groupByMany('a).agg(sum('b))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder, Out: TypedEncoder : Numeric ](data: List[X2[A, B]])( implicit summable: CatalystSummable[B, Out], widen: B => Out ): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val datasetSumByA = dataset.groupByMany(A).agg(sum(B)).collect().run.toVector.sortBy(_._1) val sumByA = data.groupBy(_.a).map { case (k, v) => k -> v.map(_.b).map(widen).sum }.toVector.sortBy(_._1) datasetSumByA ?= sumByA } check(forAll(prop[Int, Long, Long] _)) } test("agg(sum('a))") { def prop[A: TypedEncoder : Numeric](data: List[X1[A]])( implicit summable: CatalystSummable[A, A] ): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val datasetSum = dataset.agg(sum(A)).collect().run().toVector val listSum = data.map(_.a).sum datasetSum ?= Vector(listSum) } check(forAll(prop[Long] _)) } test("agg(sum('a), sum('b))") { def prop[ A: TypedEncoder : Numeric, B: TypedEncoder : Numeric ](data: List[X2[A, B]])( implicit as: CatalystSummable[A, A], bs: CatalystSummable[B, B] ): Prop = { val 
dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val datasetSum = dataset.agg(sum(A), sum(B)).collect().run().toVector val listSumA = data.map(_.a).sum val listSumB = data.map(_.b).sum datasetSum ?= Vector((listSumA, listSumB)) } check(forAll(prop[Long, Long] _)) } test("agg(sum('a), sum('b), sum('c))") { def prop[ A: TypedEncoder : Numeric, B: TypedEncoder : Numeric, C: TypedEncoder : Numeric ](data: List[X3[A, B, C]])( implicit as: CatalystSummable[A, A], bs: CatalystSummable[B, B], cs: CatalystSummable[C, C] ): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = dataset.col[C]('c) val datasetSum = dataset.agg(sum(A), sum(B), sum(C)).collect().run().toVector val listSumA = data.map(_.a).sum val listSumB = data.map(_.b).sum val listSumC = data.map(_.c).sum datasetSum ?= Vector((listSumA, listSumB, listSumC)) } check(forAll(prop[Long, Long, Long] _)) } test("agg(sum('a), sum('b), min('c), max('d))") { def prop[ A: TypedEncoder : Numeric, B: TypedEncoder : Numeric, C: TypedEncoder : Numeric, D: TypedEncoder : Numeric ](data: List[X4[A, B, C, D]])( implicit as: CatalystSummable[A, A], bs: CatalystSummable[B, B], co: CatalystOrdered[C], fo: CatalystOrdered[D] ): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = dataset.col[C]('c) val D = dataset.col[D]('d) val datasetSum = dataset.agg(sum(A), sum(B), min(C), max(D)).collect().run().toVector val listSumA = data.map(_.a).sum val listSumB = data.map(_.b).sum val listMinC = if(data.isEmpty) implicitly[Numeric[C]].fromInt(0) else data.map(_.c).min val listMaxD = if(data.isEmpty) implicitly[Numeric[D]].fromInt(0) else data.map(_.d).max datasetSum ?= Vector(if (data.isEmpty) null else (listSumA, listSumB, listMinC, listMaxD)) } check(forAll(prop[Long, Long, Long, Int] _)) check(forAll(prop[Long, Long, Short, Short] _)) check(forAll(prop[Long, Long, Double, 
BigDecimal] _)) } test("groupBy('a).agg(sum('b))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder, Out: TypedEncoder : Numeric ](data: List[X2[A, B]])( implicit summable: CatalystSummable[B, Out], widen: B => Out ): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val datasetSumByA = dataset.groupBy(A).agg(sum(B)).collect().run.toVector.sortBy(_._1) val sumByA = data.groupBy(_.a).mapValues(_.map(_.b).map(widen).sum).toVector.sortBy(_._1) datasetSumByA ?= sumByA } check(forAll(prop[Int, Long, Long] _)) } test("groupBy('a).mapGroups('a, sum('b))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder : Numeric ](data: List[X2[A, B]]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val datasetSumByA = dataset.groupBy(A) .deserialized.mapGroups { case (a, xs) => (a, xs.map(_.b).sum) } .collect().run().toVector.sortBy(_._1) val sumByA = data.groupBy(_.a).mapValues(_.map(_.b).sum).toVector.sortBy(_._1) datasetSumByA ?= sumByA } check(forAll(prop[Int, Long] _)) } test("groupBy('a).agg(sum('b), sum('c)) to groupBy('a).agg(sum('a), sum('b), sum('a), sum('b), sum('a))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder, C: TypedEncoder, OutB: TypedEncoder : Numeric, OutC: TypedEncoder : Numeric ](data: List[X3[A, B, C]])( implicit summableB: CatalystSummable[B, OutB], summableC: CatalystSummable[C, OutC], widenb: B => OutB, widenc: C => OutC ): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = dataset.col[C]('c) val framelessSumBC = dataset .groupBy(A) .agg(sum(B), sum(C)) .collect().run.toVector.sortBy(_._1) val scalaSumBC = data.groupBy(_.a).mapValues { xs => (xs.map(_.b).map(widenb).sum, xs.map(_.c).map(widenc).sum) }.toVector.map { case (a, (b, c)) => (a, b, c) }.sortBy(_._1) val framelessSumBCB = dataset .groupBy(A) .agg(sum(B), sum(C), sum(B)) .collect().run.toVector.sortBy(_._1) val 
scalaSumBCB = data.groupBy(_.a).mapValues { xs => (xs.map(_.b).map(widenb).sum, xs.map(_.c).map(widenc).sum, xs.map(_.b).map(widenb).sum) }.toVector.map { case (a, (b1, c, b2)) => (a, b1, c, b2) }.sortBy(_._1) val framelessSumBCBC = dataset .groupBy(A) .agg(sum(B), sum(C), sum(B), sum(C)) .collect().run.toVector.sortBy(_._1) val scalaSumBCBC = data.groupBy(_.a).mapValues { xs => (xs.map(_.b).map(widenb).sum, xs.map(_.c).map(widenc).sum, xs.map(_.b).map(widenb).sum, xs.map(_.c).map(widenc).sum) }.toVector.map { case (a, (b1, c1, b2, c2)) => (a, b1, c1, b2, c2) }.sortBy(_._1) val framelessSumBCBCB = dataset .groupBy(A) .agg(sum(B), sum(C), sum(B), sum(C), sum(B)) .collect().run.toVector.sortBy(_._1) val scalaSumBCBCB = data.groupBy(_.a).mapValues { xs => (xs.map(_.b).map(widenb).sum, xs.map(_.c).map(widenc).sum, xs.map(_.b).map(widenb).sum, xs.map(_.c).map(widenc).sum, xs.map(_.b).map(widenb).sum) }.toVector.map { case (a, (b1, c1, b2, c2, b3)) => (a, b1, c1, b2, c2, b3) }.sortBy(_._1) (framelessSumBC ?= scalaSumBC) .&&(framelessSumBCB ?= scalaSumBCB) .&&(framelessSumBCBC ?= scalaSumBCBC) .&&(framelessSumBCBCB ?= scalaSumBCBCB) } check(forAll(prop[String, Long, BigDecimal, Long, BigDecimal] _)) } test("groupBy('a, 'b).agg(sum('c)) to groupBy('a, 'b).agg(sum('c),sum('c),sum('c),sum('c),sum('c))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder : Ordering, C: TypedEncoder, OutC: TypedEncoder: Numeric ](data: List[X3[A, B, C]])( implicit summableC: CatalystSummable[C, OutC], widenc: C => OutC ): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = dataset.col[C]('c) val framelessSumC = dataset .groupBy(A,B) .agg(sum(C)) .collect().run.toVector.sortBy(x => (x._1,x._2)) val scalaSumC = data.groupBy(x => (x.a,x.b)).mapValues { xs => xs.map(_.c).map(widenc).sum }.toVector.map { case ((a, b), c) => (a, b, c) }.sortBy(x => (x._1,x._2)) val framelessSumCC = dataset .groupBy(A,B) .agg(sum(C), sum(C)) 
.collect().run.toVector.sortBy(x => (x._1,x._2)) val scalaSumCC = data.groupBy(x => (x.a,x.b)).mapValues { xs => val s = xs.map(_.c).map(widenc).sum; (s,s) }.toVector.map { case ((a, b), (c1, c2)) => (a, b, c1, c2) }.sortBy(x => (x._1,x._2)) val framelessSumCCC = dataset .groupBy(A,B) .agg(sum(C), sum(C), sum(C)) .collect().run.toVector.sortBy(x => (x._1,x._2)) val scalaSumCCC = data.groupBy(x => (x.a,x.b)).mapValues { xs => val s = xs.map(_.c).map(widenc).sum; (s,s,s) }.toVector.map { case ((a, b), (c1, c2, c3)) => (a, b, c1, c2, c3) }.sortBy(x => (x._1,x._2)) val framelessSumCCCC = dataset .groupBy(A,B) .agg(sum(C), sum(C), sum(C), sum(C)) .collect().run.toVector.sortBy(x => (x._1,x._2)) val scalaSumCCCC = data.groupBy(x => (x.a,x.b)).mapValues { xs => val s = xs.map(_.c).map(widenc).sum; (s,s,s,s) }.toVector.map { case ((a, b), (c1, c2, c3, c4)) => (a, b, c1, c2, c3, c4) }.sortBy(x => (x._1,x._2)) val framelessSumCCCCC = dataset .groupBy(A,B) .agg(sum(C), sum(C), sum(C), sum(C), sum(C)) .collect().run.toVector.sortBy(x => (x._1,x._2)) val scalaSumCCCCC = data.groupBy(x => (x.a,x.b)).mapValues { xs => val s = xs.map(_.c).map(widenc).sum; (s,s,s,s,s) }.toVector.map { case ((a, b), (c1, c2, c3, c4, c5)) => (a, b, c1, c2, c3, c4, c5) }.sortBy(x => (x._1,x._2)) (framelessSumC ?= scalaSumC) && (framelessSumCC ?= scalaSumCC) && (framelessSumCCC ?= scalaSumCCC) && (framelessSumCCCC ?= scalaSumCCCC) && (framelessSumCCCCC ?= scalaSumCCCCC) } check(forAll(prop[String, Long, BigDecimal, BigDecimal] _)) } test("groupBy('a, 'b).agg(sum('c), sum('d))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder : Ordering, C: TypedEncoder, D: TypedEncoder, OutC: TypedEncoder : Numeric, OutD: TypedEncoder : Numeric ](data: List[X4[A, B, C, D]])( implicit summableC: CatalystSummable[C, OutC], summableD: CatalystSummable[D, OutD], widenc: C => OutC, widend: D => OutD ): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = 
dataset.col[C]('c) val D = dataset.col[D]('d) val datasetSumByAB = dataset .groupBy(A, B) .agg(sum(C), sum(D)) .collect().run.toVector.sortBy(x => (x._1, x._2)) val sumByAB = data.groupBy(x => (x.a, x.b)).mapValues { xs => (xs.map(_.c).map(widenc).sum, xs.map(_.d).map(widend).sum) }.toVector.map { case ((a, b), (c, d)) => (a, b, c, d) }.sortBy(x => (x._1, x._2)) datasetSumByAB ?= sumByAB } check(forAll(prop[Byte, Int, Long, BigDecimal, Long, BigDecimal] _)) } test("groupBy('a, 'b).mapGroups('a, 'b, sum('c))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder : Ordering, C: TypedEncoder : Numeric ](data: List[X3[A, B, C]]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = dataset.col[C]('c) val datasetSumByAB = dataset .groupBy(A, B) .deserialized.mapGroups { case ((a, b), xs) => (a, b, xs.map(_.c).sum) } .collect().run().toVector.sortBy(x => (x._1, x._2)) val sumByAB = data.groupBy(x => (x.a, x.b)) .mapValues { xs => xs.map(_.c).sum } .toVector.map { case ((a, b), c) => (a, b, c) }.sortBy(x => (x._1, x._2)) datasetSumByAB ?= sumByAB } check(forAll(prop[Byte, Int, Long] _)) } test("groupBy('a).mapGroups(('a, toVector(('a, 'b))") { def prop[ A: TypedEncoder: Ordering, B: TypedEncoder: Ordering ](data: Vector[X2[A, B]]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val datasetGrouped = dataset .groupBy(A) .deserialized.mapGroups((a, xs) => (a, xs.toVector.sorted)) .collect().run.toMap val dataGrouped = data.groupBy(_.a).map { case (k, v) => k -> v.sorted } datasetGrouped ?= dataGrouped } check(forAll(prop[Short, Option[Short]] _)) check(forAll(prop[Option[Short], Short] _)) check(forAll(prop[X1[Option[Short]], Short] _)) } test("groupBy('a).flatMapGroups(('a, toVector(('a, 'b))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder : Ordering ](data: Vector[X2[A, B]]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val datasetGrouped = 
dataset .groupBy(A) .deserialized.flatMapGroups((a, xs) => xs.map(x => (a, x))) .collect().run .sorted val dataGrouped = data .groupBy(_.a).toSeq .flatMap { case (a, xs) => xs.map(x => (a, x)) } .sorted datasetGrouped ?= dataGrouped } check(forAll(prop[Short, Option[Short]] _)) check(forAll(prop[Option[Short], Short] _)) check(forAll(prop[X1[Option[Short]], Short] _)) } test("groupBy('a, 'b).flatMapGroups((('a,'b) toVector((('a,'b), 'c))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder : Ordering, C: TypedEncoder : Ordering ](data: Vector[X3[A, B, C]]): Prop = { val dataset = TypedDataset.create(data) val cA = dataset.col[A]('a) val cB = dataset.col[B]('b) val datasetGrouped = dataset .groupBy(cA, cB) .deserialized.flatMapGroups((a, xs) => xs.map(x => (a, x))) .collect().run() .sorted val dataGrouped = data .groupBy(t => (t.a,t.b)).toSeq .flatMap { case (a, xs) => xs.map(x => (a, x)) } .sorted datasetGrouped ?= dataGrouped } check(forAll(prop[Short, Option[Short], Long] _)) check(forAll(prop[Option[Short], Short, Int] _)) check(forAll(prop[X1[Option[Short]], Short, Byte] _)) } } ================================================ FILE: dataset/src/test/scala/frameless/InjectionTests.scala ================================================ package frameless import frameless.CollectTests.prop import org.scalacheck._ import org.scalacheck.Prop._ import shapeless.test.illTyped sealed trait Country case object France extends Country case object Russia extends Country object Country { implicit val arbitrary: Arbitrary[Country] = Arbitrary(Arbitrary.arbitrary[Boolean].map(injection.invert)) implicit val injection: Injection[Country, Boolean] = Injection(France.==, if (_) France else Russia) } sealed trait Food case object Burger extends Food case object Pasta extends Food case object Rice extends Food object Food { implicit val arbitrary: Arbitrary[Food] = Arbitrary(Arbitrary.arbitrary[Int].map(i => injection.invert(Math.abs(i % 3)))) implicit val injection: 
Injection[Food, Int] = Injection(
    // Food => Int
    {
      case Burger => 0
      case Pasta => 1
      case Rice => 2
    },
    // Int => Food; only 0, 1 and 2 are handled
    {
      case 0 => Burger
      case 1 => Pasta
      case 2 => Rice
    }
  )
}

// Supposedly coming from a java lib, shapeless can't derive stuff for this one :(
class LocalDateTime {
  var instant: Long = _

  /** Two instances are equal exactly when their `instant` values are equal. */
  override def equals(o: Any): Boolean = o match {
    case that: LocalDateTime => that.instant == instant
    case _ => false
  }

  /**
   * Consistent with `equals`: computed from `instant` only, so equal instances
   * share a hash code. `instant` is a `var`, so the hash changes when it is reassigned.
   */
  override def hashCode(): Int = instant.hashCode()
}

object LocalDateTime {
  // Generates arbitrary Longs and turns them into LocalDateTime through the injection below.
  implicit val arbitrary: Arbitrary[LocalDateTime] =
    Arbitrary(Arbitrary.arbitrary[Long].map(injection.invert))

  // LocalDateTime <=> Long, using the `instant` field as the representation.
  implicit val injection: Injection[LocalDateTime, Long] =
    Injection(
      _.instant,
      long => { val ldt = new LocalDateTime; ldt.instant = long; ldt }
    )
}

case class Person(age: Int, name: String)

object Person {
  // Builds a Person from an (Int, String) pair.
  val tupled = (Person.apply _).tupled

  implicit val arbitrary: Arbitrary[Person] =
    Arbitrary(Arbitrary.arbTuple2[Int, String].arbitrary.map(tupled))

  // Person <=> (Int, String)
  implicit val injection: Injection[Person, (Int, String)] =
    Injection(p => unapply(p).get, tupled)
}

/** Single-field wrapper whose encoder is obtained from the injection to the wrapped type. */
case class I[A](value: A)

object I {
  implicit def injection[A]: Injection[I[A], A] = Injection(_.value, I(_))
  implicit def typedEncoder[A: TypedEncoder]: TypedEncoder[I[A]] = TypedEncoder.usingInjection[I[A], A]
  implicit def arbitrary[A: Arbitrary]: Arbitrary[I[A]] = Arbitrary(Arbitrary.arbitrary[A].map(I(_)))
}

// ADT made of case objects only; the companion provides an Arbitrary but no Injection.
sealed trait Employee
case object Casual extends Employee
case object PartTime extends Employee
case object FullTime extends Employee

object Employee {
  implicit val arbitrary: Arbitrary[Employee] = Arbitrary(Gen.oneOf(Casual, PartTime, FullTime))
}

// ADT mixing a case object with a case class that carries a value.
sealed trait Maybe
case object Nothing extends Maybe
case class Just(get: Int) extends Maybe

// ADT whose data constructors live inside the companion object.
sealed trait Switch

object Switch {
  case object Off extends Switch
  case object On extends Switch

  implicit val arbitrary: Arbitrary[Switch] = Arbitrary(Gen.oneOf(Off, On))
}

// ADT whose data constructors are parameterless case classes.
sealed trait Pixel
case class Red() extends Pixel
case class Green() extends Pixel
case class Blue() extends Pixel

object Pixel {
  implicit val arbitrary: Arbitrary[Pixel] =
Arbitrary(Gen.oneOf(Red(), Green(), Blue())) } sealed trait Connection[+A] case object Closed extends Connection[Nothing] case object Open extends Connection[Nothing] object Connection { implicit def arbitrary[A]: Arbitrary[Connection[A]] = Arbitrary(Gen.oneOf(Closed, Open)) } sealed abstract class Vehicle(colour: String) case object Car extends Vehicle("red") case object Bike extends Vehicle("blue") object Vehicle { implicit val arbitrary: Arbitrary[Vehicle] = Arbitrary(Gen.oneOf(Car, Bike)) } class InjectionTests extends TypedDatasetSuite { test("Injection based encoders") { check(forAll(prop[Country] _)) check(forAll(prop[LocalDateTime] _)) check(forAll(prop[Food] _)) check(forAll(prop[X1[Country]] _)) check(forAll(prop[X1[LocalDateTime]] _)) check(forAll(prop[X1[Food]] _)) check(forAll(prop[X1[X1[Country]]] _)) check(forAll(prop[X1[X1[LocalDateTime]]] _)) check(forAll(prop[X1[X1[Food]]] _)) check(forAll(prop[X2[Country, X2[LocalDateTime, Food]]] _)) check(forAll(prop[X3[Country, LocalDateTime, Food]] _)) check(forAll(prop[X3U[Country, LocalDateTime, Food]] _)) check(forAll(prop[I[Int]] _)) check(forAll(prop[I[Option[Int]]] _)) check(forAll(prop[I[I[Int]]] _)) check(forAll(prop[I[I[Option[Int]]]] _)) check(forAll(prop[I[X1[Int]]] _)) check(forAll(prop[I[I[X1[Int]]]] _)) check(forAll(prop[I[I[Option[X1[Int]]]]] _)) check(forAll(prop[Option[I[Int]]] _)) check(forAll(prop[Option[I[X1[Int]]]] _)) assert(TypedEncoder[I[Int]].catalystRepr == TypedEncoder[Int].catalystRepr) assert(TypedEncoder[I[I[Int]]].catalystRepr == TypedEncoder[Int].catalystRepr) assert(TypedEncoder[I[Option[Int]]].nullable) } test("TypedEncoder[Person] is ambiguous") { illTyped("implicitly[TypedEncoder[Person]]", "ambiguous implicit values.*") } test("Resolve ambiguity by importing usingInjection") { import TypedEncoder.usingInjection check(forAll(prop[X1[Person]] _)) check(forAll(prop[X1[X1[Person]]] _)) check(forAll(prop[X2[Person, Person]] _)) check(forAll(prop[Person] _)) 
assert(TypedEncoder[Person].catalystRepr == TypedEncoder[(Int, String)].catalystRepr)
  }

  // With `usingDerivation` imported, the resolved Person encoder must be a RecordEncoder.
  test("Resolve ambiguity by importing usingDerivation") {
    import TypedEncoder.usingDerivation

    val personEncoder = implicitly[TypedEncoder[Person]]
    assert(personEncoder.isInstanceOf[RecordEncoder[Person, _, _]])
    check(forAll(prop[Person] _))
  }

  // Without the `injections` import there must be no encoder for Employee.
  test("TypedEncoder[Employee] implicit is missing") {
    illTyped("implicitly[TypedEncoder[Employee]]", "could not find implicit value for parameter e.*")
  }

  test("Resolve missing implicit by deriving Injection instance") {
    import frameless.TypedEncoder.injections._

    check(forAll(prop[X1[Employee]] _))
    check(forAll(prop[X1[X1[Employee]]] _))
    check(forAll(prop[X2[Employee, Employee]] _))
    check(forAll(prop[Employee] _))

    // The derived encoder must use the same catalyst representation as String.
    val employeeRepr = TypedEncoder[Employee].catalystRepr
    val stringRepr = TypedEncoder[String].catalystRepr
    assert(employeeRepr == stringRepr)
  }

  // Even with the `injections` import, no encoder may be found for Maybe.
  test("TypedEncoder[Maybe] cannot be derived") {
    import frameless.TypedEncoder.injections._

    illTyped("implicitly[TypedEncoder[Maybe]]", "could not find implicit value for parameter e.*")
  }

  test("Derive encoder for type with data constructors defined in the companion object") {
    import frameless.TypedEncoder.injections._

    check(forAll(prop[X1[Switch]] _))
    check(forAll(prop[X1[X1[Switch]]] _))
    check(forAll(prop[X2[Switch, Switch]] _))
    check(forAll(prop[Switch] _))

    val switchRepr = TypedEncoder[Switch].catalystRepr
    val stringRepr = TypedEncoder[String].catalystRepr
    assert(switchRepr == stringRepr)
  }

  test("Derive encoder for type with data constructors defined as parameterless case classes") {
    import frameless.TypedEncoder.injections._

    check(forAll(prop[X1[Pixel]] _))
    check(forAll(prop[X1[X1[Pixel]]] _))
    check(forAll(prop[X2[Pixel, Pixel]] _))
    check(forAll(prop[Pixel] _))

    val pixelRepr = TypedEncoder[Pixel].catalystRepr
    val stringRepr = TypedEncoder[String].catalystRepr
    assert(pixelRepr == stringRepr)
  }

  test("Derive encoder for phantom type") {
    import frameless.TypedEncoder.injections._

    check(forAll(prop[X1[Connection[Int]]] _))
    check(forAll(prop[X1[X1[Connection[Int]]]] _))
    check(forAll(prop[X2[Connection[Int], Connection[Int]]] _))
    check(forAll(prop[Connection[Int]] _))
assert(TypedEncoder[Connection[Int]].catalystRepr == TypedEncoder[String].catalystRepr) } test("Derive encoder for ADT with abstract class as the base type") { import frameless.TypedEncoder.injections._ check(forAll(prop[X1[Vehicle]] _)) check(forAll(prop[X1[X1[Vehicle]]] _)) check(forAll(prop[X2[Vehicle, Vehicle]] _)) check(forAll(prop[Vehicle] _)) assert(TypedEncoder[Vehicle].catalystRepr == TypedEncoder[String].catalystRepr) } test("apply method of derived Injection instance produces the correct string") { import frameless.TypedEncoder.injections._ assert(implicitly[Injection[Employee, String]].apply(Casual) === "Casual") assert(implicitly[Injection[Switch, String]].apply(Switch.On) === "On") assert(implicitly[Injection[Pixel, String]].apply(Blue()) === "Blue") assert(implicitly[Injection[Connection[Int], String]].apply(Open) === "Open") assert(implicitly[Injection[Vehicle, String]].apply(Bike) === "Bike") } test("invert method of derived Injection instance produces the correct value") { import frameless.TypedEncoder.injections._ assert(implicitly[Injection[Employee, String]].invert("Casual") === Casual) assert(implicitly[Injection[Switch, String]].invert("On") === Switch.On) assert(implicitly[Injection[Pixel, String]].invert("Blue") === Blue()) assert(implicitly[Injection[Connection[Int], String]].invert("Open") === Open) assert(implicitly[Injection[Vehicle, String]].invert("Bike") === Bike) } test( "invert method of derived Injection instance should throw exception if string does not match data constructor names" ) { import frameless.TypedEncoder.injections._ val caught = intercept[IllegalArgumentException] { implicitly[Injection[Employee, String]].invert("cassual") } assert( caught.getMessage === "Cannot construct a value of type CNil: cassual did not match data constructor names" ) } } ================================================ FILE: dataset/src/test/scala/frameless/IsValueClassTests.scala ================================================ package 
frameless

import shapeless.Refute
import shapeless.test.illTyped

import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

/** Compile-time checks for which types have `IsValueClass` evidence. */
final class IsValueClassTests extends AnyFunSuite with Matchers {
  test("Case class is not Value class") {
    // Each snippet is required NOT to compile.
    illTyped("IsValueClass[P]")
    illTyped("IsValueClass[Q]")
  }

  test("Scala value type is not Value class (excluded)") {
    illTyped("implicitly[IsValueClass[Double]]")
    illTyped("implicitly[IsValueClass[Float]]")
    illTyped("implicitly[IsValueClass[Long]]")
    illTyped("implicitly[IsValueClass[Int]]")
    illTyped("implicitly[IsValueClass[Char]]")
    illTyped("implicitly[IsValueClass[Short]]")
    illTyped("implicitly[IsValueClass[Byte]]")
    illTyped("implicitly[IsValueClass[Unit]]")
    illTyped("implicitly[IsValueClass[Boolean]]")
    illTyped("implicitly[IsValueClass[BigDecimal]]")
  }

  test("Value class evidence") {
    // Evidence must resolve for `Name`, and therefore `Refute` of it must not.
    implicitly[IsValueClass[RecordEncoderTests.Name]]
    illTyped("implicitly[Refute[IsValueClass[RecordEncoderTests.Name]]]")
  }
}

================================================
FILE: dataset/src/test/scala/frameless/JobTests.scala
================================================
package frameless

import org.scalacheck.Arbitrary
import org.scalatest.BeforeAndAfterAll
import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks
import org.scalatest.freespec.AnyFreeSpec
import org.scalatest.matchers.should.Matchers

/** Property checks on `Job`: map/flatMap laws and local-property propagation. */
class JobTests extends AnyFreeSpec with BeforeAndAfterAll with SparkTesting with ScalaCheckDrivenPropertyChecks with Matchers {

  "map" - {
    "identity" in {
      // Mapping `identity` must not change the result of running the job.
      def check[T](implicit arb: Arbitrary[T]) = forAll {
        t: T => Job(t).map(identity).run() shouldEqual Job(t).run()
      }

      check[Int]
    }

    val f1: Int => Int = _ + 1
    val f2: Int => Int = (i: Int) => i * i

    // Mapping twice equals mapping once with the composed function.
    "composition" in forAll {
      i: Int => Job(i).map(f1).map(f2).run() shouldEqual Job(i).map(f1 andThen f2).run()
    }
  }

  "flatMap" - {
    val f1: Int => Job[Int] = (i: Int) => Job(i + 1)
    val f2: Int => Job[Int] = (i: Int) => Job(i * i)

    "left identity" in forAll {
      i: Int =>
Job(i).flatMap(f1).run() shouldEqual f1(i).run()
    }

    "right identity" in forAll {
      i: Int => Job(i).flatMap(i => Job.apply(i)).run() shouldEqual Job(i).run()
    }

    "associativity" in forAll {
      i: Int => Job(i).flatMap(f1).flatMap(f2).run() shouldEqual Job(i).flatMap(ii => f1(ii).flatMap(f2)).run()
    }
  }

  "properties" - {
    "read back" in forAll {
      (k:String, v: String) =>
        // Prefix the generated key, run a job that sets it, then read it back from `sc`.
        val scopedKey = "frameless.tests." + k
        Job(1).withLocalProperty(scopedKey,v).run()
        sc.getLocalProperty(scopedKey) shouldBe v
    }
  }
}

================================================
FILE: dataset/src/test/scala/frameless/JoinTests.scala
================================================
package frameless

import org.apache.spark.sql.types.{StructField, StructType}
import org.scalacheck.Prop
import org.scalacheck.Prop._

/**
 * Each test joins two typed datasets, then compares (a) the sorted collected
 * rows with the same join computed on the input lists and (b) the resulting
 * schema, including the nullability of each side.
 */
class JoinTests extends TypedDatasetSuite {
  test("ab.joinCross(ac)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val joinedDs = leftDs
        .joinCross(rightDs)

      val joinedData = joinedDs.collect().run().toVector.sorted

      // Reference result: the cartesian product of the two input lists.
      val joined = {
        for {
          ab <- left
          ac <- right
        } yield (ab, ac)
      }.toVector

      val equalSchemas = joinedDs.schema ?= StructType(Seq(
        StructField("_1", leftDs.schema, nullable = false),
        StructField("_2", rightDs.schema, nullable = false)))

      (joined.sorted ?= joinedData) && equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }

  test("ab.joinFull(ac)(ab.a == ac.a)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val joinedDs = leftDs
        .joinFull(rightDs)(leftDs.col('a) === rightDs.col('a))

      val joinedData = joinedDs.collect().run().toVector.sorted

      val rightKeys = right.map(_.a).toSet
      val leftKeys = left.map(_.a).toSet
      val joined =
{
        // Reference full outer join: matched pairs, then unmatched rows from each side.
        for {
          ab <- left
          ac <- right if ac.a == ab.a
        } yield (Some(ab), Some(ac))
      }.toVector ++ {
        for {
          ab <- left if !rightKeys.contains(ab.a)
        } yield (Some(ab), None)
      }.toVector ++ {
        for {
          ac <- right if !leftKeys.contains(ac.a)
        } yield (None, Some(ac))
      }.toVector

      // Both sides may be absent in a full join, hence both fields are nullable.
      val equalSchemas = joinedDs.schema ?= StructType(Seq(
        StructField("_1", leftDs.schema, nullable = true),
        StructField("_2", rightDs.schema, nullable = true)))

      (joined.sorted ?= joinedData) && equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }

  test("ab.joinInner(ac)(ab.a == ac.a)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val joinedDs = leftDs
        .joinInner(rightDs)(leftDs.col('a) === rightDs.col('a))

      val joinedData = joinedDs.collect().run().toVector.sorted

      // Reference inner join: only pairs whose keys match.
      val joined = {
        for {
          ab <- left
          ac <- right if ac.a == ab.a
        } yield (ab, ac)
      }.toVector

      val equalSchemas = joinedDs.schema ?= StructType(Seq(
        StructField("_1", leftDs.schema, nullable = false),
        StructField("_2", rightDs.schema, nullable = false)))

      (joined.sorted ?= joinedData) && equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }

  test("ab.joinLeft(ac)(ab.a == ac.a)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val joinedDs = leftDs
        .joinLeft(rightDs)(leftDs.col('a) === rightDs.col('a))

      val joinedData = joinedDs.collect().run().toVector.sorted

      val rightKeys = right.map(_.a).toSet
      // Reference left join: matched pairs, then left rows without a match.
      val joined = {
        for {
          ab <- left
          ac <- right if ac.a == ab.a
        } yield (ab, Some(ac))
      }.toVector ++ {
        for {
          ab <- left if !rightKeys.contains(ab.a)
        } yield (ab, None)
      }.toVector

      val equalSchemas = joinedDs.schema ?= StructType(Seq(
        StructField("_1", leftDs.schema, nullable = false),
        StructField("_2", rightDs.schema, nullable = true)))

      // Additionally require that every left row appears in the result.
      (joined.sorted ?= joinedData) &&
        (joinedData.map(_._1).toSet ?= left.toSet) &&
        equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }

  test("ab.joinLeftAnti(ac)(ab.a == ac.a)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val rightKeys = right.map(_.a).toSet
      val joinedDs = leftDs
        .joinLeftAnti(rightDs)(leftDs.col('a) === rightDs.col('a))

      val joinedData = joinedDs.collect().run().toVector.sorted

      // Reference anti join: left rows whose key is absent on the right.
      val joined = {
        for {
          ab <- left if !rightKeys.contains(ab.a)
        } yield ab
      }.toVector

      // Only left rows are returned, so the schema is the left schema.
      val equalSchemas = joinedDs.schema ?= leftDs.schema

      (joined.sorted ?= joinedData) && equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }

  test("ab.joinLeftSemi(ac)(ab.a == ac.a)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val rightKeys = right.map(_.a).toSet
      val joinedDs = leftDs
        .joinLeftSemi(rightDs)(leftDs.col('a) === rightDs.col('a))

      val joinedData = joinedDs.collect().run().toVector.sorted

      // Reference semi join: left rows whose key is present on the right.
      val joined = {
        for {
          ab <- left if rightKeys.contains(ab.a)
        } yield ab
      }.toVector

      val equalSchemas = joinedDs.schema ?= leftDs.schema

      (joined.sorted ?= joinedData) && equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }

  test("ab.joinRight(ac)(ab.a == ac.a)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val joinedDs = leftDs
        .joinRight(rightDs)(leftDs.col('a) === rightDs.col('a))

      val joinedData = joinedDs.collect().run().toVector.sorted

      val leftKeys = left.map(_.a).toSet
      val
joined = {
        // Reference right join: matched pairs, then right rows without a match.
        for {
          ab <- left
          ac <- right if ac.a == ab.a
        } yield (Some(ab), ac)
      }.toVector ++ {
        for {
          ac <- right if !leftKeys.contains(ac.a)
        } yield (None, ac)
      }.toVector

      val equalSchemas = joinedDs.schema ?= StructType(Seq(
        StructField("_1", leftDs.schema, nullable = true),
        StructField("_2", rightDs.schema, nullable = false)))

      // Additionally require that every right row appears in the result.
      (joined.sorted ?= joinedData) &&
        (joinedData.map(_._2).toSet ?= right.toSet) &&
        equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/LitTests.scala
================================================
package frameless

import frameless.functions.lit

import org.scalatest.matchers.should.Matchers

import org.scalacheck.Prop, Prop._

import RecordEncoderTests.Name

/** Tests selecting literal columns built with `lit` / `litValue`. */
class LitTests extends TypedDatasetSuite with Matchers {
  /**
   * Selects `lit(value)` from a one-row dataset along two paths and expects
   * both to return exactly `Vector(value)`.
   *
   * Value classes are excluded through the `Refute[IsValueClass[A]]` evidence.
   */
  def prop[A: TypedEncoder](value: A)(implicit i0: shapeless.Refute[IsValueClass[A]]): Prop = {
    val df: TypedDataset[Int] = TypedDataset.create(1 :: Nil)

    val l: TypedColumn[Int, A] = lit(value)

    // filter forces whole codegen
    val elems = df.deserialized.filter((_:Int) => true).select(l)
      .collect()
      .run()
      .toVector

    // otherwise it uses local relation
    val localElems = df.select(l)
      .collect()
      .run()
      .toVector

    val expected = Vector(value)

    (localElems ?= expected) && (elems ?= expected)
  }

  test("select(lit(...))") {
    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[String] _)
    check(prop[SQLDate] _)

    check(prop[Option[Int]] _)
    check(prop[Option[String]] _)

    check(prop[Vector[Long]] _)
    check(prop[Vector[X1[Long]]] _)

    check(prop[Vector[String]] _)
    check(prop[Vector[X1[String]]] _)

    check(prop[X1[Int]] _)
    check(prop[X1[X1[Int]]] _)

    check(prop[Food] _)

    // doesn't work, object has to be serializable
    // check(prop[frameless.LocalDateTime] _)
  }

  test("support value class") {
    val initial = Seq(
      Q(name = new Name("Foo"), id = 1),
      Q(name = new Name("Bar"), id = 2))
    val ds = TypedDataset.create(initial)

    ds.collect.run() shouldBe initial

    val lorem = new Name("Lorem")

ds.withColumnReplaced('name, functions.litValue(lorem)).
      collect.run() shouldBe initial.map(_.copy(name = lorem))
  }

  test("support optional value class") {
    val initial = Seq(
      R(name = "Foo", id = 1, alias = None),
      R(name = "Bar", id = 2, alias = Some(new Name("Lorem"))))
    val ds = TypedDataset.create(initial)

    ds.collect.run() shouldBe initial

    val someIpsum: Option[Name] = Some(new Name("Ipsum"))

    // `lit` is only used for its string form, which must appear in the query plan text.
    val lit = functions.litValue(someIpsum)
    val tds = ds.withColumnReplaced('alias, functions.litValue(someIpsum))

    tds.queryExecution.toString() should include (lit.toString)

    tds.
      collect.run() shouldBe initial.map(_.copy(alias = someIpsum))

    // Replacing with an empty Option must yield None in every row.
    ds.withColumnReplaced('alias, functions.litValue(Option.empty[Name])).
      collect.run() shouldBe initial.map(_.copy(alias = None))
  }

  test("#205: comparing literals encoded using Injection") {
    import org.apache.spark.sql.catalyst.util.DateTimeUtils
    // Local injection representing java.sql.Date as Int via Spark's DateTimeUtils.
    implicit val dateAsInt: Injection[java.sql.Date, Int] =
      Injection(DateTimeUtils.fromJavaDate, DateTimeUtils.toJavaDate)

    val today = new java.sql.Date(System.currentTimeMillis)
    val data = Vector(P(42, today))
    val tds = TypedDataset.create(data)

    // Filtering on equality with the literal date must keep the single row.
    tds.filter(tds('d) === today).collect.run().map(_.i) shouldBe Seq(42)
  }
}

// Record fixtures used by the tests above.
final case class P(i: Int, d: java.sql.Date)

final case class Q(id: Int, name: Name)

final case class R(id: Int, name: String, alias: Option[Name])

================================================
FILE: dataset/src/test/scala/frameless/NumericTests.scala
================================================
package frameless

import org.apache.spark.sql.Encoder
import org.scalacheck.{Arbitrary, Gen, Prop}
import org.scalacheck.Prop._
import org.scalatest.matchers.should.Matchers

import scala.reflect.ClassTag

/** Compares typed-column arithmetic with the same operation computed in plain Scala. */
class NumericTests extends TypedDatasetSuite with Matchers {
  test("plus") {
    def prop[A: TypedEncoder: CatalystNumeric: Numeric](a: A, b: A): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      val result = implicitly[Numeric[A]].plus(a, b)
      val got = df.select(df.col('a) +
df.col('b)).collect().run()

      got ?= (result :: Nil)
    }

    check(prop[BigDecimal] _)
    check(prop[Byte] _)
    check(prop[Double] _)
    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[Short] _)
  }

  test("minus") {
    // Column subtraction must agree with `Numeric.minus` on the same operands.
    def prop[A: TypedEncoder: CatalystNumeric: Numeric](a: A, b: A): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      val result = implicitly[Numeric[A]].minus(a, b)
      val got = df.select(df.col('a) - df.col('b)).collect().run()

      got ?= (result :: Nil)
    }

    check(prop[BigDecimal] _)
    check(prop[Byte] _)
    check(prop[Double] _)
    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[Short] _)
  }

  test("multiply") {
    // Column multiplication must agree with `Numeric.times`.
    // BigDecimal is covered separately by "multiply BigDecimal".
    def prop[A: TypedEncoder : CatalystNumeric : Numeric : ClassTag](a: A, b: A): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      val result = implicitly[Numeric[A]].times(a, b)
      val got = df.select(df.col('a) * df.col('b)).collect().run()

      got ?= (result :: Nil)
    }

    check(prop[Byte] _)
    check(prop[Double] _)
    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[Short] _)
  }

  test("divide") {
    // Column division yields Double (see `CatalystDivisible[A, Double]`), so the
    // reference value is computed on the operands converted to Double.
    def prop[A: TypedEncoder: CatalystNumeric: Numeric](a: A, b: A)(implicit cd: CatalystDivisible[A, Double]): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      // A zero divisor is skipped rather than tested.
      if (b == 0) proved else {
        val div: Double = implicitly[Numeric[A]].toDouble(a) / implicitly[Numeric[A]].toDouble(b)
        val got: Seq[Double] = df.select(df.col('a) / df.col('b)).collect().run()

        got ?= (div :: Nil)
      }
    }

    check(prop[Byte  ] _)
    check(prop[Double] _)
    check(prop[Int   ] _)
    check(prop[Long  ] _)
    check(prop[Short ] _)
  }

  test("divide BigDecimals") {
    def prop(a: BigDecimal, b: BigDecimal): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      if (b.doubleValue == 0) proved else {
        // Spark performs something in between Double division and BigDecimal division,
        // we approximate it using Double division and `approximatelyEqual`:
        val div = BigDecimal(a.doubleValue / b.doubleValue)
        val got = df.select(df.col('a) / df.col('b)).collect().run()
        approximatelyEqual(got.head, div)
      }
    }

    check(prop _)
  }

  test("multiply BigDecimal") {
    def prop(a:
BigDecimal, b: BigDecimal): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      // The reference product is computed on Doubles and compared approximately.
      val result = BigDecimal(a.doubleValue * b.doubleValue)
      val got = df.select(df.col('a) * df.col('b)).collect().run()
      approximatelyEqual(got.head, result)
    }

    check(prop _)
  }

  /** Test-only type class supplying the `%` operation that `Numeric` lacks. */
  trait NumericMod[T] {
    def mod(a: T, b: T): T
  }

  /**
   * `NumericMod` instances for the numeric types exercised by the mod tests.
   *
   * Every implicit carries an explicit result type so that resolution does
   * not depend on inferring the type of an anonymous class.
   */
  object NumericMod {
    implicit val byteInstance: NumericMod[Byte] = new NumericMod[Byte] {
      // `%` on Bytes widens to Int, hence the conversion back.
      def mod(a: Byte, b: Byte): Byte = (a % b).toByte
    }
    implicit val doubleInstance: NumericMod[Double] = new NumericMod[Double] {
      def mod(a: Double, b: Double): Double = a % b
    }
    implicit val floatInstance: NumericMod[Float] = new NumericMod[Float] {
      def mod(a: Float, b: Float): Float = a % b
    }
    implicit val intInstance: NumericMod[Int] = new NumericMod[Int] {
      def mod(a: Int, b: Int): Int = a % b
    }
    implicit val longInstance: NumericMod[Long] = new NumericMod[Long] {
      def mod(a: Long, b: Long): Long = a % b
    }
    implicit val shortInstance: NumericMod[Short] = new NumericMod[Short] {
      // `%` on Shorts widens to Int, hence the conversion back.
      def mod(a: Short, b: Short): Short = (a % b).toShort
    }
    implicit val bigDecimalInstance: NumericMod[BigDecimal] = new NumericMod[BigDecimal] {
      def mod(a: BigDecimal, b: BigDecimal): BigDecimal = a % b
    }
  }

  test("mod") {
    import NumericMod._

    // Column `%` must agree with the `NumericMod` instance; zero divisors are skipped.
    def prop[A: TypedEncoder : CatalystNumeric : NumericMod](a: A, b: A): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      if (b == 0) proved else {
        val mod: A = implicitly[NumericMod[A]].mod(a, b)
        val got: Seq[A] = df.select(df.col('a) % df.col('b)).collect().run()

        got ?= (mod :: Nil)
      }
    }

    check(prop[Byte] _)
    check(prop[Double] _)
    check(prop[Int   ] _)
    check(prop[Long  ] _)
    check(prop[Short ] _)
    check(prop[BigDecimal] _)
  }

  test("a mod lit(b)"){
    import NumericMod._

    // Same check, with the divisor passed as a plain value instead of a column.
    def prop[A: TypedEncoder : CatalystNumeric : NumericMod](elem: A, data: X1[A]): Prop = {
      val dataset = TypedDataset.create(Seq(data))
      val a = dataset.col('a)
      if (elem == 0) proved else {
        val mod: A = implicitly[NumericMod[A]].mod(data.a, elem)
        val got: Seq[A] = dataset.select(a % elem).collect().run()

        got ?= (mod :: Nil)
      }
    }

    check(prop[Byte] _)
    check(prop[Double] _)
    check(prop[Int   ] _)
    check(prop[Long  ] _)
    check(prop[Short ] _)
    check(prop[BigDecimal] _)
  }

  test("isNaN") {
    val spark = session
    import
spark.implicits._

    // Double generator that may replace any generated value with NaN.
    // NOTE(review): kept without a type annotation; annotating it as
    // Arbitrary[Double] would presumably make it a candidate for the
    // `implicitly[Arbitrary[Double]]` in its own initialiser - confirm before changing.
    implicit val doubleWithNaN = Arbitrary {
      implicitly[Arbitrary[Double]].arbitrary.flatMap(Gen.oneOf(_, Double.NaN))
    }
    implicit val x1 = Arbitrary{ doubleWithNaN.arbitrary.map(X1(_)) }

    // The typed `isNaN` filter must keep the same rows as the untyped Spark filter.
    def prop[A : TypedEncoder : Encoder : CatalystNaN](data: List[X1[A]]): Prop = {
      val ds = TypedDataset.create(data)

      val expected = ds.toDF().filter(!$"a".isNaN).map(_.getAs[A](0)).collect().toSeq
      val rs = ds.filter(!ds('a).isNaN).collect().run().map(_.a)

      rs ?= expected
    }

    check(forAll(prop[Float] _))
    check(forAll(prop[Double] _))
  }

  test("isNaN with non-nan types should not compile") {
    val ds = TypedDataset.create((1, false, 'a, "b") :: Nil)

    "ds.filter(ds('_1).isNaN)" shouldNot typeCheck
    "ds.filter(ds('_2).isNaN)" shouldNot typeCheck
    "ds.filter(ds('_3).isNaN)" shouldNot typeCheck
    "ds.filter(ds('_4).isNaN)" shouldNot typeCheck
  }
}

================================================
FILE: dataset/src/test/scala/frameless/OrderByTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import shapeless.test.illTyped
import org.apache.spark.sql.Column
import org.scalatest.matchers.should.Matchers

/** Compares typed `orderBy` / `sortWithinPartitions` with the untyped Spark calls. */
class OrderByTests extends TypedDatasetSuite with Matchers {
  /**
   * Pairs of (typed sort, untyped sort) that are expected to order identically:
   * descending, ascending, and the unmodified column.
   */
  def sortings[A : CatalystOrdered, T]: Seq[(TypedColumn[T, A] => SortedTypedColumn[T, A], Column => Column)] = Seq(
    (_.desc, _.desc),
    (_.asc, _.asc),
    (t => t, t => t) //default ascending
  )

  test("single column non nullable orderBy") {
    def prop[A: TypedEncoder : CatalystOrdered](data: Vector[X1[A]]): Prop = {
      val ds = TypedDataset.create(data)

      // Every sorting must hold, hence the conjunction over all pairs.
      sortings[A, X1[A]].map { case (typ, untyp) =>
        ds.dataset.orderBy(untyp(ds.dataset.col("a"))).collect().toVector.?=(
          ds.orderBy(typ(ds('a))).collect().run().toVector)
      }.reduce(_ && _)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Boolean] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Float] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[SQLDate] _))
check(forAll(prop[SQLTimestamp] _))
    check(forAll(prop[String] _))
  }

  test("single column non nullable partition sorting") {
    // Same comparison as the orderBy test, using sortWithinPartitions on both sides.
    def prop[A: TypedEncoder : CatalystOrdered](data: Vector[X1[A]]): Prop = {
      val ds = TypedDataset.create(data)

      sortings[A, X1[A]].map { case (typ, untyp) =>
        ds.dataset.sortWithinPartitions(untyp(ds.dataset.col("a"))).collect().toVector.?=(
          ds.sortWithinPartitions(typ(ds('a))).collect().run().toVector)
      }.reduce(_ && _)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Boolean] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Float] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[SQLDate] _))
    check(forAll(prop[SQLTimestamp] _))
    check(forAll(prop[String] _))
  }

  test("two columns non nullable orderBy") {
    def prop[A: TypedEncoder : CatalystOrdered, B: TypedEncoder : CatalystOrdered](data: Vector[X2[A,B]]): Prop = {
      val ds = TypedDataset.create(data)

      // The sortings for column `a` are reversed before zipping, so the two
      // columns are paired with different sort directions.
      sortings[A, X2[A, B]].reverse.zip(sortings[B, X2[A, B]]).map { case ((typA, untypA), (typB, untypB)) =>
        val vanillaSpark = ds.dataset.orderBy(untypA(ds.dataset.col("a")), untypB(ds.dataset.col("b"))).collect().toVector
        // Both the varargs and the `Many` variant must match untyped Spark.
        vanillaSpark.?=(ds.orderBy(typA(ds('a)), typB(ds('b))).collect().run().toVector).&&(
          vanillaSpark ?= ds.orderByMany(typA(ds('a)), typB(ds('b))).collect().run().toVector
        )
      }.reduce(_ && _)
    }

    check(forAll(prop[SQLDate, Long] _))
    check(forAll(prop[String, Boolean] _))
    check(forAll(prop[SQLTimestamp, Long] _))
  }

  test("two columns non nullable partition sorting") {
    def prop[A: TypedEncoder : CatalystOrdered, B: TypedEncoder : CatalystOrdered](data: Vector[X2[A,B]]): Prop = {
      val ds = TypedDataset.create(data)

      sortings[A, X2[A, B]].reverse.zip(sortings[B, X2[A, B]]).map { case ((typA, untypA), (typB, untypB)) =>
        val vanillaSpark = ds.dataset.sortWithinPartitions(untypA(ds.dataset.col("a")), untypB(ds.dataset.col("b"))).collect().toVector
        vanillaSpark.?=(ds.sortWithinPartitions(typA(ds('a)), typB(ds('b))).collect().run().toVector).&&(
vanillaSpark ?= ds.sortWithinPartitionsMany(typA(ds('a)), typB(ds('b))).collect().run().toVector
        )
      }.reduce(_ && _)
    }

    check(forAll(prop[SQLDate, Long] _))
    check(forAll(prop[String, Boolean] _))
    check(forAll(prop[SQLTimestamp, Long] _))
  }

  test("three columns non nullable orderBy") {
    def prop[A: TypedEncoder : CatalystOrdered, B: TypedEncoder : CatalystOrdered](data: Vector[X3[A,B,A]]): Prop = {
      val ds = TypedDataset.create(data)

      // Three sort specifications are zipped together; the first list is reversed
      // so the columns are paired with different sort directions.
      sortings[A, X3[A, B, A]].reverse
        .zip(sortings[B, X3[A, B, A]])
        .zip(sortings[A, X3[A, B, A]])
        .map { case (((typA, untypA), (typB, untypB)), (typA2, untypA2)) =>
          val vanillaSpark = ds.dataset
            .orderBy(untypA(ds.dataset.col("a")), untypB(ds.dataset.col("b")), untypA2(ds.dataset.col("c")))
            .collect().toVector

          // Both the varargs and the `Many` variant must match untyped Spark.
          vanillaSpark.?=(ds.orderBy(typA(ds('a)), typB(ds('b)), typA2(ds('c))).collect().run().toVector).&&(
            vanillaSpark ?= ds.orderByMany(typA(ds('a)), typB(ds('b)), typA2(ds('c))).collect().run().toVector
          )
        }.reduce(_ && _)
    }

    check(forAll(prop[SQLDate, Long] _))
    check(forAll(prop[String, Boolean] _))
    check(forAll(prop[SQLTimestamp, Long] _))
  }

  test("three columns non nullable partition sorting") {
    def prop[A: TypedEncoder : CatalystOrdered, B: TypedEncoder : CatalystOrdered](data: Vector[X3[A,B,A]]): Prop = {
      val ds = TypedDataset.create(data)

      sortings[A, X3[A, B, A]].reverse
        .zip(sortings[B, X3[A, B, A]])
        .zip(sortings[A, X3[A, B, A]])
        .map { case (((typA, untypA), (typB, untypB)), (typA2, untypA2)) =>
          val vanillaSpark = ds.dataset
            .sortWithinPartitions(untypA(ds.dataset.col("a")), untypB(ds.dataset.col("b")), untypA2(ds.dataset.col("c")))
            .collect().toVector

          vanillaSpark.?=(ds.sortWithinPartitions(typA(ds('a)), typB(ds('b)), typA2(ds('c))).collect().run().toVector).&&(
            vanillaSpark ?= ds.sortWithinPartitionsMany(typA(ds('a)), typB(ds('b)), typA2(ds('c))).collect().run().toVector
          )
        }.reduce(_ && _)
    }

    check(forAll(prop[SQLDate, Long] _))
    check(forAll(prop[String, Boolean] _))
    check(forAll(prop[SQLTimestamp, Long] _))
  }

  test("sort support for mixed default and explicit ordering") {
    // Column `a` is passed without a direction, column `b` with an explicit `.desc`.
    def prop[A: TypedEncoder : CatalystOrdered, B: TypedEncoder : CatalystOrdered](data: Vector[X2[A, B]]): Prop = {
      val ds = TypedDataset.create(data)

      ds.dataset.orderBy(ds.dataset.col("a"), ds.dataset.col("b").desc).collect().toVector.?=(
        ds.orderByMany(ds('a), ds('b).desc).collect().run().toVector) &&
      ds.dataset.sortWithinPartitions(ds.dataset.col("a"), ds.dataset.col("b").desc).collect().toVector.?=(
        ds.sortWithinPartitionsMany(ds('a), ds('b).desc).collect().run().toVector)
    }

    check(forAll(prop[SQLDate, Long] _))
    check(forAll(prop[String, Boolean] _))
    check(forAll(prop[SQLTimestamp, Long] _))
  }

  test("fail when selected column is not sortable") {
    val d = TypedDataset.create(X2(1, Map(1 -> 2)) :: X2(2, Map(2 -> 2)) :: Nil)
    // Sorting by the Int column compiles; sorting by the Map column must not.
    d.orderBy(d('a).desc)
    illTyped("""d.orderBy(d('b).desc)""")
    illTyped("""d.sortWithinPartitions(d('b).desc)""")
  }

  test("derives a CatalystOrdered for case classes when all fields are comparable") {
    // Sort by column `c`, which is itself a record (X2[A, B]).
    type T[A, B] = X3[Int, Boolean, X2[A, B]]

    def prop[
      A: TypedEncoder : CatalystOrdered,
      B: TypedEncoder : CatalystOrdered
    ](data: Vector[T[A, B]]): Prop = {
      val ds = TypedDataset.create(data)

      sortings[X2[A, B], T[A, B]].map { case (typX2, untypX2) =>
        val vanilla = ds.dataset.orderBy(untypX2(ds.dataset.col("c"))).collect().toVector
        val frameless = ds.orderBy(typX2(ds('c))).collect().run.toVector
        vanilla ?= frameless
      }.reduce(_ && _)
    }

    check(forAll(prop[Int, Long] _))
    check(forAll(prop[(String, SQLDate), Float] _))
    // Check that nested case classes are properly derived too
    check(forAll(prop[X2[Boolean, Float], X4[SQLTimestamp, Double, Short, Byte]] _))
  }

  test("derives a CatalystOrdered for tuples when all fields are comparable") {
    // Sort by column `b`, which is a tuple (A, B).
    type T[A, B] = X2[Int, (A, B)]

    def prop[
      A: TypedEncoder : CatalystOrdered,
      B: TypedEncoder : CatalystOrdered
    ](data: Vector[T[A, B]]): Prop = {
      val ds = TypedDataset.create(data)

      sortings[(A, B), T[A, B]].map { case (typX2, untypX2) =>
        val vanilla =
ds.dataset.orderBy(untypX2(ds.dataset.col("b"))).collect().toVector
        val frameless = ds.orderBy(typX2(ds('b))).collect().run.toVector
        vanilla ?= frameless
      }.reduce(_ && _)
    }

    check(forAll(prop[Int, Long] _))
    check(forAll(prop[(String, SQLDate), Float] _))
    check(forAll(prop[X2[Boolean, Float], X1[(SQLTimestamp, Double, Short, Byte)]] _))
  }

  test("fails to compile when one of the field isn't comparable") {
    // NOTE(review): `T` is not referenced anywhere in this test body.
    type T = X2[Int, X2[Int, Map[String, String]]]
    val d = TypedDataset.create(X2(1, X2(2, Map("not" -> "comparable"))) :: Nil)
    // The second argument is a pattern the compile error message must match.
    illTyped("d.orderBy(d('b).desc)", """Cannot compare columns of type frameless.X2\[Int,scala.collection.immutable.Map\[String,String]].""")
  }
}

================================================
FILE: dataset/src/test/scala/frameless/RecordEncoderTests.scala
================================================
package frameless

import org.apache.spark.sql.{Row, functions => F}
import org.apache.spark.sql.types.{
  ArrayType,
  BinaryType,
  DecimalType,
  IntegerType,
  LongType,
  MapType,
  ObjectType,
  StringType,
  StructField,
  StructType
}

import shapeless.{HList, LabelledGeneric}
import shapeless.test.illTyped

import org.scalatest.matchers.should.Matchers

/** Tests for encoders of records: unit fields, nesting, value classes, maps. */
final class RecordEncoderTests extends TypedDatasetSuite with Matchers {
  test("Unable to encode products made from units only") {
    illTyped("TypedEncoder[UnitsOnly]")
  }

  test("Dropping fields") {
    // Helper that summons `DropUnitValues` and applies it to an HList.
    def dropUnitValues[L <: HList](l: L)(implicit d: DropUnitValues[L]): d.Out = d(l)
    val fields = LabelledGeneric[TupleWithUnits].to(TupleWithUnits(42, "something"))
    // After dropping, the fields must equal those of a plain (Int, String) tuple.
    dropUnitValues(fields) shouldEqual LabelledGeneric[(Int, String)].to((42, "something"))
  }

  test("Representation skips units") {
    assert(TypedEncoder[(Int, String)].catalystRepr == TypedEncoder[TupleWithUnits].catalystRepr)
  }

  test("Serialization skips units") {
    // Build the same two rows as an untyped DataFrame and as a typed dataset.
    val df = session.createDataFrame(Seq((1, "one"), (2, "two")))
    val ds = df.as[TupleWithUnits](TypedExpressionEncoder[TupleWithUnits])
    val tds = TypedDataset.create(Seq(TupleWithUnits(1, "one"),
TupleWithUnits(2, "two")))

    df.collect shouldEqual tds.toDF.collect
    ds.collect.toSeq shouldEqual tds.collect.run
  }

  test("Empty nested record value becomes null on serialization") {
    val ds = TypedDataset.create(Seq(OptionalNesting(Option.empty)))
    val df = ds.toDF
    // Dropping rows that contain nulls must leave nothing.
    df.na.drop.count shouldBe 0
  }

  test("Empty nested record value becomes none on deserialization") {
    // Start from a raw row holding null and read it back through the typed encoder.
    val rdd = sc.parallelize(Seq(Row(null)))
    val schema = TypedEncoder[OptionalNesting].catalystRepr.asInstanceOf[StructType]
    val df = session.createDataFrame(rdd, schema)
    val ds = TypedDataset.createUnsafe(df)(TypedEncoder[OptionalNesting])
    ds.firstOption.run.get.o.isEmpty shouldBe true
  }

  test("Deeply nested optional values have correct deserialization") {
    // The outer struct is present while both of its fields are null.
    val rdd = sc.parallelize(Seq(Row(true, Row(null, null))))
    type NestedOptionPair = X2[Boolean, Option[X2[Option[Int], Option[String]]]]
    val schema = TypedEncoder[NestedOptionPair].catalystRepr.asInstanceOf[StructType]
    val df = session.createDataFrame(rdd, schema)
    val ds = TypedDataset.createUnsafe(df)(TypedEncoder[NestedOptionPair])
    ds.firstOption.run.get shouldBe X2(true, Some(X2(None, None)))
  }

  test("Nesting with Seq") {
    import RecordEncoderTests._
    val obj = C(B(Seq(A(1))))
    val rdd = sc.parallelize(Seq(obj))
    val ds = session.createDataset(rdd)(TypedExpressionEncoder[C])
    ds.collect.head shouldBe obj
  }

  test("Nesting with Set") {
    import RecordEncoderTests._
    val obj = E(Set(B(Seq(A(1)))))
    val rdd = sc.parallelize(Seq(obj))
    val ds = session.createDataset(rdd)(TypedExpressionEncoder[E])
    ds.collect.head shouldBe obj
  }

  test("Scalar value class") {
    import RecordEncoderTests._

    val encoder = TypedEncoder[Name]

    encoder.jvmRepr shouldBe ObjectType(classOf[Name])

    // Used on its own, the value class is represented as a one-field struct.
    encoder.catalystRepr shouldBe StructType(
      Seq(StructField("value", StringType, false)))

    val sqlContext = session.sqlContext
    import sqlContext.implicits._

    TypedDataset
      .createUnsafe[Name](Seq("Foo", "Bar").toDF)(encoder)
      .collect().run() shouldBe Seq(new Name("Foo"), new Name("Bar"))

  }

  test("Case class with value class field") {
    import RecordEncoderTests._

    illTyped(
      // As `Person` is not a Value class
      "val _: RecordFieldEncoder[Person] = RecordFieldEncoder.valueClass")

    val fieldEncoder: RecordFieldEncoder[Name] = RecordFieldEncoder.valueClass

    // As a record field, the value class is represented by its underlying String.
    fieldEncoder.encoder.catalystRepr shouldBe StringType
    fieldEncoder.encoder.jvmRepr shouldBe ObjectType(classOf[String])

    // Encode as a Person field
    val encoder = TypedEncoder[Person]

    encoder.jvmRepr shouldBe ObjectType(classOf[Person])

    val expectedPersonStructType = StructType(Seq(
      StructField("name", StringType, false),
      StructField("age", IntegerType, false)))

    encoder.catalystRepr shouldBe expectedPersonStructType

    // Dataset built from raw rows that follow the expected schema.
    val unsafeDs: TypedDataset[Person] = {
      val rdd = sc.parallelize(Seq(
        Row.fromTuple("Foo" -> 2),
        Row.fromTuple("Bar" -> 3)
      ))
      val df = session.createDataFrame(rdd, expectedPersonStructType)

      TypedDataset.createUnsafe(df)(encoder)
    }

    val expected = Seq(
      Person(new Name("Foo"), 2), Person(new Name("Bar"), 3))

    unsafeDs.collect.run() shouldBe expected

    // Safely created DS
    val safeDs = TypedDataset.create(expected)

    safeDs.collect.run() shouldBe expected

    val lorem = new Name("Lorem")

    safeDs.withColumnReplaced('name, functions.litValue(lorem)).
      collect.run() shouldBe expected.map(_.copy(name = lorem))
  }

  test("Case class with value class as optional field") {
    import RecordEncoderTests._

    illTyped( // As `Person` is not a Value class
      """val _: RecordFieldEncoder[Option[Person]] = RecordFieldEncoder.optionValueClass""")

    val fieldEncoder: RecordFieldEncoder[Option[Name]] =
      RecordFieldEncoder.optionValueClass

    fieldEncoder.encoder.catalystRepr shouldBe StringType

    fieldEncoder.encoder.
// !StringType jvmRepr shouldBe ObjectType(classOf[Option[_]]) // Encode as a Person field val encoder = TypedEncoder[User] encoder.jvmRepr shouldBe ObjectType(classOf[User]) val expectedPersonStructType = StructType(Seq( StructField("id", LongType, false), StructField("name", StringType, true))) encoder.catalystRepr shouldBe expectedPersonStructType val ds1: TypedDataset[User] = { val rdd = sc.parallelize(Seq( Row(1L, null), Row(2L, "Foo") )) val df = session.createDataFrame(rdd, expectedPersonStructType) TypedDataset.createUnsafe(df)(encoder) } ds1.collect.run() shouldBe Seq( User(1L, None), User(2L, Some(new Name("Foo")))) val ds2: TypedDataset[User] = { val sqlContext = session.sqlContext import sqlContext.implicits._ val df1 = Seq( """{"id":3,"label":"unused"}""", """{"id":4,"name":"Lorem"}""", """{"id":5,"name":null}""" ).toDF val df2 = df1.withColumn( "jsonValue", F.from_json(df1.col("value"), expectedPersonStructType)). select("jsonValue.id", "jsonValue.name") TypedDataset.createUnsafe[User](df2) } val expected = Seq( User(3L, None), User(4L, Some(new Name("Lorem"))), User(5L, None)) ds2.collect.run() shouldBe expected // Safely created ds TypedDataset.create(expected).collect.run() shouldBe expected } test("Case class with simple Map") { import RecordEncoderTests._ val encoder = TypedEncoder[D] encoder.jvmRepr shouldBe ObjectType(classOf[D]) val expectedStructType = StructType(Seq( StructField("m", MapType( keyType = StringType, valueType = IntegerType, valueContainsNull = false), false))) encoder.catalystRepr shouldBe expectedStructType val sqlContext = session.sqlContext import sqlContext.implicits._ val ds1 = TypedDataset.createUnsafe[D] { val df = Seq( """{"m":{"pizza":1,"sushi":2}}""", """{"m":{"red":3,"blue":4}}""", ).toDF df.withColumn( "jsonValue", F.from_json(df.col("value"), expectedStructType)). 
select("jsonValue.*") } val expected = Seq( D(m = Map("pizza" -> 1, "sushi" -> 2)), D(m = Map("red" -> 3, "blue" -> 4))) ds1.collect.run() shouldBe expected val m2 = Map("updated" -> 5) val ds2 = ds1.withColumnReplaced('m, functions.lit(m2)) ds2.collect.run() shouldBe expected.map(_.copy(m = m2)) } test("Case class with Map & Value class") { import RecordEncoderTests._ val encoder = TypedEncoder[Student] encoder.jvmRepr shouldBe ObjectType(classOf[Student]) val expectedStudentStructType = StructType(Seq( StructField("name", StringType, false), StructField("grades", MapType( keyType = StringType, valueType = DecimalType.SYSTEM_DEFAULT, valueContainsNull = false), false))) encoder.catalystRepr shouldBe expectedStudentStructType val sqlContext = session.sqlContext import sqlContext.implicits._ val ds1 = TypedDataset.createUnsafe[Student] { val df = Seq( """{"name":"Foo","grades":{"math":1,"physics":"23.4"}}""", """{"name":"Bar","grades":{"biology":18.5,"geography":4}}""", ).toDF df.withColumn( "jsonValue", F.from_json(df.col("value"), expectedStudentStructType)). 
select("jsonValue.*") } val expected = Seq( Student(name = "Foo", grades = Map( new Subject("math") -> new Grade(BigDecimal(1)), new Subject("physics") -> new Grade(BigDecimal(23.4D)))), Student(name = "Bar", grades = Map( new Subject("biology") -> new Grade(BigDecimal(18.5)), new Subject("geography") -> new Grade(BigDecimal(4L))))) ds1.collect.run() shouldBe expected val grades = Map[Subject, Grade]( new Subject("any") -> new Grade(BigDecimal(Long.MaxValue) + 1L)) val ds2 = ds1.withColumnReplaced('grades, functions.lit(grades)) ds2.collect.run() shouldBe Seq( Student("Foo", grades), Student("Bar", grades)) } test("Encode binary array") { val encoder = TypedEncoder[Tuple2[String, Array[Byte]]] encoder.jvmRepr shouldBe ObjectType( classOf[Tuple2[String, Array[Byte]]]) val expectedStructType = StructType(Seq( StructField("_1", StringType, false), StructField("_2", BinaryType, false))) encoder.catalystRepr shouldBe expectedStructType val ds1: TypedDataset[(String, Array[Byte])] = { val rdd = sc.parallelize(Seq( Row.fromTuple("Foo" -> Array[Byte](3, 4)), Row.fromTuple("Bar" -> Array[Byte](5)) )) val df = session.createDataFrame(rdd, expectedStructType) TypedDataset.createUnsafe(df)(encoder) } val expected = Seq("Foo" -> Seq[Byte](3, 4), "Bar" -> Seq[Byte](5)) ds1.collect.run().map { case (_1, _2) => _1 -> _2.toSeq } shouldBe expected val subjects = "lorem".getBytes("UTF-8").toSeq val ds2 = ds1.withColumnReplaced('_2, functions.lit(subjects.toArray)) ds2.collect.run().map { case (_1, _2) => _1 -> _2.toSeq } shouldBe expected.map(_.copy(_2 = subjects)) } test("Encode simple array") { val encoder = TypedEncoder[Tuple2[String, Array[Int]]] encoder.jvmRepr shouldBe ObjectType( classOf[Tuple2[String, Array[Int]]]) val expectedStructType = StructType(Seq( StructField("_1", StringType, false), StructField("_2", ArrayType(IntegerType, false), false))) encoder.catalystRepr shouldBe expectedStructType val sqlContext = session.sqlContext import sqlContext.implicits._ val ds1 = 
TypedDataset.createUnsafe[(String, Array[Int])] { val df = Seq( """{"_1":"Foo", "_2":[3, 4]}""", """{"_1":"Bar", "_2":[5]}""", ).toDF df.withColumn( "jsonValue", F.from_json(df.col("value"), expectedStructType)). select("jsonValue.*") } val expected = Seq("Foo" -> Seq(3, 4), "Bar" -> Seq(5)) ds1.collect.run().map { case (_1, _2) => _1 -> _2.toSeq } shouldBe expected val subjects = Seq(6, 6, 7) val ds2 = ds1.withColumnReplaced('_2, functions.lit(subjects.toArray)) ds2.collect.run().map { case (_1, _2) => _1 -> _2.toSeq } shouldBe expected.map(_.copy(_2 = subjects)) } test("Encode array of Value class") { import RecordEncoderTests._ val encoder = TypedEncoder[Tuple2[String, Array[Subject]]] encoder.jvmRepr shouldBe ObjectType( classOf[Tuple2[String, Array[Subject]]]) val expectedStructType = StructType(Seq( StructField("_1", StringType, false), StructField("_2", ArrayType(StringType, false), false))) encoder.catalystRepr shouldBe expectedStructType val sqlContext = session.sqlContext import sqlContext.implicits._ val ds1 = TypedDataset.createUnsafe[(String, Array[Subject])] { val df = Seq( """{"_1":"Foo", "_2":["math","physics"]}""", """{"_1":"Bar", "_2":["biology","geography"]}""", ).toDF df.withColumn( "jsonValue", F.from_json(df.col("value"), expectedStructType)). 
select("jsonValue.*") } val expected = Seq( "Foo" -> Seq(new Subject("math"), new Subject("physics")), "Bar" -> Seq(new Subject("biology"), new Subject("geography"))) ds1.collect.run().map { case (_1, _2) => _1 -> _2.toSeq } shouldBe expected val subjects = Seq(new Subject("lorem"), new Subject("ipsum")) val ds2 = ds1.withColumnReplaced('_2, functions.lit(subjects.toArray)) ds2.collect.run().map { case (_1, _2) => _1 -> _2.toSeq } shouldBe expected.map(_.copy(_2 = subjects)) } test("Encode case class with simple Seq") { import RecordEncoderTests._ val encoder = TypedEncoder[B] encoder.jvmRepr shouldBe ObjectType(classOf[B]) val expectedStructType = StructType(Seq( StructField("a", ArrayType(StructType(Seq( StructField("x", IntegerType, false))), false), false))) encoder.catalystRepr shouldBe expectedStructType val ds1: TypedDataset[B] = { val rdd = sc.parallelize(Seq( Row.fromTuple(Tuple1(Seq( Row.fromTuple(Tuple1[Int](1)), Row.fromTuple(Tuple1[Int](3)) ))), Row.fromTuple(Tuple1(Seq( Row.fromTuple(Tuple1[Int](2)) ))) )) val df = session.createDataFrame(rdd, expectedStructType) TypedDataset.createUnsafe(df)(encoder) } val expected = Seq(B(Seq(A(1), A(3))), B(Seq(A(2)))) ds1.collect.run() shouldBe expected val as = Seq(A(5), A(6)) val ds2 = ds1.withColumnReplaced('a, functions.lit(as)) ds2.collect.run() shouldBe expected.map(_.copy(a = as)) } test("Encode case class with Value class") { import RecordEncoderTests._ val encoder = TypedEncoder[Tuple2[Int, Seq[Name]]] encoder.jvmRepr shouldBe ObjectType(classOf[Tuple2[Int, Seq[Name]]]) val expectedStructType = StructType(Seq( StructField("_1", IntegerType, false), StructField("_2", ArrayType(StringType, false), false))) encoder.catalystRepr shouldBe expectedStructType val ds1 = TypedDataset.createUnsafe[(Int, Seq[Name])] { val sqlContext = session.sqlContext import sqlContext.implicits._ val df = Seq( """{"_1":1, "_2":["foo", "bar"]}""", """{"_1":2, "_2":["lorem"]}""", ).toDF df.withColumn( "jsonValue", 
F.from_json(df.col("value"), expectedStructType)). select("jsonValue.*") } val expected = Seq( 1 -> Seq(new Name("foo"), new Name("bar")), 2 -> Seq(new Name("lorem"))) ds1.collect.run() shouldBe expected } } // --- case class UnitsOnly(a: Unit, b: Unit) case class TupleWithUnits( u0: Unit, _1: Int, u1: Unit, u2: Unit, _2: String, u3: Unit) object TupleWithUnits { def apply(_1: Int, _2: String): TupleWithUnits = TupleWithUnits((), _1, (), (), _2, ()) } case class OptionalNesting(o: Option[TupleWithUnits]) object RecordEncoderTests { case class A(x: Int) case class B(a: Seq[A]) case class C(b: B) class Name(val value: String) extends AnyVal with Serializable { override def toString = s"Name($value)" } case class Person(name: Name, age: Int) case class User(id: Long, name: Option[Name]) case class D(m: Map[String, Int]) case class E(b: Set[B]) final class Subject(val name: String) extends AnyVal with Serializable final class Grade(val value: BigDecimal) extends AnyVal with Serializable case class Student(name: String, grades: Map[Subject, Grade]) } ================================================ FILE: dataset/src/test/scala/frameless/SchemaTests.scala ================================================ package frameless import frameless.functions.aggregate._ import frameless.functions._ import org.apache.spark.sql.types.StructType import org.scalacheck.Prop import org.scalacheck.Prop._ import org.scalatest.matchers.should.Matchers class SchemaTests extends TypedDatasetSuite with Matchers { def structToNonNullable(struct: StructType): StructType = { StructType(struct.fields.map( f => f.copy(nullable = false))) } def prop[A](dataset: TypedDataset[A], ignoreNullable: Boolean = false): Prop = { val schema = dataset.dataset.schema Prop.all( if (!ignoreNullable) dataset.schema ?= schema else structToNonNullable(dataset.schema) ?= structToNonNullable(schema), if (!ignoreNullable) TypedExpressionEncoder.targetStructType(dataset.encoder) ?= schema else 
structToNonNullable(TypedExpressionEncoder.targetStructType(dataset.encoder)) ?= structToNonNullable(schema) ) } test("schema of groupBy('a).agg(sum('b))") { val df0 = TypedDataset.create(X2(1L, 1L) :: Nil) val _a = df0.col('a) val _b = df0.col('b) val df = df0.groupBy(_a).agg(sum(_b)) check(prop(df, true)) } test("schema of select(lit(1L))") { val df0 = TypedDataset.create("test" :: Nil) val df = df0.select(lit(1L)) check(prop(df)) } test("schema of select(lit(1L), lit(2L)).as[X2[Long, Long]]") { val df0 = TypedDataset.create("test" :: Nil) val df = df0.select(lit(1L), lit(2L)).as[X2[Long, Long]] check(prop(df)) } } ================================================ FILE: dataset/src/test/scala/frameless/SelectTests.scala ================================================ package frameless import org.scalacheck.Prop import org.scalacheck.Prop._ import shapeless.test.illTyped import scala.reflect.ClassTag class SelectTests extends TypedDatasetSuite { test("select('a) FROM abcd") { def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])( implicit ea: TypedEncoder[A], ex4: TypedEncoder[X4[A, B, C, D]], ca: ClassTag[A] ): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val dataset2 = dataset.select(A).collect().run().toVector val data2 = data.map { case X4(a, _, _, _) => a } dataset2 ?= data2 } check(forAll(prop[Int, Int, Int, Int] _)) check(forAll(prop[X2[Int, Int], Int, Int, Int] _)) check(forAll(prop[String, Int, Int, Int] _)) check(forAll(prop[UdtEncodedClass, Int, Int, Int] _)) } test("select('a, 'b) FROM abcd") { def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])( implicit ea: TypedEncoder[A], eb: TypedEncoder[B], eab: TypedEncoder[(A, B)], ex4: TypedEncoder[X4[A, B, C, D]], ca: ClassTag[A] ): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val dataset2 = dataset.select(A, B).collect().run().toVector val data2 = data.map { case X4(a, b, _, _) => (a, b) } dataset2 ?= data2 } 
check(forAll(prop[Int, Int, Int, Int] _)) check(forAll(prop[String, Int, Int, Int] _)) check(forAll(prop[String, String, Int, Int] _)) } test("select('a, 'b, 'c) FROM abcd") { def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])( implicit ea: TypedEncoder[A], eb: TypedEncoder[B], ec: TypedEncoder[C], eab: TypedEncoder[(A, B, C)], ex4: TypedEncoder[X4[A, B, C, D]], ca: ClassTag[A] ): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = dataset.col[C]('c) val dataset2 = dataset.select(A, B, C).collect().run().toVector val data2 = data.map { case X4(a, b, c, _) => (a, b, c) } dataset2 ?= data2 } check(forAll(prop[Int, Int, Int, Int] _)) check(forAll(prop[String, Int, Int, Int] _)) check(forAll(prop[String, String, Int, Int] _)) } test("select('a,'b,'c,'d) FROM abcd") { def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])( implicit ea: TypedEncoder[A], eb: TypedEncoder[B], ec: TypedEncoder[C], ed: TypedEncoder[D], ex4: TypedEncoder[X4[A, B, C, D]], ca: ClassTag[A] ): Prop = { val dataset = TypedDataset.create(data) val a1 = dataset.col[A]('a) val a2 = dataset.col[B]('b) val a3 = dataset.col[C]('c) val a4 = dataset.col[D]('d) val dataset2 = dataset.select(a1, a2, a3, a4).collect().run().toVector val data2 = data.map { case X4(a, b, c, d) => (a, b, c, d) } dataset2 ?= data2 } check(forAll(prop[Int, Int, Int, Int] _)) check(forAll(prop[String, Int, Int, Int] _)) check(forAll(prop[String, Boolean, Int, Float] _)) } test("select('a,'b,'c,'d,'a) FROM abcd") { def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])( implicit ea: TypedEncoder[A], eb: TypedEncoder[B], ec: TypedEncoder[C], ed: TypedEncoder[D], ex4: TypedEncoder[X4[A, B, C, D]], ca: ClassTag[A] ): Prop = { val dataset = TypedDataset.create(data) val a1 = dataset.col[A]('a) val a2 = dataset.col[B]('b) val a3 = dataset.col[C]('c) val a4 = dataset.col[D]('d) val dataset2 = dataset.select(a1, a2, a3, a4, a1).collect().run().toVector val data2 = data.map { case X4(a, b, 
c, d) => (a, b, c, d, a) } dataset2 ?= data2 } check(forAll(prop[Int, Int, Int, Int] _)) check(forAll(prop[String, Int, Int, Int] _)) check(forAll(prop[String, Boolean, Int, Float] _)) } test("select('a,'b,'c,'d,'a, 'c) FROM abcd") { def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])( implicit ea: TypedEncoder[A], eb: TypedEncoder[B], ec: TypedEncoder[C], ed: TypedEncoder[D], ex4: TypedEncoder[X4[A, B, C, D]], ca: ClassTag[A] ): Prop = { val dataset = TypedDataset.create(data) val a1 = dataset.col[A]('a) val a2 = dataset.col[B]('b) val a3 = dataset.col[C]('c) val a4 = dataset.col[D]('d) val dataset2 = dataset.select(a1, a2, a3, a4, a1, a3).collect().run().toVector val data2 = data.map { case X4(a, b, c, d) => (a, b, c, d, a, c) } dataset2 ?= data2 } check(forAll(prop[Int, Int, Int, Int] _)) check(forAll(prop[String, Int, Int, Int] _)) check(forAll(prop[String, Boolean, Int, Float] _)) } test("select('a,'b,'c,'d,'a,'c,'b) FROM abcd") { def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])( implicit ea: TypedEncoder[A], eb: TypedEncoder[B], ec: TypedEncoder[C], ed: TypedEncoder[D], ex4: TypedEncoder[X4[A, B, C, D]], ca: ClassTag[A] ): Prop = { val dataset = TypedDataset.create(data) val a1 = dataset.col[A]('a) val a2 = dataset.col[B]('b) val a3 = dataset.col[C]('c) val a4 = dataset.col[D]('d) val dataset2 = dataset.select(a1, a2, a3, a4, a1, a3, a2).collect().run().toVector val data2 = data.map { case X4(a, b, c, d) => (a, b, c, d, a, c, b) } dataset2 ?= data2 } check(forAll(prop[Int, Int, Int, Int] _)) check(forAll(prop[String, Int, Int, Int] _)) check(forAll(prop[String, Boolean, Int, Float] _)) } test("select('a,'b,'c,'d,'a,'c,'b, 'a) FROM abcd") { def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])( implicit ea: TypedEncoder[A], eb: TypedEncoder[B], ec: TypedEncoder[C], ed: TypedEncoder[D], ex4: TypedEncoder[X4[A, B, C, D]], ca: ClassTag[A] ): Prop = { val dataset = TypedDataset.create(data) val a1 = dataset.col[A]('a) val a2 = dataset.col[B]('b) val a3 = 
dataset.col[C]('c) val a4 = dataset.col[D]('d) val dataset2 = dataset.select(a1, a2, a3, a4, a1, a3, a2, a1).collect().run().toVector val data2 = data.map { case X4(a, b, c, d) => (a, b, c, d, a, c, b, a) } dataset2 ?= data2 } check(forAll(prop[Int, Int, Int, Int] _)) check(forAll(prop[String, Int, Int, Int] _)) check(forAll(prop[String, Boolean, Int, Float] _)) } test("select('a,'b,'c,'d,'a,'c,'b,'a,'c) FROM abcd") { def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])( implicit ea: TypedEncoder[A], eb: TypedEncoder[B], ec: TypedEncoder[C], ed: TypedEncoder[D], ex4: TypedEncoder[X4[A, B, C, D]], ca: ClassTag[A] ): Prop = { val dataset = TypedDataset.create(data) val a1 = dataset.col[A]('a) val a2 = dataset.col[B]('b) val a3 = dataset.col[C]('c) val a4 = dataset.col[D]('d) val dataset2 = dataset.select(a1, a2, a3, a4, a1, a3, a2, a1, a3).collect().run().toVector val data2 = data.map { case X4(a, b, c, d) => (a, b, c, d, a, c, b, a, c) } dataset2 ?= data2 } check(forAll(prop[Int, Int, Int, Int] _)) check(forAll(prop[String, Int, Int, Int] _)) check(forAll(prop[String, Boolean, Int, Float] _)) } test("select('a,'b,'c,'d,'a,'c,'b,'a,'c, 'd) FROM abcd") { def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])( implicit ea: TypedEncoder[A], eb: TypedEncoder[B], ec: TypedEncoder[C], ed: TypedEncoder[D], ex4: TypedEncoder[X4[A, B, C, D]], ca: ClassTag[A] ): Prop = { val dataset = TypedDataset.create(data) val a1 = dataset.col[A]('a) val a2 = dataset.col[B]('b) val a3 = dataset.col[C]('c) val a4 = dataset.col[D]('d) val dataset2 = dataset.select(a1, a2, a3, a4, a1, a3, a2, a1, a3, a4).collect().run().toVector val data2 = data.map { case X4(a, b, c, d) => (a, b, c, d, a, c, b, a, c, d) } dataset2 ?= data2 } check(forAll(prop[Int, Int, Int, Int] _)) check(forAll(prop[String, Int, Int, Int] _)) check(forAll(prop[String, Boolean, Int, Float] _)) } test("select('a.b)") { def prop[A, B, C](data: Vector[X2[X2[A, B], C]])( implicit eabc: TypedEncoder[X2[X2[A, B], C]], eb: 
TypedEncoder[B], cb: ClassTag[B] ): Prop = { val dataset = TypedDataset.create(data) val AB = dataset.colMany('a, 'b) val dataset2 = dataset.select(AB).collect().run().toVector val data2 = data.map { case X2(X2(_, b), _) => b } dataset2 ?= data2 } check(forAll(prop[Int, String, Double] _)) } test("select with column expression addition") { def prop[A](data: Vector[X1[A]], const: A)( implicit eabc: TypedEncoder[X1[A]], anum: CatalystNumeric[A], num: Numeric[A], eb: TypedEncoder[A] ): Prop = { val ds = TypedDataset.create(data) val dataset2 = ds.select(ds('a) + const).collect().run().toVector val data2 = data.map { case X1(a) => num.plus(a, const) } dataset2 ?= data2 } check(forAll(prop[Short] _)) check(forAll(prop[Int] _)) check(forAll(prop[Long] _)) check(forAll(prop[Double] _)) } test("select with column expression multiplication") { def prop[A](data: Vector[X1[A]], const: A)( implicit eabc: TypedEncoder[X1[A]], anum: CatalystNumeric[A], num: Numeric[A], eb: TypedEncoder[A] ): Prop = { val ds = TypedDataset.create(data) val dataset2 = ds.select(ds('a) * const).collect().run().toVector val data2 = data.map { case X1(a) => num.times(a, const) } dataset2 ?= data2 } check(forAll(prop[Short] _)) check(forAll(prop[Int] _)) check(forAll(prop[Long] _)) check(forAll(prop[Double] _)) } test("select with column expression subtraction") { def prop[A](data: Vector[X1[A]], const: A)( implicit eabc: TypedEncoder[X1[A]], cnum: CatalystNumeric[A], num: Numeric[A], eb: TypedEncoder[A] ): Prop = { val ds = TypedDataset.create(data) val dataset2 = ds.select(ds('a) - const).collect().run().toVector val data2 = data.map { case X1(a) => num.minus(a, const) } dataset2 ?= data2 } check(forAll(prop[Int] _)) check(forAll(prop[Long] _)) check(forAll(prop[Double] _)) } test("select with column expression division") { def prop[A](data: Vector[X1[A]], const: A)( implicit eabc: TypedEncoder[X1[A]], anum: CatalystNumeric[A], frac: Fractional[A], eb: TypedEncoder[A] ): Prop = { val ds = 
TypedDataset.create(data) if (const != 0) { val dataset2 = ds.select(ds('a) / const).collect().run().toVector.asInstanceOf[Vector[A]] val data2 = data.map { case X1(a) => frac.div(a, const) } dataset2 ?= data2 } else 0 ?= 0 } check(forAll(prop[Double] _)) } test("tests to cover problematic dataframe column names during projections") { case class Foo(i: Int) val e = TypedDataset.create[Foo](Foo(1) :: Nil) val t: TypedDataset[(Int, Int)] = e.select(e.col('i) * 2, e.col('i)) assert(t.select(t.col('_1)).collect().run().toList === List(2)) // Issue #54 val fooT = t.select(t.col('_1)).deserialized.map(x => Tuple1.apply(x)).as[Foo] assert(fooT.select(fooT('i)).collect().run().toList === List(2)) } test("unary - on arithmetic") { val e = TypedDataset.create[(Int, String, Int)]((1, "a", 2) :: (2, "b", 4) :: (2, "b", 1) :: Nil) assert(e.select(-e('_1)).collect().run().toVector === Vector(-1, -2, -2)) assert(e.select(-(e('_1) + e('_3))).collect().run().toVector === Vector(-3, -6, -3)) } test("unary - on strings should not type check") { val e = TypedDataset.create[(Int, String, Long)]((1, "a", 2L) :: (2, "b", 4L) :: (2, "b", 1L) :: Nil) illTyped("""e.select( -e('_2) )""") } test("select with aggregation operations is not supported") { val e = TypedDataset.create[(Int, String, Long)]((1, "a", 2L) :: (2, "b", 4L) :: (2, "b", 1L) :: Nil) illTyped("""e.select(frameless.functions.aggregate.sum(e('_1)))""") } } ================================================ FILE: dataset/src/test/scala/frameless/SelfJoinTests.scala ================================================ package frameless import org.scalacheck.Prop import org.scalacheck.Prop._ import org.apache.spark.sql.{SparkSession, functions => sparkFunctions} class SelfJoinTests extends TypedDatasetSuite { // Without crossJoin.enabled=true Spark doesn't like trivial join conditions: // [error] Join condition is missing or trivial. // [error] Use the CROSS JOIN syntax to allow cartesian products between these relations. 
/**
 * Runs `body` with `spark.sql.crossJoin.enabled` set to "true" and puts the
 * previous session value back afterwards, even when `body` throws.
 *
 * @param body    code to evaluate while cartesian products are allowed
 * @param session session whose runtime configuration is temporarily changed
 * @return the value produced by `body`
 */
def allowTrivialJoin[T](body: => T)(implicit session: SparkSession): T =
  withSessionConf("spark.sql.crossJoin.enabled", "true")(body)

/**
 * Runs `body` with `spark.sql.analyzer.failAmbiguousSelfJoin` set to "false"
 * and puts the previous session value back afterwards, even when `body` throws.
 *
 * @param body    code to evaluate while ambiguous self joins are tolerated
 * @param session session whose runtime configuration is temporarily changed
 * @return the value produced by `body`
 */
def allowAmbiguousJoin[T](body: => T)(implicit session: SparkSession): T =
  withSessionConf("spark.sql.analyzer.failAmbiguousSelfJoin", "false")(body)

/**
 * Temporarily overrides one runtime configuration entry of `session`.
 *
 * The restore step sits in a `finally` so that a failing `body` cannot leak
 * the override into tests that run later on the same session. When the key
 * had no value before, it is unset again instead of being written back.
 */
private def withSessionConf[T](key: String, value: String)(body: => T)(
  implicit session: SparkSession
): T = {
  val previous = session.conf.getOption(key)
  session.conf.set(key, value)
  try body
  finally {
    previous match {
      case Some(old) => session.conf.set(key, old)
      case None      => session.conf.unset(key)
    }
  }
}

test("self join with colLeft/colRight disambiguation") {
  def prop[
    A : TypedEncoder : Ordering,
    B : TypedEncoder : Ordering
  ](dx: List[X2[A, B]], d: X2[A, B]): Prop = allowAmbiguousJoin {
    val data = d :: dx
    val ds = TypedDataset.create(data)

    // This is the way to write unambiguous self-join in vanilla, see https://goo.gl/XnkSUD
    val df1 = ds.dataset.as("df1")
    val df2 = ds.dataset.as("df2")
    val vanilla = df1.join(df2,
      sparkFunctions.col("df1.a") === sparkFunctions.col("df2.a")).count()

    val typed = ds.joinInner(ds)(
      ds.colLeft('a) === ds.colRight('a)
    ).count().run()

    vanilla ?= typed
  }

  check(prop[Int, Int] _)
}

test("trivial self join") {
  def prop[
    A : TypedEncoder : Ordering,
    B : TypedEncoder : Ordering
  ](dx: List[X2[A, B]], d: X2[A, B]): Prop =
    allowTrivialJoin { allowAmbiguousJoin {
      val data = d :: dx
      val ds = TypedDataset.create(data)
      val untyped = ds.dataset
      // Interestingly, even with aliasing it seems that it's impossible to
      // obtain a trivial join condition of shape df1.a == df1.a, Spark will
      // always interpret that as df1.a == df2.a. For the purpose of this
      // test we fall-back to lit(true) instead.
      // val trivial = sparkFunctions.col("df1.a") === sparkFunctions.col("df1.a")
      val trivial = sparkFunctions.lit(true)
      val vanilla = untyped.as("df1").join(untyped.as("df2"), trivial).count()

      val typed = ds.joinInner(ds)(ds.colLeft('a) === ds.colLeft('a)).count().run
      vanilla ?= typed
    } }

  check(prop[Int, Int] _)
}

test("self join with unambiguous expression") {
  def prop[
    A : TypedEncoder : CatalystNumeric : Ordering,
    B : TypedEncoder : Ordering
  ](data: List[X3[A, A, B]]): Prop = allowAmbiguousJoin {
    val ds = TypedDataset.create(data)

    val df1 = ds.dataset.alias("df1")
    val df2 = ds.dataset.alias("df2")

    val vanilla = df1.join(df2,
      (sparkFunctions.col("df1.a") + sparkFunctions.col("df1.b")) ===
      (sparkFunctions.col("df2.a") + sparkFunctions.col("df2.b"))).count()

    val typed = ds.joinInner(ds)(
      (ds.colLeft('a) + ds.colLeft('b)) === (ds.colRight('a) + ds.colRight('b))
    ).count().run()

    vanilla ?= typed
  }

  check(prop[Int, Int] _)
}

test("Do you want ambiguous self join? This is how you get ambiguous self join.") {
  def prop[
    A : TypedEncoder : CatalystNumeric : Ordering,
    B : TypedEncoder : Ordering
  ](data: List[X3[A, A, B]]): Prop =
    allowTrivialJoin { allowAmbiguousJoin {
      val ds = TypedDataset.create(data)

      // The point I'm making here is that it "behaves just like Spark". I
      // don't know (or really care about) how Spark disambiguates that
      // internally...
      val vanilla = ds.dataset.join(ds.dataset,
        (ds.dataset("a") + ds.dataset("b")) ===
        (ds.dataset("a") + ds.dataset("b"))).count()

      val typed = ds.joinInner(ds)(
        (ds.col('a) + ds.col('b)) === (ds.col('a) + ds.col('b))
      ).count().run()

      vanilla ?= typed
    } }

  check(prop[Int, Int] _)
}

test("colLeft and colRight are equivalent to col outside of joins") {
  def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
    implicit
    ea: TypedEncoder[A],
    ex4: TypedEncoder[X4[A, B, C, D]]
  ): Prop = {
    val dataset = TypedDataset.create(data)
    val selectedCol      = dataset.select(dataset.col     [A]('a)).collect().run().toVector
    val selectedColLeft  = dataset.select(dataset.colLeft [A]('a)).collect().run().toVector
    val selectedColRight = dataset.select(dataset.colRight[A]('a)).collect().run().toVector

    (selectedCol ?= selectedColLeft) && (selectedCol ?= selectedColRight)
  }

  check(forAll(prop[Int, Int, Int, Int] _))
  check(forAll(prop[X2[Int, Int], Int, Int, Int] _))
  check(forAll(prop[String, Int, Int, Int] _))
  check(forAll(prop[UdtEncodedClass, Int, Int, Int] _))
}

test("colLeft and colRight are equivalent to col outside of joins - via files (codegen)") {
  def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
    implicit
    ea: TypedEncoder[A],
    ex4: TypedEncoder[X4[A, B, C, D]]
  ): Prop = {
    // Round-trip through parquet so the dataset is backed by a file scan.
    TypedDataset.create(data).write.mode("overwrite").parquet("./target/testData")
    val dataset = TypedDataset.createUnsafe[X4[A, B, C, D]](session.read.parquet("./target/testData"))

    val selectedCol      = dataset.select(dataset.col     [A]('a)).collect().run().toVector
    val selectedColLeft  = dataset.select(dataset.colLeft [A]('a)).collect().run().toVector
    val selectedColRight = dataset.select(dataset.colRight[A]('a)).collect().run().toVector

    (selectedCol ?= selectedColLeft) && (selectedCol ?= selectedColRight)
  }

  check(forAll(prop[Int, Int, Int, Int] _))
  check(forAll(prop[X2[Int, Int], Int, Int, Int] _))
  check(forAll(prop[String, Int, Int, Int] _))
  check(forAll(prop[UdtEncodedClass, Int, Int, Int] _))
}
} // end of class SelfJoinTests
================================================
FILE: dataset/src/test/scala/frameless/TypedDatasetSuite.scala
================================================
package frameless

import com.globalmentor.apache.hadoop.fs.BareLocalFileSystem
import org.apache.hadoop.fs.local.StreamingFS
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.scalactic.anyvals.PosZInt
import org.scalatest.BeforeAndAfterAll
import org.scalatestplus.scalacheck.Checkers
import org.scalacheck.Prop
import org.scalacheck.Prop._

import scala.util.{Properties, Try}
import org.scalatest.funsuite.AnyFunSuite

/**
 * Mix-in that creates a local `SparkSession` in `beforeAll` and stops it in
 * `afterAll`, exposing the session, its `SparkContext` and its `SQLContext`
 * as implicits.
 */
trait SparkTesting { self: BeforeAndAfterAll =>

  // Current date string followed by a random integer in [0, 100000).
  val appID: String = new java.util.Date().toString + math.floor(math.random * 10E4).toLong.toString

  /**
   * On Windows, points the `file` filesystem implementations at
   * `BareLocalFileSystem` / `StreamingFS` so that winutils is not needed for
   * testing / dev. On any other OS the configuration is returned as is.
   */
  def registerFS(sparkConf: SparkConf): SparkConf = {
    if (System.getProperty("os.name").startsWith("Windows"))
      sparkConf.set("spark.hadoop.fs.file.impl", classOf[BareLocalFileSystem].getName).
        set("spark.hadoop.fs.AbstractFileSystem.file.impl", classOf[StreamingFS].getName)
    else
      sparkConf
  }

  // Base configuration: local master using all cores, UI disabled, id = appID.
  val conf: SparkConf = registerFS(new SparkConf())
    .setMaster("local[*]")
    .setAppName("test")
    .set("spark.ui.enabled", "false")
    .set("spark.app.id", appID)

  // Null until beforeAll has run and again after afterAll.
  private var s: SparkSession = _

  implicit def session: SparkSession = s
  implicit def sc: SparkContext = session.sparkContext
  implicit def sqlContext: SQLContext = session.sqlContext

  /** Hook called at the end of `beforeAll`, once the session exists; no-op here. */
  def registerOptimizations(sqlContext: SQLContext): Unit = { }

  /** Hook called in `beforeAll` before the session is built; no-op here. */
  def addSparkConfigProperties(config: SparkConf): Unit = { }

  /** Builds (or reuses, via `getOrCreate`) the session; expects none to be set yet. */
  override def beforeAll(): Unit = {
    assert(s == null)
    addSparkConfigProperties(conf)
    s = SparkSession.builder().config(conf).getOrCreate()
    registerOptimizations(sqlContext)
  }

  /** Stops the session if one was created and clears the reference. */
  override def afterAll(): Unit = {
    if (s != null) {
      s.stop()
      s = null
    }
  }
}

/** Base class of the test suites: FunSuite + ScalaCheck `Checkers` + Spark session. */
class TypedDatasetSuite extends AnyFunSuite with Checkers with BeforeAndAfterAll with SparkTesting {
  // Limit size of generated collections and number of checks to avoid OutOfMemoryError
  implicit override val generatorDrivenConfig: PropertyCheckConfiguration = {
    // Reads FRAMELESS_GEN_<name> from the environment; `default` is used when
    // the variable is absent, not an Int, or rejected by PosZInt.from.
    def getPosZInt(name: String, default: PosZInt) = Properties.envOrNone(s"FRAMELESS_GEN_${name}")
      .flatMap(s => Try(s.toInt).toOption)
      .flatMap(PosZInt.from)
      .getOrElse(default)
    PropertyCheckConfiguration(
      sizeRange = getPosZInt("SIZE_RANGE", PosZInt(20)),
      minSize = getPosZInt("MIN_SIZE", PosZInt(0))
    )
  }

  implicit val sparkDelay: SparkDelay[Job] = Job.framelessSparkDelayForJob

  /**
   * Compares two numeric values after converting both to `Double`.
   *
   * Passes when both values are NaN or infinite, when their absolute
   * difference is below `1E-6`, or when it is below 1% of `|a|`.
   *
   * @param a expected value
   * @param b actual value
   */
  def approximatelyEqual[A](a: A, b: A)(implicit numeric: Numeric[A]): Prop = {
    val da = numeric.toDouble(a)
    val db = numeric.toDouble(b)
    val epsilon = 1E-6
    // Spark has a weird behaviour concerning expressions that should return Inf
    // Most of the time they return NaN instead, for instance stddev of Seq(-7.827553978923477E227, -5.009124275715786E153)
    if((da.isNaN || da.isInfinity) && (db.isNaN || db.isInfinity)) proved
    else if (
      (da - db).abs < epsilon ||
      (da - db).abs < da.abs / 100)
        proved
    else falsified :| s"Expected $a but got $b, which is more
than 1% off and greater than epsilon = $epsilon." } }

================================================
FILE: dataset/src/test/scala/frameless/UdtEncodedClass.scala
================================================
package frameless

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData}
import org.apache.spark.sql.types._
import org.apache.spark.sql.FramelessInternals.UserDefinedType

/**
 * Test fixture encoded through a Spark user defined type.
 *
 * Equality, hashing and `toString` are all based on the contents of `b`
 * rather than on the identity of the array instance.
 *
 * @param a integer component
 * @param b array component, compared element-wise
 */
@SQLUserDefinedType(udt = classOf[UdtEncodedClassUdt])
class UdtEncodedClass(val a: Int, val b: Array[Double]) {
  override def equals(other: Any): Boolean = other match {
    case that: UdtEncodedClass => a == that.a && java.util.Arrays.equals(b, that.b)
    case _ => false
  }

  // Must agree with equals: Array#hashCode is identity based, so two equal
  // instances would otherwise hash differently.
  override def hashCode(): Int =
    31 * a.hashCode() + java.util.Arrays.hashCode(b)

  override def toString = s"UdtEncodedClass($a, ${java.util.Arrays.toString(b)})"
}

object UdtEncodedClass {
  /** UDT instance made available implicitly wherever `UdtEncodedClass` is used. */
  implicit val udtForUdtEncodedClass: UdtEncodedClassUdt = new UdtEncodedClassUdt
}

/** Maps [[UdtEncodedClass]] to a two-field struct `(a: Int, b: Array[Double])`. */
class UdtEncodedClassUdt extends UserDefinedType[UdtEncodedClass] {
  def sqlType: DataType = {
    StructType(Seq(
      StructField("a", IntegerType, nullable = false),
      StructField("b", ArrayType(DoubleType, containsNull = false), nullable = false)
    ))
  }

  /** Writes `a` to slot 0 and `b` to slot 1 of a row matching `sqlType`. */
  def serialize(obj: UdtEncodedClass): InternalRow = {
    // One slot per field declared in sqlType.
    val row = new GenericInternalRow(2)
    row.setInt(0, obj.a)
    row.update(1, UnsafeArrayData.fromPrimitiveArray(obj.b))
    row
  }

  /** Rebuilds the object from a row laid out as in `serialize`; other inputs raise a `MatchError`. */
  def deserialize(datum: Any): UdtEncodedClass = datum match {
    case row: InternalRow =>
      new UdtEncodedClass(row.getInt(0), row.getArray(1).toDoubleArray())
  }

  def userClass: Class[UdtEncodedClass] = classOf[UdtEncodedClass]
}

================================================
FILE: dataset/src/test/scala/frameless/WithColumnTest.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import shapeless.test.illTyped

class WithColumnTest extends TypedDatasetSuite {
  import WithColumnTest._

  test("fail to compile on
missing value") { val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil) illTyped { """val fNew: TypedDataset[XMissing] = f.withColumn[XMissing](f('j) === 10)""" } } test("fail to compile on different column name") { val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil) illTyped { """val fNew: TypedDataset[XDifferentColumnName] = f.withColumn[XDifferentColumnName](f('j) === 10)""" } } test("fail to compile on added column name") { val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil) illTyped { """val fNew: TypedDataset[XAdded] = f.withColumn[XAdded](f('j) === 10)""" } } test("fail to compile on wrong typed column") { val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil) illTyped { """val fNew: TypedDataset[XWrongType] = f.withColumn[XWrongType](f('j) === 10)""" } } test("append four columns") { def prop[A: TypedEncoder](value: A): Prop = { val d = TypedDataset.create(X1(value) :: Nil) val d1 = d.withColumn[X2[A, A]](d('a)) val d2 = d1.withColumn[X3[A, A, A]](d1('b)) val d3 = d2.withColumn[X4[A, A, A, A]](d2('c)) val d4 = d3.withColumn[X5[A, A, A, A, A]](d3('d)) X5(value, value, value, value, value) ?= d4.collect().run().head } check(prop[Int] _) check(prop[Long] _) check(prop[String] _) check(prop[SQLDate] _) check(prop[Option[X1[Boolean]]] _) } test("update in place") { def prop[A : TypedEncoder](startValue: A, replaceValue: A): Prop = { val d = TypedDataset.create(X2(startValue, replaceValue) :: Nil) val X2(a, b) = d.withColumnReplaced('a, d('b)) .collect() .run() .head a ?= b } check(prop[Int] _) check(prop[Long] _) check(prop[String] _) check(prop[SQLDate] _) check(prop[Option[X1[Boolean]]] _) } } object WithColumnTest { case class X(i: Int, j: Int) case class XMissing(i: Int, k: Boolean) case class XDifferentColumnName(i: Int, ji: Int, k: Boolean) case class XAdded(i: Int, j: Int, k: Boolean, l: Int) case class XWrongType(i: Int, j: Int, k: Int) case 
class XGood(i: Int, j: Int, k: Boolean)
}

================================================
FILE: dataset/src/test/scala/frameless/WithColumnTupledTest.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

/** Checks that `withColumnTupled` can be chained, each call adding one tuple element. */
class WithColumnTupledTest extends TypedDatasetSuite {
  test("append five columns") {
    def prop[A: TypedEncoder](value: A): Prop = {
      val single = TypedDataset.create(List(X1(value)))
      val pair = single.withColumnTupled(single('a))
      val triple = pair.withColumnTupled(pair('_1))
      val quadruple = triple.withColumnTupled(triple('_2))
      val quintuple = quadruple.withColumnTupled(quadruple('_3))
      val sextuple = quintuple.withColumnTupled(quintuple('_4))

      val expected = (value, value, value, value, value, value)
      expected ?= sextuple.collect().run().head
    }

    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[String] _)
    check(prop[SQLDate] _)
    check(prop[Option[X1[Boolean]]] _)
  }
}

================================================
FILE: dataset/src/test/scala/frameless/XN.scala
================================================
package frameless

import org.scalacheck.{Arbitrary, Cogen}

/** Single-field generic record used as a test fixture. */
case class X1[A](a: A)

object X1 {
  /** Generates an `X1` by wrapping a generated `A`. */
  implicit def arbitrary[A: Arbitrary]: Arbitrary[X1[A]] =
    Arbitrary(Arbitrary.arbitrary[A].map(value => X1(value)))

  /** Derives the `Cogen` from the one for the wrapped field. */
  implicit def cogen[A](implicit A: Cogen[A]): Cogen[X1[A]] =
    A.contramap(x => x.a)

  /** Orders by the wrapped field. */
  implicit def ordering[A: Ordering]: Ordering[X1[A]] =
    Ordering[A].on(x => x.a)
}

/** Two-field generic record used as a test fixture. */
case class X2[A, B](a: A, b: B)

object X2 {
  /** Generates an `X2` from a generated pair. */
  implicit def arbitrary[A: Arbitrary, B: Arbitrary]: Arbitrary[X2[A, B]] =
    Arbitrary(Arbitrary.arbTuple2[A, B].arbitrary.map { case (a, b) => X2(a, b) })

  /** Derives the `Cogen` from the tuple of both fields. */
  implicit def cogen[A, B](implicit A: Cogen[A], B: Cogen[B]): Cogen[X2[A, B]] =
    Cogen.tuple2(A, B).contramap(x => (x.a, x.b))

  /** Orders like the tuple `(a, b)`. */
  implicit def ordering[A: Ordering, B: Ordering]: Ordering[X2[A, B]] =
    Ordering.Tuple2[A, B].on(x => (x.a, x.b))
}

/** Three-field generic record used as a test fixture. */
case class X3[A, B, C](a: A, b: B, c: C)

object X3 {
  implicit def arbitrary[A: Arbitrary, B: Arbitrary, C: Arbitrary]: Arbitrary[X3[A, B, C]] =
    Arbitrary(Arbitrary.arbTuple3[A, B, C].arbitrary.map((X3.apply[A, B, C]
_).tupled))

  /** Derives the `Cogen` from the tuple of all three fields. */
  implicit def cogen[A, B, C](implicit A: Cogen[A], B: Cogen[B], C: Cogen[C]): Cogen[X3[A, B, C]] =
    Cogen.tuple3(A, B, C).contramap(row => (row.a, row.b, row.c))

  /** Orders like the tuple `(a, b, c)`. */
  implicit def ordering[A: Ordering, B: Ordering, C: Ordering]: Ordering[X3[A, B, C]] =
    Ordering.Tuple3[A, B, C].on(row => (row.a, row.b, row.c))
}

/** Like `X3`, with an extra `Unit` field `u` between `b` and `c`. */
case class X3U[A, B, C](a: A, b: B, u: Unit, c: C)

object X3U {
  /** Generates the three typed fields and fills `u` with `()`. */
  implicit def arbitrary[A: Arbitrary, B: Arbitrary, C: Arbitrary]: Arbitrary[X3U[A, B, C]] =
    Arbitrary(Arbitrary.arbTuple3[A, B, C].arbitrary.map { case (a, b, c) => X3U[A, B, C](a, b, (), c) })

  /** Derives the `Cogen` from `(a, b, c)`; the `Unit` field is not included. */
  implicit def cogen[A, B, C](implicit A: Cogen[A], B: Cogen[B], C: Cogen[C]): Cogen[X3U[A, B, C]] =
    Cogen.tuple3(A, B, C).contramap(row => (row.a, row.b, row.c))

  /** Orders like the tuple `(a, b, c)`; the `Unit` field is not included. */
  implicit def ordering[A: Ordering, B: Ordering, C: Ordering]: Ordering[X3U[A, B, C]] =
    Ordering.Tuple3[A, B, C].on(row => (row.a, row.b, row.c))
}

/** Three-field record whose first two fields are named `key` and `value`. */
case class X3KV[A, B, C](key: A, value: B, c: C)

object X3KV {
  /** Generates an `X3KV` from a generated triple. */
  implicit def arbitrary[A: Arbitrary, B: Arbitrary, C: Arbitrary]: Arbitrary[X3KV[A, B, C]] =
    Arbitrary(Arbitrary.arbTuple3[A, B, C].arbitrary.map { case (k, v, c) => X3KV(k, v, c) })

  /** Derives the `Cogen` from the tuple `(key, value, c)`. */
  implicit def cogen[A, B, C](implicit A: Cogen[A], B: Cogen[B], C: Cogen[C]): Cogen[X3KV[A, B, C]] =
    Cogen.tuple3(A, B, C).contramap(row => (row.key, row.value, row.c))

  /** Orders like the tuple `(key, value, c)`. */
  implicit def ordering[A: Ordering, B: Ordering, C: Ordering]: Ordering[X3KV[A, B, C]] =
    Ordering.Tuple3[A, B, C].on(row => (row.key, row.value, row.c))
}

/** Four-field generic record used as a test fixture. */
case class X4[A, B, C, D](a: A, b: B, c: C, d: D)

object X4 {
  /** Generates an `X4` from a generated 4-tuple. */
  implicit def arbitrary[A: Arbitrary, B: Arbitrary, C: Arbitrary, D: Arbitrary]: Arbitrary[X4[A, B, C, D]] =
    Arbitrary(Arbitrary.arbTuple4[A, B, C, D].arbitrary.map { case (a, b, c, d) => X4(a, b, c, d) })

  /** Derives the `Cogen` from the tuple of all four fields. */
  implicit def cogen[A, B, C, D](implicit A: Cogen[A], B: Cogen[B], C: Cogen[C], D: Cogen[D]): Cogen[X4[A, B, C, D]] =
    Cogen.tuple4(A, B, C, D).contramap(row => (row.a, row.b, row.c, row.d))

  /** Orders like the tuple `(a, b, c, d)`. */
  implicit def ordering[A: Ordering, B: Ordering, C: Ordering, D: Ordering]: Ordering[X4[A, B, C, D]] =
    Ordering.Tuple4[A, B, C, D].on(row => (row.a, row.b, row.c, row.d))
}

case
class X5[A, B, C, D, E](a: A, b: B, c: C, d: D, e: E)

object X5 {
  /** Generates an `X5` from a generated 5-tuple. */
  implicit def arbitrary[A: Arbitrary, B: Arbitrary, C: Arbitrary, D: Arbitrary, E: Arbitrary]: Arbitrary[X5[A, B, C, D, E]] =
    Arbitrary(Arbitrary.arbTuple5[A, B, C, D, E].arbitrary.map { case (a, b, c, d, e) => X5(a, b, c, d, e) })

  /** Derives the `Cogen` from the tuple of all five fields. */
  implicit def cogen[A, B, C, D, E](implicit A: Cogen[A], B: Cogen[B], C: Cogen[C], D: Cogen[D], E: Cogen[E]): Cogen[X5[A, B, C, D, E]] =
    Cogen.tuple5(A, B, C, D, E).contramap(row => (row.a, row.b, row.c, row.d, row.e))

  /** Orders like the tuple `(a, b, c, d, e)`. */
  implicit def ordering[A: Ordering, B: Ordering, C: Ordering, D: Ordering, E: Ordering]: Ordering[X5[A, B, C, D, E]] =
    Ordering.Tuple5[A, B, C, D, E].on(row => (row.a, row.b, row.c, row.d, row.e))
}

/** Six-field generic record used as a test fixture. */
case class X6[A, B, C, D, E, F](a: A, b: B, c: C, d: D, e: E, f: F)

object X6 {
  /** Generates an `X6` from a generated 6-tuple. */
  implicit def arbitrary[A: Arbitrary, B: Arbitrary, C: Arbitrary, D: Arbitrary, E: Arbitrary, F: Arbitrary]: Arbitrary[X6[A, B, C, D, E, F]] =
    Arbitrary(Arbitrary.arbTuple6[A, B, C, D, E, F].arbitrary.map { case (a, b, c, d, e, f) => X6(a, b, c, d, e, f) })

  /** Derives the `Cogen` from the tuple of all six fields. */
  implicit def cogen[A, B, C, D, E, F](implicit A: Cogen[A], B: Cogen[B], C: Cogen[C], D: Cogen[D], E: Cogen[E], F: Cogen[F]): Cogen[X6[A, B, C, D, E, F]] =
    Cogen.tuple6(A, B, C, D, E, F).contramap(row => (row.a, row.b, row.c, row.d, row.e, row.f))

  /** Orders like the tuple `(a, b, c, d, e, f)`. */
  implicit def ordering[A: Ordering, B: Ordering, C: Ordering, D: Ordering, E: Ordering, F: Ordering]: Ordering[X6[A, B, C, D, E, F]] =
    Ordering.Tuple6[A, B, C, D, E, F].on(row => (row.a, row.b, row.c, row.d, row.e, row.f))
}

================================================
FILE: dataset/src/test/scala/frameless/forward/CheckpointTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop.{forAll, _}

/** Compares the typed `checkpoint` against the one on the underlying dataset. */
class CheckpointTests extends TypedDatasetSuite {
  test("checkpoint") {
    def prop[A: TypedEncoder](data: Vector[A], isEager: Boolean): Prop = {
      val dataset = TypedDataset.create(data)

      // A checkpoint directory is set before either checkpoint call below.
      dataset.sparkSession.sparkContext.setCheckpointDir(TEST_OUTPUT_DIR)
dataset.checkpoint(isEager).run().queryExecution.toString() =?
        dataset.dataset.checkpoint(isEager).queryExecution.toString()
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/ColumnsTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop.forAll

/** Checks that every column name reported by the typed dataset exists on the underlying one. */
class ColumnsTests extends TypedDatasetSuite {
  test("columns") {
    def prop(i: Int, s: String, b: Boolean, l: Long, d: Double, by: Byte): Prop = {
      // One single-row input per fixture arity, from X1 up to X6.
      val rows1 = List(X1(i))
      val rows2 = List(X2(i, s))
      val rows3 = List(X3(i, s, b))
      val rows4 = List(X4(i, s, b, l))
      val rows5 = List(X5(i, s, b, l, d))
      val rows6 = List(X6(i, s, b, l, d, by))

      val datasets = Seq(
        TypedDataset.create(rows1),
        TypedDataset.create(rows2),
        TypedDataset.create(rows3),
        TypedDataset.create(rows4),
        TypedDataset.create(rows5),
        TypedDataset.create(rows6)
      )

      Prop.all(datasets.flatMap { typed =>
        val untypedColumns = typed.dataset.columns
        typed.columns.map(name => Prop.propBoolean(untypedColumns.contains(name)))
      }: _*)
    }

    check(forAll(prop _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/CountTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

/** Checks that `count` equals the size of the input collection. */
class CountTests extends TypedDatasetSuite {
  test("count") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val counted = TypedDataset.create(data).count().run()
      counted ?= data.size.toLong
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/DistinctTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import math.Ordering

class DistinctTests extends TypedDatasetSuite {
  test("distinct") {
    // Comparison done with `.sorted` because order is not preserved by Spark for this operation.
def prop[A: TypedEncoder : Ordering](data: Vector[A]): Prop = {
      val viaDataset = TypedDataset.create(data).distinct.collect().run().toVector.sorted
      val viaCollection = data.distinct.sorted

      viaDataset ?= viaCollection
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/ExceptTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

/** Compares `except` on typed datasets with `diff` on the source sets. */
class ExceptTests extends TypedDatasetSuite {
  test("except") {
    def prop[A: TypedEncoder](data1: Set[A], data2: Set[A]): Prop = {
      val left = TypedDataset.create(data1.toSeq)
      val right = TypedDataset.create(data2.toSeq)

      val remaining = left.except(right).collect().run().toVector
      val expected = data1.diff(data2)

      // Same number of rows and same elements: no row is lost or repeated.
      Prop.all(
        remaining.size ?= expected.size,
        remaining.toSet ?= expected
      )
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/FirstTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import org.scalatest.matchers.should.Matchers

/** Compares `firstOption` with `headOption` of the source vector. */
class FirstTests extends TypedDatasetSuite with Matchers {
  test("first") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop =
      TypedDataset.create(data).firstOption().run() =?
data.headOption

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }

  test("first on empty dataset should return None") {
    val emptyDataset = TypedDataset.create(Vector[Int]())
    emptyDataset.firstOption().run() shouldBe None
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/ForeachTests.scala
================================================
package frameless
package forward

import org.apache.spark.util.CollectionAccumulator
import org.scalacheck.Prop
import org.scalacheck.Prop._

import scala.collection.JavaConverters._

/** Checks `foreach` and `foreachPartition` by collecting every visited element in an accumulator. */
class ForeachTests extends TypedDatasetSuite {
  test("foreach") {
    def prop[A: Ordering: TypedEncoder](data: Vector[A]): Prop = {
      val visited = new CollectionAccumulator[A]()
      sc.register(visited)

      TypedDataset.create(data).foreach(visited.add).run()

      // Both sides are sorted before being compared.
      visited.value.asScala.toVector.sorted ?= data.sorted
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }

  test("foreachPartition") {
    def prop[A: Ordering: TypedEncoder](data: Vector[A]): Prop = {
      val visited = new CollectionAccumulator[A]()
      sc.register(visited)

      TypedDataset.create(data).foreachPartition(partition => partition.foreach(visited.add)).run()

      // Both sides are sorted before being compared.
      visited.value.asScala.toVector.sorted ?= data.sorted
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/HeadTests.scala
================================================
package frameless.forward

import frameless.{TypedDataset, TypedDatasetSuite, TypedEncoder, TypedExpressionEncoder, X1}
import org.apache.spark.sql.SparkSession
import org.scalacheck.Prop
import org.scalacheck.Prop._

import scala.reflect.ClassTag
import org.scalatest.matchers.should.Matchers

class HeadTests extends TypedDatasetSuite with Matchers {
  def propArray[A: TypedEncoder : ClassTag : Ordering](data: Vector[X1[A]])(implicit c: SparkSession): Prop = {
    import c.implicits._
    if(data.nonEmpty) {
      val tds = TypedDataset.
create(c.createDataset(data)(
          TypedExpressionEncoder.apply[X1[A]]
        ).orderBy($"a".desc))

      // The dataset is ordered by `a` descending, so its head is the maximum
      // and its first four rows are the four largest values.
      (tds.headOption().run().get ?= data.max) &&
        (tds.head(1).run().head ?= data.max) &&
        (tds.head(4).run().toVector ?= data.sortBy(_.a)(implicitly[Ordering[A]].reverse).take(4))
    } else Prop.passed
  }

  test("headOption(), head(1), and head(4)") {
    check(propArray[Int] _)
    check(propArray[Char] _)
    check(propArray[String] _)
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/InputFilesTests.scala
================================================
package frameless

import java.util.UUID

import org.apache.spark.sql.SparkSession
import org.scalacheck.Prop
import org.scalacheck.Prop._
import org.scalatest.matchers.should.Matchers

/**
 * Writes generated data as text, CSV and JSON, reads it back, and checks that
 * `inputFiles` on the typed dataset matches the one on the underlying dataset.
 */
class InputFilesTests extends TypedDatasetSuite with Matchers {
  test("inputFiles") {

    def propText[A: TypedEncoder](data: Vector[A]): Prop = {
      val path = s"$TEST_OUTPUT_DIR/${UUID.randomUUID()}.txt"

      TypedDataset.create(data).dataset.write.text(path)
      val reread = TypedDataset.create(implicitly[SparkSession].sparkContext.textFile(path))

      reread.inputFiles sameElements reread.dataset.inputFiles
    }

    def propCsv[A: TypedEncoder](data: Vector[A]): Prop = {
      val path = s"$TEST_OUTPUT_DIR/${UUID.randomUUID()}.csv"
      val written = TypedDataset.create(data)
      written.dataset.write.csv(path)

      // Read back with the schema of the dataset that was written.
      val reread = TypedDataset.createUnsafe(
        implicitly[SparkSession].sqlContext.read.schema(written.schema).csv(path))

      reread.inputFiles sameElements reread.dataset.inputFiles
    }

    def propJson[A: TypedEncoder](data: Vector[A]): Prop = {
      val path = s"$TEST_OUTPUT_DIR/${UUID.randomUUID()}.json"
      val written = TypedDataset.create(data)
      written.dataset.write.json(path)

      // Read back with the schema of the dataset that was written.
      val reread = TypedDataset.createUnsafe(
        implicitly[SparkSession].sqlContext.read.schema(written.schema).json(path))

      reread.inputFiles sameElements reread.dataset.inputFiles
    }

    check(forAll(propText[String] _))
check(forAll(propCsv[String] _))
    check(forAll(propJson[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/IntersectTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import math.Ordering

/** Compares `intersect` on typed datasets with the de-duplicated intersection of the source vectors. */
class IntersectTests extends TypedDatasetSuite {
  test("intersect") {
    def prop[A: TypedEncoder : Ordering](data1: Vector[A], data2: Vector[A]): Prop = {
      val dataset1 = TypedDataset.create(data1)
      val dataset2 = TypedDataset.create(data2)
      val datasetIntersect = dataset1.intersect(dataset2).collect().run().toVector

      // Vector `intersect` is the multiset intersection, while Spark throws away duplicates.
      val dataIntersect = data1.intersect(data2).distinct

      // Comparison done with `.sorted` because order is not preserved by Spark for this operation.
      // `dataIntersect` is already de-duplicated above, so it is only sorted here.
      datasetIntersect.sorted ?= dataIntersect.sorted
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/IsLocalTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

/** Checks that `isLocal` agrees with the underlying dataset. */
class IsLocalTests extends TypedDatasetSuite {
  test("isLocal") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val dataset = TypedDataset.create(data)

      dataset.isLocal ?= dataset.dataset.isLocal
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/IsStreamingTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

/** Checks that `isStreaming` agrees with the underlying dataset. */
class IsStreamingTests extends TypedDatasetSuite {
  test("isStreaming") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val dataset = TypedDataset.create(data)

      dataset.isStreaming ?= dataset.dataset.isStreaming
    }

    check(forAll(prop[Int] _))
check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/LimitTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

/** Checks the size and the contents of the result of `limit`. */
class LimitTests extends TypedDatasetSuite {
  test("limit") {
    // Only non-negative limits are exercised.
    def prop[A: TypedEncoder](data: Vector[A], n: Int): Prop = (n >= 0) ==> {
      val limited = TypedDataset.create(data).limit(n).collect().run()

      Prop.all(
        // At most `n` rows, and never more than the input holds.
        limited.length ?= math.min(data.length, n),
        // Every returned row comes from the input.
        limited.forall(row => data.contains(row))
      )
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/QueryExecutionTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop.{forAll, _}

/** Checks that `queryExecution` equals the one of the underlying dataset. */
class QueryExecutionTests extends TypedDatasetSuite {
  test("queryExecution") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val dataset = TypedDataset.create[A](data)

      dataset.queryExecution =?
dataset.dataset.queryExecution
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/RandomSplitTests.scala
================================================
package frameless

import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Prop._
import org.scalacheck.{Arbitrary, Gen}
import scala.collection.JavaConverters._
import org.scalatest.matchers.should.Matchers

/**
 * Checks that splitting a typed dataset yields parts with the same row counts
 * as splitting the underlying dataset with the same weights and seed.
 */
class RandomSplitTests extends TypedDatasetSuite with Matchers {

  /** Non-empty arrays of positive doubles, used as split weights. */
  val nonEmptyPositiveArray: Gen[Array[Double]] = Gen.nonEmptyListOf(Gen.posNum[Double]).map(_.toArray)

  test("randomSplit(weight, seed)") {
    def prop[A: TypedEncoder : Arbitrary] = forAll(vectorGen[A], nonEmptyPositiveArray, arbitrary[Long]) {
      (data: Vector[A], weights: Array[Double], seed: Long) =>
        val typed = TypedDataset.create(data)

        val typedCounts = typed.randomSplit(weights, seed).map(part => part.count().run())
        val untypedCounts = typed.dataset.randomSplit(weights, seed).map(part => part.count())

        typedCounts sameElements untypedCounts
    }

    check(prop[Int])
    check(prop[String])
  }

  test("randomSplitAsList(weight, seed)") {
    def prop[A: TypedEncoder : Arbitrary] = forAll(vectorGen[A], nonEmptyPositiveArray, arbitrary[Long]) {
      (data: Vector[A], weights: Array[Double], seed: Long) =>
        val typed = TypedDataset.create(data)

        val typedCounts = typed.randomSplitAsList(weights, seed).asScala.map(part => part.count().run())
        val untypedCounts = typed.dataset.randomSplitAsList(weights, seed).asScala.map(part => part.count())

        typedCounts sameElements untypedCounts
    }

    check(prop[Int])
    check(prop[String])
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/SQLContextTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop.{forAll, _}

/** Checks that `sqlContext` equals the one of the underlying dataset. */
class SQLContextTests extends TypedDatasetSuite {
  test("sqlContext") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val dataset = TypedDataset.create[A](data)

      dataset.sqlContext =?
dataset.dataset.sqlContext
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/SparkSessionTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

/** Checks that `sparkSession` equals the one of the underlying dataset. */
class SparkSessionTests extends TypedDatasetSuite {
  test("sparkSession") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val typed = TypedDataset.create[A](data)

      typed.sparkSession =? typed.dataset.sparkSession
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/StorageLevelTests.scala
================================================
package frameless

import org.apache.spark.storage.StorageLevel
import org.apache.spark.storage.StorageLevel._
import org.scalacheck.Prop._
import org.scalacheck.{Arbitrary, Gen}

/** Checks that `storageLevel()` agrees with the underlying dataset after persisting and counting. */
class StorageLevelTests extends TypedDatasetSuite {

  /** Picks one of the listed storage levels. */
  val storageLevelGen: Gen[StorageLevel] = Gen.oneOf(Seq(
    NONE,
    DISK_ONLY,
    DISK_ONLY_2,
    MEMORY_ONLY,
    MEMORY_ONLY_2,
    MEMORY_ONLY_SER,
    MEMORY_ONLY_SER_2,
    MEMORY_AND_DISK,
    MEMORY_AND_DISK_2,
    MEMORY_AND_DISK_SER,
    MEMORY_AND_DISK_SER_2,
    OFF_HEAP
  ))

  test("storageLevel") {
    def prop[A: TypedEncoder : Arbitrary] = forAll(vectorGen[A], storageLevelGen) {
      (data: Vector[A], storageLevel: StorageLevel) =>
        val typed = TypedDataset.create(data)

        // `persist` is skipped for NONE.
        if (storageLevel != StorageLevel.NONE) {
          typed.persist(storageLevel)
        }

        // Run one action before reading the storage level.
        typed.count().run()

        typed.storageLevel() ?= typed.dataset.storageLevel
    }

    check(prop[Int])
    check(prop[String])
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/TakeTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import scala.reflect.ClassTag

class TakeTests extends TypedDatasetSuite {
  test("take") {
    def prop[A: TypedEncoder](n: Int,
data: Vector[A]): Prop =
      (n >= 0) ==> (TypedDataset.create(data).take(n).run().toVector =? data.take(n))

    // Arrays are compared element-wise with `sameElements`.
    def propArray[A: TypedEncoder: ClassTag](n: Int, data: Vector[X1[Array[A]]]): Prop =
      (n >= 0) ==> {
        Prop {
          val taken = TypedDataset.create(data).take(n).run().toVector
          taken.zip(data.take(n)).forall {
            case (X1(actual), X1(expected)) => actual sameElements expected
          }
        }
      }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
    check(forAll(propArray[Int] _))
    check(forAll(propArray[String] _))
    check(forAll(propArray[Byte] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/ToJSONTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

/** Compares `toJSON` on the typed dataset with `toJSON` on the underlying one. */
class ToJSONTests extends TypedDatasetSuite {
  test("toJSON") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val typed = TypedDataset.create(data)

      typed.toJSON.collect().run() ?= typed.dataset.toJSON.collect()
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/ToLocalIteratorTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import scala.collection.JavaConverters._
import org.scalatest.matchers.should.Matchers

/** Compares `toLocalIterator` on the typed dataset with the one on the underlying dataset. */
class ToLocalIteratorTests extends TypedDatasetSuite with Matchers {
  test("toLocalIterator") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val typed = TypedDataset.create(data)

      val typedRows = typed.toLocalIterator().run().asScala.toIterator
      val untypedRows = typed.dataset.toLocalIterator().asScala.toIterator

      typedRows sameElements untypedRows
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/UnionTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import shapeless.test.illTyped

class
UnionTests extends TypedDatasetSuite {

  test("fail to compile on not aligned schema") {
    val dataset1 = TypedDataset.create(List(Foo(1, 1)))
    val dataset2 = TypedDataset.create(List(Wrong(1, 1, 1)))

    // Wrong's fields are `a`, `b`, `c`; Foo's are `x`, `y`.
    illTyped {
      """val fNew = dataset1 union dataset2 """
    }
  }

  test("Union for simple data types") {
    def prop[A: TypedEncoder](data1: Vector[A], data2: Vector[A]): Prop = {
      val left = TypedDataset.create(data1)
      val right = TypedDataset.create(data2)

      val unioned = left.union(right).collect().run().toVector
      val expected = data1 ++ data2

      unioned ?= expected
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }

  test("Align fields for case classes") {
    def prop[A: TypedEncoder, B: TypedEncoder](data1: Vector[(A, B)], data2: Vector[(A, B)]): Prop = {
      // Bar declares the same fields as Foo in the opposite order.
      val foos = TypedDataset.create(data1.map { case (a, b) => Foo[A, B](a, b) })
      val bars = TypedDataset.create(data2.map { case (a, b) => Bar[A, B](b, a) })

      val unioned = foos.union(bars).collect().run().map(row => (row.x, row.y)).toVector
      val expected = data1 ++ data2

      unioned ?= expected
    }

    check(forAll(prop[Int, String] _))
    check(forAll(prop[String, X1[Option[Long]]] _))
  }

  test("Align fields for different number of columns") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder](data1: Vector[(A, B, C)], data2: Vector[(A, B)]): Prop = {
      // Baz has an additional field `z` that Foo does not declare.
      val foos = TypedDataset.create(data2.map { case (a, b) => Foo[A, B](a, b) })
      val bazs = TypedDataset.create(data1.map { case (a, b, c) => Baz[A, B, C](c, b, a) })

      val unioned: Seq[(A, B)] = foos.union(bazs).collect().run().map(row => (row.x, row.y)).toVector
      val expected = data2 ++ data1.map { case (a, b, _) => (a, b) }

      unioned ?= expected
    }

    check(forAll(prop[Option[Int], String, Array[Long]] _))
    check(forAll(prop[String, X1[Option[Int]], X2[String, Array[Int]]] _))
  }
}

// Fixtures for the union tests: the same field names in different orders and arities.
final case class Foo[A, B](x: A, y: B)
final case class Bar[A, B](y: B, x: A)
final case class Baz[A, B, C](z: C, y: B, x: A)
final case class
Wrong[A, B, C](a: A, b: B, c: C)

================================================
FILE: dataset/src/test/scala/frameless/forward/WriteStreamTests.scala
================================================
package frameless

import java.util.UUID

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.scalacheck.Prop._
import org.scalacheck.{Arbitrary, Gen, Prop}

/**
 * Round-trips generated data through `writeStream`: the data is streamed to
 * files, streamed back into an in-memory table, and compared with the input.
 */
class WriteStreamTests extends TypedDatasetSuite {

  val genNested = for {
    d <- Arbitrary.arbitrary[Double]
    as <- Arbitrary.arbitrary[String]
  } yield Nested(d, as)

  val genOptionFieldsOnly = for {
    o1 <- Gen.option(Arbitrary.arbitrary[Int])
    o2 <- Gen.option(genNested)
  } yield OptionFieldsOnly(o1, o2)

  val genWriteExample = for {
    i <- Arbitrary.arbitrary[Int]
    s <- Arbitrary.arbitrary[String]
    on <- Gen.option(genNested)
    ooo <- Gen.option(genOptionFieldsOnly)
  } yield WriteExample(i, s, on, ooo)

  test("write csv") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder: Encoder](data: List[A]): Prop = {
      val uid = UUID.randomUUID()
      // The hyphen-free form is used in the query/table name below.
      val uidNoHyphens = uid.toString.replace("-", "")
      // Output directory for this run; the name is exactly the UUID.
      val filePath = s"$TEST_OUTPUT_DIR/$uid"
      val checkpointPath = s"$TEST_OUTPUT_DIR/checkpoint/$uid"

      // Stage 1: stream the generated data out as CSV files.
      val inputStream = MemoryStream[A]
      val input = TypedDataset.create(inputStream.toDS())
      val inputter = input.writeStream.format("csv").option("checkpointLocation", s"$checkpointPath/input").start(filePath)
      inputStream.addData(data)
      inputter.processAllAvailable()

      // Stage 2: stream the files back into an in-memory table.
      val dataset = TypedDataset.createUnsafe(sqlContext.readStream.schema(input.schema).csv(filePath))

      val tester = dataset
        .writeStream
        .option("checkpointLocation", s"$checkpointPath/tester")
        .format("memory")
        .queryName(s"testCsv_$uidNoHyphens")
        .start()
      tester.processAllAvailable()

      // Compare as multisets: group equal values on both sides.
      val output = spark.table(s"testCsv_$uidNoHyphens").as[A]
      TypedDataset.create(data).collect().run().groupBy(identity) ?= output.collect().groupBy(identity).map { case (k, arr) => (k, arr.toSeq) }
    }
check(forAll(Gen.nonEmptyListOf(Gen.alphaNumStr.suchThat(_.nonEmpty)))(prop[String]))
    check(forAll(Gen.nonEmptyListOf(Arbitrary.arbitrary[Int]))(prop[Int]))
  }

  test("write parquet") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder: Encoder](data: List[A]): Prop = {
      val uid = UUID.randomUUID()
      // The hyphen-free form is used in the query/table name below.
      val uidNoHyphens = uid.toString.replace("-", "")
      // Output directory for this run; the name is exactly the UUID.
      val filePath = s"$TEST_OUTPUT_DIR/$uid"
      val checkpointPath = s"$TEST_OUTPUT_DIR/checkpoint/$uid"

      // Stage 1: stream the generated data out as parquet files.
      val inputStream = MemoryStream[A]
      val input = TypedDataset.create(inputStream.toDS())
      val inputter = input.writeStream.format("parquet").option("checkpointLocation", s"$checkpointPath/input").start(filePath)
      inputStream.addData(data)
      inputter.processAllAvailable()

      // Stage 2: stream the files back into an in-memory table.
      val dataset = TypedDataset.createUnsafe(sqlContext.readStream.schema(input.schema).parquet(filePath))

      val tester = dataset
        .writeStream
        .option("checkpointLocation", s"$checkpointPath/tester")
        .format("memory")
        .queryName(s"testParquet_$uidNoHyphens")
        .start()
      tester.processAllAvailable()

      // Compare as multisets: group equal values on both sides.
      val output = spark.table(s"testParquet_$uidNoHyphens").as[A]
      TypedDataset.create(data).collect().run().groupBy(identity) ?= output.collect().groupBy(identity).map { case (k, arr) => (k, arr.toSeq) }
    }

    check(forAll(Gen.nonEmptyListOf(genWriteExample))(prop[WriteExample]))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/WriteTests.scala
================================================
package frameless

import java.util.UUID

import org.scalacheck.Prop._
import org.scalacheck.{Arbitrary, Gen, Prop}

class WriteTests extends TypedDatasetSuite {

  val genNested = for {
    d <- Arbitrary.arbitrary[Double]
    as <- Arbitrary.arbitrary[String]
  } yield Nested(d, as)

  val genOptionFieldsOnly = for {
    o1 <- Gen.option(Arbitrary.arbitrary[Int])
    o2 <- Gen.option(genNested)
  } yield OptionFieldsOnly(o1, o2)

  val genWriteExample = for {
    i <- Arbitrary.arbitrary[Int]
    s <- Arbitrary.arbitrary[String]
    on <- Gen.option(genNested)
    ooo <-
Gen.option(genOptionFieldsOnly)
  } yield WriteExample(i, s, on, ooo)

  test("write csv") {
    // Writes the data as CSV, reads it back with the same schema, and
    // compares both sides grouped by value.
    def prop[A: TypedEncoder](data: List[A]): Prop = {
      val path = s"$TEST_OUTPUT_DIR/${UUID.randomUUID()}"
      val written = TypedDataset.create(data)
      written.write.csv(path)

      val reread = TypedDataset.createUnsafe(sqlContext.read.schema(written.schema).csv(path))

      reread.collect().run().groupBy(identity) ?= written.collect().run().groupBy(identity)
    }

    check(forAll(Gen.listOf(Gen.alphaNumStr.suchThat(_.nonEmpty)))(prop[String]))
    check(forAll(prop[Int] _))
  }

  test("write parquet") {
    // Writes the data as parquet, reads it back with the same schema, and
    // compares both sides grouped by value.
    def prop[A: TypedEncoder](data: List[A]): Prop = {
      val path = s"$TEST_OUTPUT_DIR/${UUID.randomUUID()}"
      val written = TypedDataset.create(data)
      written.write.parquet(path)

      val reread = TypedDataset.createUnsafe(sqlContext.read.schema(written.schema).parquet(path))

      reread.collect().run().groupBy(identity) ?= written.collect().run().groupBy(identity)
    }

    check(forAll(Gen.listOf(genWriteExample))(prop[WriteExample]))
  }
}

// Row fixtures for the write tests.
case class Nested(i: Double, v: String)
case class OptionFieldsOnly(o1: Option[Int], o2: Option[Nested])
case class WriteExample(i: Int, s: String, on: Option[Nested], ooo: Option[OptionFieldsOnly])

================================================
FILE: dataset/src/test/scala/frameless/functions/AggregateFunctionsTests.scala
================================================
package frameless
package functions

import frameless.{TypedAggregate, TypedColumn}
import frameless.functions.aggregate._
import org.apache.spark.sql.{Column, Encoder}
import org.scalacheck.{Gen, Prop}
import org.scalacheck.Prop._
import org.scalatest.exceptions.GeneratorDrivenPropertyCheckFailedException

class AggregateFunctionsTests extends TypedDatasetSuite {
  /**
   * Applies the aggregate `f` to column `a` of an empty `X1[A]` dataset and
   * compares the struct type derived from the result encoder with the schema
   * of the resulting dataset.
   */
  def sparkSchema[A: TypedEncoder, U](f: TypedColumn[X1[A], A] => TypedAggregate[X1[A], U]): Prop = {
    val df = TypedDataset.create[X1[A]](Nil)
    val col = f(df.col('a))

    val sumDf = df.agg(col)

    TypedExpressionEncoder.targetStructType(sumDf.encoder) ?=
sumDf.dataset.schema } test("sum") { case class Sum4Tests[A, B](sum: Seq[A] => B) def prop[A: TypedEncoder, Out: TypedEncoder : Numeric](xs: List[A])( implicit summable: CatalystSummable[A, Out], summer: Sum4Tests[A, Out] ): Prop = { val dataset = TypedDataset.create(xs.map(X1(_))) val A = dataset.col[A]('a) val datasetSum: List[Out] = dataset.agg(sum(A)).collect().run().toList datasetSum match { case x :: Nil => approximatelyEqual(summer.sum(xs), x) case other => falsified } } // Replicate Spark's behaviour : Ints and Shorts are cast to Long // https://github.com/apache/spark/blob/7eb2ca8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L37 implicit def summerDecimal = Sum4Tests[BigDecimal, BigDecimal](_.sum) implicit def summerDouble = Sum4Tests[Double, Double](_.sum) implicit def summerLong = Sum4Tests[Long, Long](_.sum) implicit def summerInt = Sum4Tests[Int, Long](_.map(_.toLong).sum) implicit def summerShort = Sum4Tests[Short, Long](_.map(_.toLong).sum) check(forAll(prop[BigDecimal, BigDecimal] _)) check(forAll(prop[Long, Long] _)) check(forAll(prop[Double, Double] _)) check(forAll(prop[Int, Long] _)) check(forAll(prop[Short, Long] _)) check(sparkSchema[BigDecimal, BigDecimal](sum)) check(sparkSchema[Long, Long](sum)) check(sparkSchema[Int, Long](sum)) check(sparkSchema[Double, Double](sum)) check(sparkSchema[Short, Long](sum)) } test("sumDistinct") { case class Sum4Tests[A, B](sum: Seq[A] => B) def prop[A: TypedEncoder, Out: TypedEncoder : Numeric](xs: List[A])( implicit summable: CatalystSummable[A, Out], summer: Sum4Tests[A, Out] ): Prop = { val dataset = TypedDataset.create(xs.map(X1(_))) val A = dataset.col[A]('a) val datasetSum: List[Out] = dataset.agg(sumDistinct(A)).collect().run().toList datasetSum match { case x :: Nil => approximatelyEqual(summer.sum(xs), x) case other => falsified } } // Replicate Spark's behaviour : Ints and Shorts are cast to Long // 
https://github.com/apache/spark/blob/7eb2ca8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L37
    // Reference sums de-duplicate the input first (toSet) to mirror sumDistinct.
    implicit def summerLong = Sum4Tests[Long, Long](_.toSet.sum)
    implicit def summerInt = Sum4Tests[Int, Long]( x => x.toSet.map((_:Int).toLong).sum)
    implicit def summerShort = Sum4Tests[Short, Long](x => x.toSet.map((_:Short).toLong).sum)

    check(forAll(prop[Long, Long] _))
    check(forAll(prop[Int, Long] _))
    check(forAll(prop[Short, Long] _))

    // The schema checks must exercise sumDistinct itself; they previously re-checked `sum`,
    // leaving the schema of sumDistinct unverified.
    check(sparkSchema[Long, Long](sumDistinct))
    check(sparkSchema[Int, Long](sumDistinct))
    check(sparkSchema[Short, Long](sumDistinct))
  }

  // Compares the typed `avg` aggregate with a Scala-side reference average (Averager4Tests).
  test("avg") {
    // Carries the reference implementation for a given (input, output) type pair.
    case class Averager4Tests[A, B](avg: Seq[A] => B)

    def prop[A: TypedEncoder, Out: TypedEncoder : Numeric](xs: List[A])(
      implicit
      averageable: CatalystAverageable[A, Out],
      averager: Averager4Tests[A, Out]
    ): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)

      val datasetAvg: Vector[Out] = dataset.agg(avg(A)).collect().run().toVector

      // The aggregation is expected to produce at most one row (none for empty input, see the
      // Nil case below); the previous `> 2` guard let a two-row result slip through.
      if (datasetAvg.size > 1) falsified
      else xs match {
        case Nil => datasetAvg ?= Vector()
        case _ :: _ => datasetAvg.headOption match {
          case Some(x) => approximatelyEqual(averager.avg(xs), x)
          case None => falsified
        }
      }
    }

    // Replicate Spark's behaviour : If the datatype isn't BigDecimal cast type to Double
    // https://github.com/apache/spark/blob/7eb2ca8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L50
    implicit def averageDecimal = Averager4Tests[BigDecimal, BigDecimal](as => as.sum/as.size)
    implicit def averageDouble = Averager4Tests[Double, Double](as => as.sum/as.size)
    implicit def averageLong = Averager4Tests[Long, Double](as => as.map(_.toDouble).sum/as.size)
    implicit def averageInt = Averager4Tests[Int, Double](as => as.map(_.toDouble).sum/as.size)
    implicit def averageShort = Averager4Tests[Short, Double](as => as.map(_.toDouble).sum/as.size)

    /* under 3.4 an oddity was detected:
      Falsified after 2 successful property evaluations.
Location: (AggregateFunctionsTests.scala:127)
      [info] Occurred when passed generated values (
      [info] arg0 = List("-1", "9223372036854775807", "-9223372036854775808")
      [info] )

      which is odd given it's strings and not the Long's that should have been there, but also not seemingly reproducible with just longs
     */
    tolerantRun(_.isInstanceOf[GeneratorDrivenPropertyCheckFailedException]) {
      check(forAll(prop[BigDecimal, BigDecimal] _))
      check(forAll(prop[Double, Double] _))
      check(forAll(prop[Long, Double] _))
      check(forAll(prop[Int, Double] _))
      check(forAll(prop[Short, Double] _))
    }
  }

  // Compares the typed `stddev` / `variance` aggregates with the sample statistics computed by
  // Spark's RDD API over the same values converted to Double.
  test("stddev and variance") {
    def prop[A: TypedEncoder : CatalystVariance : Numeric](xs: List[A]): Prop = {
      val numeric = implicitly[Numeric[A]]
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)

      val datasetStdOpt = dataset.agg(stddev(A)).collect().run().toVector.headOption
      val datasetVarOpt = dataset.agg(variance(A)).collect().run().toVector.headOption

      // Build the reference RDD once and reuse the already-resolved Numeric instance instead
      // of summoning it (and parallelizing the same data) twice.
      val doubles = sc.parallelize(xs.map(numeric.toDouble))
      val std = doubles.sampleStdev()
      val `var` = doubles.sampleVariance()

      (datasetStdOpt, datasetVarOpt) match {
        case (Some(datasetStd), Some(datasetVar)) =>
          approximatelyEqual(datasetStd, std) && approximatelyEqual(datasetVar, `var`)
        // When either aggregate yields no row there is nothing to compare.
        case _ => proved
      }
    }

    check(forAll(prop[Short] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Double] _))
  }

  // Literal aggregates (`count().lit(1)`, `litAggr`) must come back unchanged next to a real count.
  test("litAggr") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder](xs: List[A], b: B, c: C): Prop = {
      val dataset = TypedDataset.create(xs)
      val (r1, rb, rc, rcount) = dataset.agg(count().lit(1), litAggr(b), litAggr(c), count()).collect().run().head
      (rcount ?= xs.size.toLong) && (r1 ?= 1) && (rb ?= b) && (rc ?= c)
    }

    check(forAll(prop[Boolean, Int, String] _))
    check(forAll(prop[Option[Boolean], Vector[Option[Vector[Char]]], Long] _))
  }

  test("count") {
    def prop[A: TypedEncoder](xs: List[A]): Prop = {
      val dataset =
TypedDataset.create(xs)
      // The pattern binding requires the aggregation to return exactly one row.
      val Vector(datasetCount) = dataset.agg(count()).collect().run().toVector

      datasetCount ?= xs.size.toLong
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Byte] _))
  }

  // Counting a specific column must equal the number of input rows.
  test("count('a)") {
    def prop[A: TypedEncoder](xs: List[A]): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)
      val datasetCount = dataset.agg(count(A)).collect().run()

      datasetCount ?= List(xs.size.toLong)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Byte] _))
  }

  // `max` must agree with reducing the input by the Scala Ordering (empty input -> empty result).
  test("max") {
    def prop[A: TypedEncoder: CatalystOrdered](xs: List[A])(implicit o: Ordering[A]): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)
      val datasetMax = dataset.agg(max(A)).collect().run().toList

      datasetMax ?= xs.reduceOption[A](o.max).toList
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[String] _))
  }

  // An aggregate column can take part in further arithmetic (`max(A) * 2`).
  test("max with follow up multiplication") {
    def prop(xs: List[Long]): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[Long]('a)
      val datasetMax = dataset.agg(max(A) * 2).collect().run().headOption

      datasetMax ?= (if(xs.isEmpty) None else Some(xs.max * 2))
    }

    check(forAll(prop _))
  }

  // `min` must agree with reducing the input by the Scala Ordering (empty input -> empty result).
  test("min") {
    def prop[A: TypedEncoder: CatalystOrdered](xs: List[A])(implicit o: Ordering[A]): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)

      val datasetMin = dataset.agg(min(A)).collect().run().toList

      datasetMin ?= xs.reduceOption[A](o.min).toList
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[String] _))
  }

  // `first` must return the head of the input, or nothing for empty input.
  test("first") {
    def prop[A: TypedEncoder](xs: List[A]): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)

      val datasetFirst = dataset.agg(first(A)).collect().run().toList

      datasetFirst ?= xs.headOption.toList
    }

    check(forAll(prop[BigDecimal]
_))
    check(forAll(prop[Long] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[String] _))
  }

  // `last` must return the last input element, or nothing for empty input.
  test("last") {
    def prop[A: TypedEncoder](xs: List[A]): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)

      val datasetLast = dataset.agg(last(A)).collect().run().toList

      datasetLast ?= xs.lastOption.toList
    }

    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[String] _))
  }

  // Generator for simplified and focused aggregation examples:
  // 200 pairs sharing the single key 1, with values drawn from 10..100.
  def getLowCardinalityKVPairs: Gen[Vector[(Int, Int)]] = {
    val kvPairGen: Gen[(Int, Int)] = for {
      k <- Gen.const(1) // key
      v <- Gen.choose(10, 100) // values
    } yield (k, v)

    Gen.listOfN(200, kvPairGen).map(_.toVector)
  }

  // Per-key distinct counts must match a Scala-side groupBy + distinct.
  test("countDistinct") {
    check {
      forAll(getLowCardinalityKVPairs) { xs: Vector[(Int, Int)] =>
        val tds = TypedDataset.create(xs)
        val tdsRes: Seq[(Int, Long)] = tds.groupBy(tds('_1)).agg(countDistinct(tds('_2))).collect().run()
        tdsRes.toMap ?= xs.groupBy(_._1).mapValues(_.map(_._2).distinct.size.toLong).toSeq.toMap
      }
    }
  }

  test("approxCountDistinct") {
    // Simple version of #approximatelyEqual()
    // Default maximum estimation error of HyperLogLog in Spark is 5%
    // The relative error is accepted up to twice the allowed deviation.
    def approxEqual(actual: Long, estimated: Long, allowedDeviationPercentile: Double = 0.05): Boolean = {
      val delta: Long = Math.abs(actual - estimated)
      delta / actual.toDouble < allowedDeviationPercentile * 2
    }

    // Default error bound: the approximate count is compared against the exact countDistinct.
    check {
      forAll(getLowCardinalityKVPairs) { xs: Vector[(Int, Int)] =>
        val tds = TypedDataset.create(xs)
        val tdsRes: Seq[(Int, Long, Long)] =
          tds.groupBy(tds('_1)).agg(countDistinct(tds('_2)), approxCountDistinct(tds('_2))).collect().run()
        tdsRes.forall { case (_, v1, v2) => approxEqual(v1, v2) }
      }
    }

    // Explicit error bound passed to approxCountDistinct.
    check {
      forAll(getLowCardinalityKVPairs) { xs: Vector[(Int, Int)] =>
        val tds = TypedDataset.create(xs)
        val allowedError = 0.1
// 10%
        val tdsRes: Seq[(Int, Long, Long)] =
          tds.groupBy(tds('_1)).agg(countDistinct(tds('_2)), approxCountDistinct(tds('_2), allowedError)).collect().run()
        tdsRes.forall { case (_, v1, v2) => approxEqual(v1, v2, allowedError) }
      }
    }
  }

  // Per-key collected lists must match a Scala-side groupBy; both sides are sorted because
  // the comparison should not depend on element order.
  test("collectList") {
    def prop[A: TypedEncoder : Ordering](xs: List[X2[A, A]]): Prop = {
      val tds = TypedDataset.create(xs)
      val tdsRes: Seq[(A, Vector[A])] = tds.groupBy(tds('a)).agg(collectList(tds('b))).collect().run()

      tdsRes.toMap.map { case (k, v) => k -> v.sorted } ?= xs.groupBy(_.a).map { case (k, v) => k -> v.map(_.b).toVector.sorted }
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[String] _))
  }

  // Per-key collected sets must match a Scala-side groupBy; both sides are compared as Sets.
  test("collectSet") {
    def prop[A: TypedEncoder : Ordering](xs: List[X2[A, A]]): Prop = {
      val tds = TypedDataset.create(xs)
      val tdsRes: Seq[(A, Vector[A])] = tds.groupBy(tds('a)).agg(collectSet(tds('b))).collect().run()

      tdsRes.toMap.map { case (k, v) => k -> v.toSet } ?= xs.groupBy(_.a).map { case (k, v) => k -> v.map(_.b).toSet }
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[String] _))
  }

  // Selecting a literal next to a column must pair every row's value with that literal.
  test("lit") {
    def prop[A: TypedEncoder](xs: List[X1[A]], l: A): Prop = {
      val tds = TypedDataset.create(xs)
      tds.select(tds('a), lit(l)).collect().run() ?= xs.map(x => (x.a, l))
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Vector[Vector[Int]]] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Vector[Byte]] _))
    check(forAll(prop[String] _))
    check(forAll(prop[Vector[Long]] _))
    check(forAll(prop[BigDecimal] _))
  }

  // Shared property for two-column statistics: groups by column 'a, applies `framelessFun`
  // to ('b, 'c) and compares against `sparkFun` applied to the same data in plain Spark.
  def bivariatePropTemplate[A: TypedEncoder, B: TypedEncoder]
    (
      xs: List[X3[Int, A, B]]
    )
    (
      framelessFun: (TypedColumn[X3[Int, A, B], A], TypedColumn[X3[Int, A, B], B]) => TypedAggregate[X3[Int, A, B], Option[Double]],
      sparkFun: (Column, Column) => Column
    )
    (
      implicit
      encEv: Encoder[(Int, A, B)],
      encEv2: Encoder[(Int,Option[Double])],
      evCanBeDoubleA: CatalystCast[A, Double],
      evCanBeDoubleB: CatalystCast[B,
Double]
    ): Prop = {

    val tds = TypedDataset.create(xs)
    // Typed implementation of bivar stats function
    // NaN results are mapped to None so that both sides compare with ?=.
    val tdBivar = tds.groupBy(tds('a)).agg(framelessFun(tds('b), tds('c))).deserialized.map(kv =>
      (kv._1, kv._2.flatMap(DoubleBehaviourUtils.nanNullHandler))
    ).collect().run()

    val cDF = session.createDataset(xs.map(x => (x.a, x.b, x.c)))
    // Comparison implementation of bivar stats functions
    val compBivar = cDF
      .groupBy(cDF("_1"))
      .agg(sparkFun(cDF("_2"), cDF("_3")))
      .map(
        row => {
          val grp = row.getInt(0)
          (grp, DoubleBehaviourUtils.nanNullHandler(row.get(1)))
        }
      )

    // Should be the same
    tdBivar.toMap ?= compBivar.collect().toMap
  }

  // Shared property for single-column statistics: groups by column 'a, applies `framelessFun`
  // to 'b and compares against `sparkFun` applied to the same data in plain Spark.
  def univariatePropTemplate[A: TypedEncoder]
    (
      xs: List[X2[Int, A]]
    )
    (
      framelessFun: (TypedColumn[X2[Int, A], A]) => TypedAggregate[X2[Int, A], Option[Double]],
      sparkFun: (Column) => Column
    )
    (
      implicit
      encEv: Encoder[(Int, A)],
      encEv2: Encoder[(Int,Option[Double])],
      evCanBeDoubleA: CatalystCast[A, Double]
    ): Prop = {

    val tds = TypedDataset.create(xs)
    // Typed implementation of the univariate stats function
    // NaN results are mapped to None so that both sides compare with ?=.
    val tdUnivar = tds.groupBy(tds('a)).agg(framelessFun(tds('b))).deserialized.map(kv =>
      (kv._1, kv._2.flatMap(DoubleBehaviourUtils.nanNullHandler))
    ).collect().run()

    val cDF = session.createDataset(xs.map(x => (x.a, x.b)))
    // Comparison implementation of the univariate stats function
    val compUnivar = cDF
      .groupBy(cDF("_1"))
      .agg(sparkFun(cDF("_2")))
      .map(
        row => {
          val grp = row.getInt(0)
          (grp, DoubleBehaviourUtils.nanNullHandler(row.get(1)))
        }
      )

    // Should be the same
    tdUnivar.toMap ?= compUnivar.collect().toMap
  }

  // frameless `corr` against Spark's `corr`, via the bivariate template.
  test("corr") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder, B: TypedEncoder](xs: List[X3[Int, A, B]])(
      implicit
      encEv: Encoder[(Int, A, B)],
      evCanBeDoubleA: CatalystCast[A, Double],
      evCanBeDoubleB: CatalystCast[B, Double]
    ): Prop = bivariatePropTemplate(xs)(corr[A,B,X3[Int, A, B]],org.apache.spark.sql.functions.corr)

    check(forAll(prop[Double, Double] _))
    check(forAll(prop[Double, Int] _))
    check(forAll(prop[Int, Int] _))
check(forAll(prop[Short, Int] _))
    check(forAll(prop[BigDecimal, Byte] _))
  }

  // frameless `covarPop` against Spark's `covar_pop`, via the bivariate template.
  test("covar_pop") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder, B: TypedEncoder](xs: List[X3[Int, A, B]])(
      implicit
      encEv: Encoder[(Int, A, B)],
      evCanBeDoubleA: CatalystCast[A, Double],
      evCanBeDoubleB: CatalystCast[B, Double]
    ): Prop = bivariatePropTemplate(xs)(
      covarPop[A, B, X3[Int, A, B]],
      org.apache.spark.sql.functions.covar_pop
    )

    check(forAll(prop[Double, Double] _))
    check(forAll(prop[Double, Int] _))
    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Short, Int] _))
    check(forAll(prop[BigDecimal, Byte] _))
  }

  // frameless `covarSamp` against Spark's `covar_samp`, via the bivariate template.
  test("covar_samp") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder, B: TypedEncoder](xs: List[X3[Int, A, B]])(
      implicit
      encEv: Encoder[(Int, A, B)],
      evCanBeDoubleA: CatalystCast[A, Double],
      evCanBeDoubleB: CatalystCast[B, Double]
    ): Prop = bivariatePropTemplate(xs)(
      covarSamp[A, B, X3[Int, A, B]],
      org.apache.spark.sql.functions.covar_samp
    )

    check(forAll(prop[Double, Double] _))
    check(forAll(prop[Double, Int] _))
    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Short, Int] _))
    check(forAll(prop[BigDecimal, Byte] _))
  }

  // frameless `kurtosis` against Spark's `kurtosis`, via the univariate template.
  test("kurtosis") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder](xs: List[X2[Int, A]])(
      implicit
      encEv: Encoder[(Int, A)],
      evCanBeDoubleA: CatalystCast[A, Double]
    ): Prop = univariatePropTemplate(xs)(
      kurtosis[A, X2[Int, A]],
      org.apache.spark.sql.functions.kurtosis
    )

    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
  }

  // frameless `skewness` against Spark's `skewness`, via the univariate template.
  test("skewness") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder](xs: List[X2[Int, A]])(
      implicit
      encEv: Encoder[(Int, A)],
      evCanBeDoubleA: CatalystCast[A, Double]
    ): Prop = univariatePropTemplate(xs)(
      skewness[A, X2[Int, A]],
      org.apache.spark.sql.functions.skewness
    )

    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
  }

  // frameless `stddevPop` against Spark's `stddev_pop`, via the univariate template.
  test("stddev_pop") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder](xs: List[X2[Int, A]])(
      implicit
      encEv: Encoder[(Int, A)],
      evCanBeDoubleA: CatalystCast[A, Double]
    ): Prop = univariatePropTemplate(xs)(
      stddevPop[A, X2[Int, A]],
      org.apache.spark.sql.functions.stddev_pop
    )

    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
  }

  // frameless `stddevSamp` against Spark's `stddev_samp`, via the univariate template.
  test("stddev_samp") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder](xs: List[X2[Int, A]])(
      implicit
      encEv: Encoder[(Int, A)],
      evCanBeDoubleA: CatalystCast[A, Double]
    ): Prop = univariatePropTemplate(xs)(
      stddevSamp[A, X2[Int, A]],
      org.apache.spark.sql.functions.stddev_samp
    )

    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/functions/DateTimeStringBehaviourUtils.scala
================================================
package frameless.functions

import org.apache.spark.sql.Row

object DateTimeStringBehaviourUtils {
  // Reads column 0 of a Row: an Int becomes Some(i); any other value (including null) becomes None.
  val nullHandler: Row => Option[Int] = _.get(0) match {
    case i: Int => Some(i)
    case _ => None
  }
}

================================================
FILE: dataset/src/test/scala/frameless/functions/DoubleBehaviourUtils.scala
================================================
package frameless
package functions

/**
  * Some statistical functions in Spark can result in Double, Double.NaN or Null.
  * This tends to break ?= of the property based testing. Use the nanNullHandler function
  * here to alleviate this by mapping this NaN and Null to None. This will result in
  * functioning comparison again.
  */
object DoubleBehaviourUtils {
  // Mapping with this function is needed because spark uses Double.NaN for some semantics in the
  // correlation function.
?= for prop testing will use == underlying and will break because Double.NaN != Double.NaN
  // Maps NaN to None and every other double to Some(value).
  private val nanHandler: Double => Option[Double] = value =>
    if (value.isNaN) None else Some(value)

  // Making sure that null => None and does not result in 0.0d because of row.getAs[Double]'s use of .asInstanceOf
  // Any value that is neither null nor a Double is rejected with a message naming the
  // offending value, instead of the uninformative NotImplementedError raised by `???`.
  val nanNullHandler: Any => Option[Double] = {
    case null => None
    case d: Double => nanHandler(d)
    case other =>
      throw new IllegalArgumentException(
        s"nanNullHandler expects null or a Double, but got: $other (${other.getClass.getName})"
      )
  }
}

================================================
FILE: dataset/src/test/scala/frameless/functions/NonAggregateFunctionsTests.scala
================================================
package frameless
package functions

import java.io.File

import java.util.Base64
import java.nio.charset.StandardCharsets

import frameless.functions.nonAggregate._
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.{Column, Encoder, SaveMode, functions => sparkFunctions}
import org.scalacheck.Prop._
import org.scalacheck.{Arbitrary, Gen, Prop}

import scala.annotation.nowarn

class NonAggregateFunctionsTests extends TypedDatasetSuite {
  // Directory removed again in afterAll().
  val testTempFiles = "target/testoutput"

  // Generators whose lower bound is 1 for every numeric type; the Double generator is the
  // ratio of two such Ints.
  object NonNegativeGenerators {
    val doubleGen = for {
      s <- Gen.chooseNum(1, Int.MaxValue)
      e <- Gen.chooseNum(1, Int.MaxValue)
      res: Double = s.toDouble / e.toDouble
    } yield res

    val intGen: Gen[Int] = Gen.chooseNum(1, Int.MaxValue)
    val shortGen: Gen[Short] = Gen.chooseNum(1, Short.MaxValue)
    val longGen: Gen[Long] = Gen.chooseNum(1, Long.MaxValue)
    val byteGen: Gen[Byte] = Gen.chooseNum(1, Byte.MaxValue)
  }

  // Arbitrary instances backed by NonNegativeGenerators, to be imported where needed.
  object NonNegativeArbitraryNumericValues {
    import NonNegativeGenerators._
    implicit val arbInt: Arbitrary[Int] = Arbitrary(intGen)
    implicit val arbDouble: Arbitrary[Double] = Arbitrary(doubleGen)
    implicit val arbLong: Arbitrary[Long] = Arbitrary(longGen)
    implicit val arbShort: Arbitrary[Short] = Arbitrary(shortGen)
    implicit val arbByte: Arbitrary[Byte] = Arbitrary(byteGen)
  }

  private val base64Encoder = Base64.getEncoder
  private def base64X1String(x1: X1[String]): X1[String] = {
    def
base64(str: String): String = base64Encoder.encodeToString(str.getBytes(StandardCharsets.UTF_8))

    x1.copy(a = base64(x1.a))
  }

  // Removes the temporary output directory before delegating to the parent suite's cleanup.
  override def afterAll(): Unit = {
    FileUtils.deleteDirectory(new File(testTempFiles))
    super.afterAll()
  }

  // frameless `negate` against Spark's `negate` on the same rows.
  test("negate") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder](values: List[X1[A]])(
      implicit encX1:Encoder[X1[A]],
      catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, B]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.negate(cDS("a")))
        .map(_.getAs[B](0))
        .collect()
        .toList

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(negate(col))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Byte, Byte] _))
    check(forAll(prop[Short, Short] _))
    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Long, Long] _))
    check(forAll(prop[BigDecimal, java.math.BigDecimal] _))
  }

  // frameless `not` against Spark's `not` on the same rows.
  test("not") {
    val spark = session
    import spark.implicits._

    // `not` takes no extra arguments, so the property only needs the generated rows; the
    // former `fromBase` / `toBase` parameters were unused leftovers copied from "conv".
    def prop(values: List[X1[Boolean]])(implicit encX1:Encoder[X1[Boolean]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.not(cDS("a")))
        .map(_.getAs[Boolean](0))
        .collect()
        .toList

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(not(col))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop _))
  }

  // frameless `conv` against Spark's `conv` for generated strings and bases.
  test("conv") {
    val spark = session
    import spark.implicits._

    def prop(values: List[X1[String]], fromBase: Int, toBase: Int)(implicit encX1:Encoder[X1[String]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.conv(cDS("a"), fromBase, toBase))
        .map(_.getAs[String](0))
        .collect()
        .toList

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(conv(col, fromBase, toBase))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop _))
  }

  test("degrees") {
    val spark = session
    import
spark.implicits._

    // frameless `degrees` against Spark's `degrees` on the same rows.
    def prop[A: TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.degrees(cDS("a")))
        .map(_.getAs[Double](0))
        .collect()
        .toList

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(degrees(col))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Byte] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[BigDecimal] _))
  }

  // Shared property for the bit-shift tests: selects `typedCol` through frameless and
  // `sparkFunc($"a", numBits)` through plain Spark on the same dataset, then compares.
  def propBitShift[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder](typedDS: TypedDataset[X1[A]])
    (typedCol: TypedColumn[X1[A], B], sparkFunc: (Column,Int) => Column, numBits: Int): Prop = {
    val spark = session
    import spark.implicits._

    val resCompare = typedDS.dataset
      .select(sparkFunc($"a", numBits))
      .map(_.getAs[B](0))
      .collect()
      .toList

    val res = typedDS
      .select(typedCol)
      .collect()
      .run()
      .toList

    res ?= resCompare
  }

  test("shiftRightUnsigned") {
    val spark = session
    import spark.implicits._

    @nowarn // suppress sparkFunctions.shiftRightUnsigned call which is used to maintain Spark 3.1.x backwards compat
    def prop[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder]
    (values: List[X1[A]], numBits: Int)
    (implicit catalystBitShift: CatalystBitShift[A, B], encX1: Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      propBitShift(typedDS)(shiftRightUnsigned(typedDS('a), numBits), sparkFunctions.shiftRightUnsigned, numBits)
    }

    check(forAll(prop[Byte, Int] _))
    check(forAll(prop[Short, Int] _))
    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Long, Long] _))
    check(forAll(prop[BigDecimal, Int] _))
  }

  test("shiftRight") {
    val spark = session
    import spark.implicits._

    @nowarn // suppress sparkFunctions.shiftRight call which is used to maintain Spark 3.1.x backwards compat
    def prop[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder]
    (values: List[X1[A]], numBits: Int)
    (implicit catalystBitShift: CatalystBitShift[A, B],
encX1: Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      propBitShift(typedDS)(shiftRight(typedDS('a), numBits), sparkFunctions.shiftRight, numBits)
    }

    check(forAll(prop[Byte, Int] _))
    check(forAll(prop[Short, Int] _))
    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Long, Long] _))
    check(forAll(prop[BigDecimal, Int] _))
  }

  test("shiftLeft") {
    val spark = session
    import spark.implicits._

    @nowarn // suppress sparkFunctions.shiftLeft call which is used to maintain Spark 3.1.x backwards compat
    def prop[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder]
    (values: List[X1[A]], numBits: Int)
    (implicit catalystBitShift: CatalystBitShift[A, B], encX1: Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      propBitShift(typedDS)(shiftLeft(typedDS('a), numBits), sparkFunctions.shiftLeft, numBits)
    }

    check(forAll(prop[Byte, Int] _))
    check(forAll(prop[Short, Int] _))
    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Long, Long] _))
    check(forAll(prop[BigDecimal, Int] _))
  }

  // frameless `ceil` against Spark's `ceil` on the same rows.
  test("ceil") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder]
    (values: List[X1[A]])(
      implicit catalystAbsolute: CatalystRound[A, B], encX1: Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)
      // java.math.BigDecimal results from Spark are rescaled to scale 0 before comparing.
      val resCompare = cDS
        .select(sparkFunctions.ceil(cDS("a")))
        .map(_.getAs[B](0))
        .collect()
        .toList.map{
          case bigDecimal : java.math.BigDecimal => bigDecimal.setScale(0)
          case other => other
        }.asInstanceOf[List[B]]

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(ceil(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int, Long] _))
    check(forAll(prop[Long, Long] _))
    check(forAll(prop[Short, Long] _))
    check(forAll(prop[Double, Long] _))
    check(forAll(prop[BigDecimal, java.math.BigDecimal] _))
  }

  // frameless `sha2` against Spark's `sha2` for each of the bit lengths listed below.
  test("sha2") {
    val spark = session
    import spark.implicits._

    def prop(values: List[X1[Array[Byte]]])(implicit encX1: Encoder[X1[Array[Byte]]]) = {
      Seq(224, 256, 384, 512).map { numBits =>
        val cDS =
session.createDataset(values)
        val resCompare = cDS
          .select(sparkFunctions.sha2(cDS("a"), numBits))
          .map(_.getAs[String](0))
          .collect().toList

        val typedDS = TypedDataset.create(values)
        val res = typedDS
          .select(sha2(typedDS('a), numBits))
          .collect()
          .run()
          .toList
        res ?= resCompare
      // All per-bit-length properties must hold together.
      }.reduce(_ && _)
    }

    check(forAll(prop _))
  }

  // frameless `sha1` against Spark's `sha1` on the same rows.
  test("sha1") {
    val spark = session
    import spark.implicits._

    def prop(values: List[X1[Array[Byte]]])(implicit encX1: Encoder[X1[Array[Byte]]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.sha1(cDS("a")))
        .map(_.getAs[String](0))
        .collect().toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(sha1(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop _))
  }

  // frameless `crc32` against Spark's `crc32` on the same rows.
  test("crc32") {
    val spark = session
    import spark.implicits._

    def prop(values: List[X1[Array[Byte]]])(implicit encX1: Encoder[X1[Array[Byte]]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.crc32(cDS("a")))
        .map(_.getAs[Long](0))
        .collect()
        .toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(crc32(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop _))
  }

  // frameless `floor` against Spark's `floor` on the same rows.
  test("floor") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder]
    (values: List[X1[A]])(
      implicit catalystAbsolute: CatalystRound[A, B], encX1: Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)
      // java.math.BigDecimal results from Spark are rescaled to scale 0 before comparing.
      val resCompare = cDS
        .select(sparkFunctions.floor(cDS("a")))
        .map(_.getAs[B](0))
        .collect()
        .toList.map{
          case bigDecimal : java.math.BigDecimal => bigDecimal.setScale(0)
          case other => other
        }.asInstanceOf[List[B]]

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(floor(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int, Long] _))
    check(forAll(prop[Long, Long] _))
    check(forAll(prop[Short, Long] _))
    check(forAll(prop[Double, Long] _))
    check(forAll(prop[BigDecimal,
java.math.BigDecimal] _))
  }

  // frameless `abs` against Spark's `abs` for the BigDecimal -> java.math.BigDecimal case.
  test("abs big decimal") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder: Encoder, B: TypedEncoder: Encoder]
      (values: List[X1[A]])
      (
        implicit catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, B],
        encX1:Encoder[X1[A]]
      )= {
        val cDS = session.createDataset(values)
        val resCompare = cDS
          .select(sparkFunctions.abs(cDS("a")))
          .map(_.getAs[B](0))
          .collect().toList

        val typedDS = TypedDataset.create(values)
        val col = typedDS('a)
        val res = typedDS
          .select(
            abs(col)
          )
          .collect()
          .run()
          .toList

        res ?= resCompare
      }

    check(forAll(prop[BigDecimal, java.math.BigDecimal] _))
  }

  // frameless `abs` against Spark's `abs` where input and output types coincide.
  test("abs") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder]
    (values: List[X1[A]])
    (
      implicit catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, A],
      encX1: Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.abs(cDS("a")))
        .map(_.getAs[A](0))
        .collect().toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(abs(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Double] _))
  }

  // Shared property for Double-valued unary functions: selects `typedCol` through frameless
  // and `sparkFunc($"a")` through plain Spark, maps NaN/null to None on both sides, compares.
  def propTrigonometric[A: CatalystNumeric: TypedEncoder : Encoder](typedDS: TypedDataset[X1[A]])
    (typedCol: TypedColumn[X1[A], Double], sparkFunc: Column => Column): Prop = {
      val spark = session
      import spark.implicits._

      val resCompare = typedDS.dataset
        .select(sparkFunc($"a"))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res = typedDS
        .select(typedCol)
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      res ?= resCompare
  }

  test("cos") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(cos(typedDS('a)), sparkFunctions.cos)
    }
check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  // frameless `cosh` against Spark's `cosh`, via propTrigonometric.
  test("cosh") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(cosh(typedDS('a)), sparkFunctions.cosh)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  // frameless `acos` against Spark's `acos`, via propTrigonometric.
  test("acos") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(acos(typedDS('a)), sparkFunctions.acos)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  // frameless `signum` against Spark's `signum`, via propTrigonometric.
  test("signum") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(signum(typedDS('a)), sparkFunctions.signum)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  // frameless `sin` against Spark's `sin`, via propTrigonometric.
  test("sin") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(sin(typedDS('a)), sparkFunctions.sin)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
check(forAll(prop[Double] _))
  }

  // frameless `sinh` against Spark's `sinh`, via propTrigonometric.
  test("sinh") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(sinh(typedDS('a)), sparkFunctions.sinh)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  // frameless `asin` against Spark's `asin`, via propTrigonometric.
  test("asin") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(asin(typedDS('a)), sparkFunctions.asin)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  // frameless `tan` against Spark's `tan`, via propTrigonometric.
  test("tan") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(tan(typedDS('a)), sparkFunctions.tan)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  // frameless `tanh` against Spark's `tanh`, via propTrigonometric.
  test("tanh") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(tanh(typedDS('a)), sparkFunctions.tanh)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  /*
   * Currently not all Collection types play nice with the Encoders.
* This test needs to be readdressed and Set re-added to the Collection Typeclass once these issues are resolved.
   *
   * [[https://issues.apache.org/jira/browse/SPARK-18891]]
   * [[https://issues.apache.org/jira/browse/SPARK-21204]]
   */
  test("arrayContains"){
    val spark = session
    import spark.implicits._

    val listLength = 10
    // Endless cycle over the valid indices 0 until listLength, shared by all property runs.
    val idxs = Stream.continually(Range(0, listLength)).flatten.toIterator

    // Element access by index, abstracted over the collection type under test.
    abstract class Nth[A, C[A]:CatalystCollection] {

      def nth(c:C[A], idx:Int):A
    }

    implicit def deriveListNth[A] : Nth[A, List] = new Nth[A, List] {
      override def nth(c: List[A], idx: Int): A = c(idx)
    }

    implicit def deriveSeqNth[A] : Nth[A, Seq] = new Nth[A, Seq] {
      override def nth(c: Seq[A], idx: Int): A = c(idx)
    }

    implicit def deriveVectorNth[A] : Nth[A, Vector] = new Nth[A, Vector] {
      override def nth(c: Vector[A], idx: Int): A = c(idx)
    }

    implicit def deriveArrayNth[A] : Nth[A, Array] = new Nth[A, Array] {
      override def nth(c: Array[A], idx: Int): A = c(idx)
    }

    def prop[C[_] : CatalystCollection]
      (
        values: C[Int],
        shouldBeIn:Boolean)
      (
        implicit nth:Nth[Int, C],
        encEv: Encoder[C[Int]],
        tEncEv: TypedEncoder[C[Int]]
      ) = {

      // Either an element taken from the collection, or -1, which the generators below
      // (values chosen from 0 to 100) never produce.
      val contained = if (shouldBeIn) nth.nth(values, idxs.next) else -1

      val cDS = session.createDataset(List(values))
      val resCompare = cDS
        .select(sparkFunctions.array_contains(cDS("value"), contained))
        .map(_.getAs[Boolean](0))
        .collect().toList

      val typedDS = TypedDataset.create(List(X1(values)))
      val res = typedDS
        .select(arrayContains(typedDS('a), contained))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(
      forAll(
        Gen.listOfN(listLength, Gen.choose(0,100)),
        Gen.oneOf(true,false)
      )
      (prop[List])
    )

    /*check( Looks like there is no Typed Encoder for Seq type yet
      forAll(
        Gen.listOfN(listLength, Gen.choose(0,100)),
        Gen.oneOf(true,false)
      )
      (prop[Seq])
    )*/

    check(
      forAll(
        Gen.listOfN(listLength, Gen.choose(0,100)).map(_.toVector),
        Gen.oneOf(true,false)
      )
      (prop[Vector])
    )

    check(
      forAll(
        Gen.listOfN(listLength, Gen.choose(0,100)).map(_.toArray),
        Gen.oneOf(true,false)
      )
      (prop[Array])
    )
  }
test("atan") {
    // Typed `atan` must agree with Spark's `atan`, both as a projected column
    // and when applied to an aggregated (`first`) column.
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder]
    (na: A, values: List[X1[A]])(implicit encX1: Encoder[X1[A]]) = {
      // `na` is prepended so the dataset is never empty (needed by first()).
      val cDS = session.createDataset(X1(na) :: values)
      val resCompare = cDS
        .select(sparkFunctions.atan(cDS("a")))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val typedDS = TypedDataset.create(cDS)
      val res = typedDS
        .select(atan(typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      val aggrTyped = typedDS.agg(atan(
        frameless.functions.aggregate.first(typedDS('a)))
      ).firstOption().run().get

      val aggrSpark = cDS.select(
        sparkFunctions.atan(sparkFunctions.first("a")).as[Double]
      ).first()

      (res ?= resCompare).&&(aggrTyped ?= aggrSpark)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("atan2") {
    // Two-column `atan2`: typed result vs. Spark, for columns and aggregates.
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder,
             B: CatalystNumeric : TypedEncoder : Encoder](na: X2[A, B], values: List[X2[A, B]])
            (implicit encEv: Encoder[X2[A,B]]) = {
      val cDS = session.createDataset(na +: values)
      val resCompare = cDS
        .select(sparkFunctions.atan2(cDS("a"), cDS("b")))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val typedDS = TypedDataset.create(cDS)
      val res = typedDS
        .select(atan2(typedDS('a), typedDS('b)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      val aggrTyped = typedDS.agg(atan2(
        frameless.functions.aggregate.first(typedDS('a)),
        frameless.functions.aggregate.first(typedDS('b)))
      ).firstOption().run().get

      val aggrSpark = cDS.select(
        sparkFunctions.atan2(sparkFunctions.first("a"),sparkFunctions.first("b")).as[Double]
      ).first()

      (res ?= resCompare).&&(aggrTyped ?= aggrSpark)
    }

    check(forAll(prop[Int, Long] _))
    check(forAll(prop[Long, Int] _))
    check(forAll(prop[Short, Byte] _))
    check(forAll(prop[BigDecimal, Double] _))
    check(forAll(prop[Byte, Int] _))
    check(forAll(prop[Double, Double] _))
  }

  test("atan2LitLeft") {
    // `atan2` with a Double literal as the first argument.
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder]
    (na: X1[A], value: List[X1[A]], lit:Double)(implicit encX1:Encoder[X1[A]]) = {
      val cDS = session.createDataset(na +: value)
      val resCompare = cDS
        .select(sparkFunctions.atan2(lit, cDS("a")))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val typedDS = TypedDataset.create(cDS)
      val res = typedDS
        .select(atan2(lit, typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      val aggrTyped = typedDS.agg(atan2(
        lit,
        frameless.functions.aggregate.first(typedDS('a)))
      ).firstOption().run().get

      val aggrSpark = cDS.select(
        sparkFunctions.atan2(lit, sparkFunctions.first("a")).as[Double]
      ).first()

      (res ?= resCompare).&&(aggrTyped ?= aggrSpark)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("atan2LitRight") {
    // `atan2` with a Double literal as the second argument.
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder]
    (na: X1[A], value: List[X1[A]], lit:Double)(implicit encX1:Encoder[X1[A]]) = {
      val cDS = session.createDataset(na +: value)
      val resCompare = cDS
        .select(sparkFunctions.atan2(cDS("a"), lit))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val typedDS = TypedDataset.create(cDS)
      val res = typedDS
        .select(atan2(typedDS('a), lit))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      val aggrTyped = typedDS.agg(atan2(
        frameless.functions.aggregate.first(typedDS('a)),
        lit)
      ).firstOption().run().get

      val aggrSpark = cDS.select(
        sparkFunctions.atan2(sparkFunctions.first("a"), lit).as[Double]
      ).first()

      (res ?= resCompare).&&(aggrTyped ?= aggrSpark)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  /**
    * Shared property for unary math functions that produce a Double.
    *
    * Selects `typedCol` through frameless and `sparkFunc($"a")` through the
    * underlying Spark dataset, passes both results through
    * `DoubleBehaviourUtils.nanNullHandler`, and compares the two lists.
    *
    * @param typedDS   dataset whose column `a` is the function input
    * @param typedCol  the frameless column expression under test
    * @param sparkFunc the Spark function applied to column `a` as reference
    */
  def mathProp[A: CatalystNumeric: TypedEncoder : Encoder](typedDS: TypedDataset[X1[A]])(
    typedCol: TypedColumn[X1[A], Double], sparkFunc: Column => Column
  ): Prop = {
    val spark = session
    import spark.implicits._

    val resCompare = typedDS.dataset
      .select(sparkFunc($"a"))
      .map(_.getAs[Double](0))
      .map(DoubleBehaviourUtils.nanNullHandler)
      .collect().toList

    val res = typedDS
      .select(typedCol)
      .deserialized
      .map(DoubleBehaviourUtils.nanNullHandler)
      .collect()
      .run()
      .toList

    res ?= resCompare
  }

  test("sqrt") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(sqrt(typedDS('a)), sparkFunctions.sqrt)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  // Name corrected from "crbt": the function under test is `cbrt`.
  test("cbrt") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(cbrt(typedDS('a)), sparkFunctions.cbrt)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("exp") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(exp(typedDS('a)), sparkFunctions.exp)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("md5") {
    // Typed `md5` vs. Spark's `md5` on the same column.
    val spark = session
    import spark.implicits._

    // The session implicits imported above are reused here; the former inner
    // re-declaration of `spark` only shadowed them.
    def prop[A: TypedEncoder : Encoder](values: List[X1[A]]): Prop = {
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.md5($"a"))
        .map(_.getAs[String](0))
        .collect().toList

      val res = typedDS
        .select(md5(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[String] _))
  }

  test("factorial") {
    // Typed `factorial` vs. Spark's `factorial` on a Long column.
    def prop(values: List[X1[Long]]): Prop = {
      val spark = session
      import spark.implicits._
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.factorial($"a"))
        .map(_.getAs[Long](0))
        .collect().toList

      val res = typedDS
        .select(factorial(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop _))
  }

  test("round") {
    // `round` on types whose result type equals the input type.
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder](values: List[X1[A]])(
      implicit catalystNumericWithJavaBigDecimal: CatalystNumericWithJavaBigDecimal[A, A],
      encX1: Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.round(cDS("a")))
        .map(_.getAs[A](0))
        .collect().toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(round(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Double] _))
  }

  test("round big decimal") {
    // `round` on BigDecimal input, compared as java.math.BigDecimal.
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder: Encoder](values: List[X1[A]])(
      implicit catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, java.math.BigDecimal],
      encX1:Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)

      // The Spark-side values are normalised with setScale(0) before comparing.
      val resCompare = cDS
        .select(sparkFunctions.round(cDS("a")))
        .map(_.getAs[java.math.BigDecimal](0))
        .collect()
        .toList.map(_.setScale(0))

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(round(col))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[BigDecimal] _))
  }

  test("round with scale") {
    // `round(col, 1)` on types whose result type equals the input type.
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder](values: List[X1[A]])(
      implicit catalystNumericWithJavaBigDecimal: CatalystNumericWithJavaBigDecimal[A, A],
      encX1: Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.round(cDS("a"), 1))
        .map(_.getAs[A](0))
        .collect().toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(round(typedDS('a), 1))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Double] _))
  }

  test("round big decimal with scale") {
    // `round(col, 0)` on BigDecimal input, compared as java.math.BigDecimal.
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder: Encoder](values: List[X1[A]])(
      implicit catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, java.math.BigDecimal],
      encX1:Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)

      val resCompare = cDS
        .select(sparkFunctions.round(cDS("a"), 0))
        .map(_.getAs[java.math.BigDecimal](0))
        .collect()
        .toList.map(_.setScale(0))

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(round(col, 0))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[BigDecimal] _))
  }

  test("bround") {
    // `bround` on types whose result type equals the input type.
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder](values: List[X1[A]])(
      implicit catalystNumericWithJavaBigDecimal: CatalystNumericWithJavaBigDecimal[A, A],
      encX1: Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.bround(cDS("a")))
        .map(_.getAs[A](0))
        .collect().toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(bround(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Double] _))
  }
test("bround big decimal") {
    // `bround` on BigDecimal input, compared as java.math.BigDecimal.
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder: Encoder](values: List[X1[A]])(
      implicit catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, java.math.BigDecimal],
      encX1:Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)

      // The Spark-side values are normalised with setScale(0) before comparing.
      val resCompare = cDS
        .select(sparkFunctions.bround(cDS("a")))
        .map(_.getAs[java.math.BigDecimal](0))
        .collect()
        .toList.map(_.setScale(0))

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(bround(col))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[BigDecimal] _))
  }

  test("bround with scale") {
    // `bround(col, 1)` on types whose result type equals the input type.
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder](values: List[X1[A]])(
      implicit catalystNumericWithJavaBigDecimal: CatalystNumericWithJavaBigDecimal[A, A],
      encX1: Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.bround(cDS("a"), 1))
        .map(_.getAs[A](0))
        .collect().toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(bround(typedDS('a), 1))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Double] _))
  }

  test("bround big decimal with scale") {
    // `bround(col, 0)` on BigDecimal input, compared as java.math.BigDecimal.
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder: Encoder](values: List[X1[A]])(
      implicit catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, java.math.BigDecimal],
      encX1:Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)

      val resCompare = cDS
        .select(sparkFunctions.bround(cDS("a"), 0))
        .map(_.getAs[java.math.BigDecimal](0))
        .collect()
        .toList.map(_.setScale(0))

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(bround(col, 0))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[BigDecimal] _))
  }

  test("log with base") {
    // `log(base, col)`: typed result vs. Spark, inputs drawn from
    // NonNegativeArbitraryNumericValues.
    val spark = session
    import spark.implicits._
    import NonNegativeArbitraryNumericValues._

    def prop[A: CatalystNumeric: TypedEncoder : Encoder](
      values: List[X1[A]],
      base: Double
    ): Prop = {
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.log(base, $"a"))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res = typedDS
        .select(log(base, typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("log") {
    val spark = session
    import spark.implicits._
    import NonNegativeArbitraryNumericValues._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(log(typedDS('a)), sparkFunctions.log)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("log2") {
    val spark = session
    import spark.implicits._
    import NonNegativeArbitraryNumericValues._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(log2(typedDS('a)), sparkFunctions.log2)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("log1p") {
    val spark = session
    import spark.implicits._
    import NonNegativeArbitraryNumericValues._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(log1p(typedDS('a)), sparkFunctions.log1p)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("log10") {
    val spark = session
    import spark.implicits._
    import NonNegativeArbitraryNumericValues._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(log10(typedDS('a)), sparkFunctions.log10)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("base64") {
    // Typed `base64` vs. Spark, plus a base64 -> unbase64 -> base64 round trip.
    val spark = session
    import spark.implicits._

    def prop(values:List[X1[Array[Byte]]])(implicit encX1:Encoder[X1[Array[Byte]]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.base64(cDS("a")))
        .map(_.getAs[String](0))
        .collect().toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(base64(typedDS('a)))
        .collect()
        .run()
        .toList

      val backAndForth = typedDS
        .select(base64(unbase64(base64(typedDS('a)))))
        .collect()
        .run()
        .toList

      (res ?= resCompare) && (res ?= backAndForth)
    }

    check(forAll(prop _))
  }

  test("hypot with double") {
    // Both typed argument orders are compared against the single Spark
    // reference `hypot(base, $"a")`.
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric: TypedEncoder : Encoder](
      values: List[X1[A]],
      base: Double
    ): Prop = {
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.hypot(base, $"a"))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res2 = typedDS
        .select(hypot(typedDS('a), base))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      val res = typedDS
        .select(hypot(base, typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      (res ?= resCompare) && (res2 ?= resCompare)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("hypot with two columns") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric: TypedEncoder : Encoder](
      values: List[X2[A, A]]
    ): Prop = {
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.hypot($"b", $"a"))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res = typedDS
        .select(hypot(typedDS('b), typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("pow with double") {
    // `pow` with the Double literal on either side, each against the matching
    // Spark call.
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric: TypedEncoder : Encoder](
      values: List[X1[A]],
      base: Double
    ): Prop = {
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.pow(base, $"a"))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res = typedDS
        .select(pow(base, typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      val resCompare2 = typedDS.dataset
        .select(sparkFunctions.pow($"a", base))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res2 = typedDS
        .select(pow(typedDS('a), base))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      (res ?= resCompare) && (res2 ?= resCompare2)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("pow with two columns") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric: TypedEncoder : Encoder](
      values: List[X2[A, A]]
    ): Prop = {
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.pow($"b", $"a"))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res = typedDS
        .select(pow(typedDS('b), typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("pmod") {
    val spark = session
    import spark.implicits._
    import NonNegativeArbitraryNumericValues._

    def prop[A: CatalystNumeric: TypedEncoder : Encoder](
      values: List[X2[A, A]]
    ): Prop = {
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.pmod($"b", $"a"))
        .map(_.getAs[A](0))
        .collect().toList

      val res = typedDS
        .select(pmod(typedDS('b), typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("unbase64") {
    val spark = session
    import spark.implicits._

    def prop(values: List[X1[String]])(implicit encX1: Encoder[X1[String]]) = {
      val valuesBase64 = values.map(base64X1String)
      val cDS = session.createDataset(valuesBase64)
      val resCompare = cDS
        .select(sparkFunctions.unbase64(cDS("a")))
        .map(_.getAs[Array[Byte]](0))
        .collect().toList

      val typedDS = TypedDataset.create(valuesBase64)
      val res = typedDS
        .select(unbase64(typedDS('a)))
        .collect()
        .run()
        .toList

      // Arrays compare by reference, so both sides are converted to lists.
      res.map(_.toList) ?= resCompare.map(_.toList)
    }

    check(forAll(prop _))
  }

  test("bin"){
    val spark = session
    import spark.implicits._

    def prop(values:List[X1[Long]])(implicit encX1:Encoder[X1[Long]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.bin(cDS("a")))
        .map(_.getAs[String](0))
        .collect().toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(bin(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop _))
  }

  test("bitwiseNOT"){
    val spark = session
    import spark.implicits._

    @nowarn // suppress sparkFunctions.bitwiseNOT call which is used to maintain Spark 3.1.x backwards compat
    def prop[A: CatalystBitwise : TypedEncoder : Encoder]
    (values:List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.bitwiseNOT(cDS("a")))
        .map(_.getAs[A](0))
        .collect().toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(bitwiseNOT(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Int] _))
  }

  test("inputFileName") {
    // Rows written to and read back from two parquet files must report a file
    // name containing their tag; rows created in memory must report "".
    val spark = session
    import spark.implicits._

    def prop[A : TypedEncoder](
      toFile1: List[X1[A]],
      toFile2: List[X1[A]],
      inMem: List[X1[A]]
    )(implicit x2Gen: Encoder[X2[A, String]], x3Gen: Encoder[X3[A, String, String]]) = {

      val file1Path = testTempFiles + "/file1"
      val file2Path = testTempFiles + "/file2"

      // Column `b` tags every row with the origin it is expected to report.
      val toFile1WithName = toFile1.map(x => X2(x.a, "file1"))
      val toFile2WithName = toFile2.map(x => X2(x.a, "file2"))
      val inMemWithName = inMem.map(x => X2(x.a, ""))

      toFile1WithName.toDS().write.mode(SaveMode.Overwrite).parquet(file1Path)
      toFile2WithName.toDS().write.mode(SaveMode.Overwrite).parquet(file2Path)

      val readBackIn1 = spark.read.parquet(file1Path).as[X2[A, String]]
      val readBackIn2 = spark.read.parquet(file2Path).as[X2[A, String]]

      val ds1 = TypedDataset.create(readBackIn1)
      val ds2 = TypedDataset.create(readBackIn2)
      val ds3 = TypedDataset.create(inMemWithName)

      val unioned = ds1.union(ds2).union(ds3)

      val withFileName = unioned.withColumn[X3[A, String, String]](inputFileName[X2[A, String]]())
        .collect()
        .run()
        .toVector

      // Strict `map` instead of `mapValues`, which is deprecated (and lazy) on
      // Scala 2.13; this form behaves the same on 2.12 and 2.13.
      val grouped = withFileName.groupBy(_.b).map { case (tag, rows) =>
        tag -> rows.map(_.c).toSet
      }

      grouped.foldLeft(passed) { (p, g) =>
        p && secure { g._1 match {
          case "" => g._2.head == "" //Empty string if didn't come from file
          case f => g._2.forall(_.contains(f))
        }}}
    }

    check(forAll(prop[String] _))
  }

  test("monotonic id") {
    // Generated ids must be unique and already in ascending order.
    val spark = session
    import spark.implicits._

    def prop[A : TypedEncoder](xs: List[X1[A]])(implicit x2en: Encoder[X2[A, Long]]) = {
      val ds = TypedDataset.create(xs)

      val result = ds.withColumn[X2[A, Long]](monotonicallyIncreasingId())
        .collect()
        .run()
        .toVector

      val ids = result.map(_.b)
      (ids.toSet.size ?= ids.length) &&
        (ids.sorted ?= ids)
    }

    check(forAll(prop[String] _))
  }

  test("when") {
    // Typed `when(...).when(...).otherwise(...)` vs. the untyped Spark chain.
    val spark = session
    import spark.implicits._

    def prop[A : TypedEncoder : Encoder]
    (condition1: Boolean, condition2: Boolean, value1: A, value2: A, otherwise: A) = {
      val ds = TypedDataset.create(X5(condition1, condition2, value1, value2, otherwise) :: Nil)

      val untypedWhen = ds.toDF()
        .select(
          sparkFunctions.when(sparkFunctions.col("a"), sparkFunctions.col("c"))
            .when(sparkFunctions.col("b"), sparkFunctions.col("d"))
            .otherwise(sparkFunctions.col("e"))
        )
        .as[A]
        .collect()
        .toList

      val typedWhen = ds
        .select(
          when(ds('a), ds('c))
            .when(ds('b), ds('d))
            .otherwise(ds('e))
        )
        .collect()
        .run()
        .toList

      typedWhen ?= untypedWhen
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Option[Int]] _))
  }

  test("ascii") {
    val spark = session
    import spark.implicits._

    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.ascii($"a"))
        .map(_.getAs[Int](0))
        .collect()
        .toVector

      val typed = ds
        .select(ascii(ds('a)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("concat") {
    // Typed `concat` vs. Spark and vs. plain Scala string concatenation.
    val spark = session
    import spark.implicits._

    val pairs = for {
      y <- Gen.alphaStr
      x <- Gen.nonEmptyListOf(X2(y, y))
    } yield x

    check(forAll(pairs) { values: List[X2[String, String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.concat($"a", $"b"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(concat(ds('a), ds('b)))
        .collect()
        .run()
        .toVector

      (typed ?= sparkResult).&&(typed ?= values.map(x => s"${x.a}${x.b}").toVector)
    })
  }

  test("concat for TypedAggregate") {
    val spark = session
    import frameless.functions.aggregate._
    import spark.implicits._

    val pairs = for {
      y <- Gen.alphaStr
      x <- Gen.nonEmptyListOf(X2(y, y))
    } yield x

    check(forAll(pairs) { values: List[X2[String, String]] =>
      val ds = TypedDataset.create(values)
      val td = ds.agg(concat(first(ds('a)),first(ds('b)))).collect().run().toVector
      // Named `sparkResult` so it does not shadow the session `spark` above.
      val sparkResult = ds.dataset.select(sparkFunctions.concat(
        sparkFunctions.first($"a").as[String],
        sparkFunctions.first($"b").as[String])).as[String].collect().toVector
      td ?= sparkResult
    })
  }

  test("concat_ws") {
    val spark = session
    import spark.implicits._

    val pairs = for {
      y <- Gen.alphaStr
      x <- Gen.nonEmptyListOf(X2(y, y))
    } yield x

    check(forAll(pairs) { values: List[X2[String, String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.concat_ws(",", $"a", $"b"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(concatWs(",", ds('a), ds('b)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("concat_ws for TypedAggregate") {
    val spark = session
    import frameless.functions.aggregate._
    import spark.implicits._

    val pairs = for {
      y <- Gen.alphaStr
      x <- Gen.listOfN(10, X2(y, y))
    } yield x

    check(forAll(pairs) { values: List[X2[String, String]] =>
      val ds = TypedDataset.create(values)
      val td = ds.agg(concatWs(",",first(ds('a)),first(ds('b)), last(ds('b)))).collect().run().toVector
      // Named `sparkResult` so it does not shadow the session `spark` above.
      val sparkResult = ds.dataset.select(sparkFunctions.concat_ws(",",
        sparkFunctions.first($"a").as[String],
        sparkFunctions.first($"b").as[String],
        sparkFunctions.last($"b").as[String])).as[String].collect().toVector
      td ?= sparkResult
    })
  }

  test("instr") {
    val spark = session
    import spark.implicits._

    check(forAll(Gen.nonEmptyListOf(Gen.alphaStr)) { values: List[String] =>
      // Every row ends with `values.head`, the substring searched for below.
      val ds = TypedDataset.create(values.map(x => X1(x + values.head)))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.instr($"a", values.head))
        .map(_.getAs[Int](0))
        .collect()
        .toVector

      val typed = ds
        .select(instr(ds('a), values.head))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("length") {
    val spark = session
    import spark.implicits._

    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.length($"a"))
        .map(_.getAs[Int](0))
        .collect()
        .toVector

      val typed = ds
        .select(length(ds[String]('a)))
        .collect()
        .run()
        .toVector

      (typed ?= sparkResult).&&(values.map(_.a.length).toVector ?= typed)
    })
  }

  test("levenshtein") {
    // Column form and aggregate form of `levenshtein`, each vs. Spark.
    val spark = session
    import spark.implicits._

    check(forAll { (na: X1[String], values: List[X1[String]]) =>
      val ds = TypedDataset.create(na +: values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.levenshtein($"a", sparkFunctions.concat($"a",sparkFunctions.lit("Hello"))))
        .map(_.getAs[Int](0))
        .collect()
        .toVector

      val typed = ds
        .select(levenshtein(ds('a), concat(ds('a),lit("Hello"))))
        .collect()
        .run()
        .toVector

      val cDS = ds.dataset
      val aggrTyped = ds.agg(
        levenshtein(frameless.functions.aggregate.first(ds('a)), litAggr("Hello"))
      ).firstOption().run().get

      val aggrSpark = cDS.select(
        sparkFunctions.levenshtein(sparkFunctions.first("a"), sparkFunctions.lit("Hello")).as[Int]
      ).first()

      (typed ?= sparkResult).&&(aggrTyped ?= aggrSpark)
    })
  }

  test("regexp_replace") {
    val spark = session
    import spark.implicits._

    check(forAll { (values: List[X1[String]], n: Int) =>
      // Digits are injected around each value so the pattern has matches.
      val ds = TypedDataset.create(values.map(x => X1(s"$n${x.a}-$n$n")))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.regexp_replace($"a", "\\d+", "n"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(regexpReplace(ds[String]('a), "\\d+".r, "n"))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("reverse") {
    val spark = session
    import spark.implicits._

    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.reverse($"a"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(reverse(ds[String]('a)))
        .collect()
        .run()
        .toVector

      (typed ?= sparkResult).&&(values.map(_.a.reverse).toVector ?= typed)
    })
  }

  test("rpad") {
    val spark = session
    import spark.implicits._

    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.rpad($"a", 5, "hello"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(rpad(ds[String]('a), 5, "hello"))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("lpad") {
    val spark = session
    import spark.implicits._

    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.lpad($"a", 5, "hello"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(lpad(ds[String]('a), 5, "hello"))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("rtrim") {
    val spark = session
    import spark.implicits._

    check(forAll { values: List[X1[String]] =>
      // Values are padded with spaces on both sides before trimming.
      val ds = TypedDataset.create(values.map(x => X1(s" ${x.a} ")))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.rtrim($"a"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(rtrim(ds[String]('a)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("ltrim") {
    val spark = session
    import spark.implicits._

    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values.map(x => X1(s" ${x.a} ")))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.ltrim($"a"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(ltrim(ds[String]('a)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("substring") {
    val spark = session
    import spark.implicits._

    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.substring($"a", 5, 3))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(substring(ds[String]('a), 5, 3))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("trim") {
    val spark = session
    import spark.implicits._

    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values.map(x => X1(s" ${x.a} ")))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.trim($"a"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(trim(ds[String]('a)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("upper") {
    val spark = session
    import spark.implicits._

    check(forAll(Gen.listOf(Gen.alphaStr)) { values: List[String] =>
      val ds = TypedDataset.create(values.map(X1(_)))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.upper($"a"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(upper(ds[String]('a)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("lower") {
    val spark = session
    import spark.implicits._

    check(forAll(Gen.listOf(Gen.alphaStr)) { values: List[String] =>
      val ds = TypedDataset.create(values.map(X1(_)))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.lower($"a"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(lower(ds[String]('a)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("Empty vararg tests") {
    // `concat()` / `concatWs(sep)` with no column arguments are expected to
    // yield "" in both select and agg positions.
    def prop[A : TypedEncoder, B: TypedEncoder](data: Vector[X2[A, B]]) = {
      val ds = TypedDataset.create(data)
      // Locals are named so they do not shadow the `frameless` / `scala` packages.
      val selected = ds.select(ds('a), concat(), ds('b), concatWs(":")).collect().run().toVector
      val aggregated = ds.agg(concat(), concatWs("x"), litAggr(2)).collect().run().toVector
      val expectedSelected = data.map(x => (x.a, "", x.b, ""))
      val expectedAggregated = Vector(("", "", 2))
      (selected ?= expectedSelected).&&(aggregated ?= expectedAggregated)
    }

    check(forAll(prop[Long, Long] _))
    check(forAll(prop[Option[Boolean], Long] _))
  }

  /**
    * Shared property for date/time extraction functions applied to a String
    * column.
    *
    * Selects `typedCol` through frameless and `sparkFunc($"a")` through the
    * underlying Spark dataset (rows mapped with
    * `DateTimeStringBehaviourUtils.nullHandler`) and compares the results.
    *
    * @param typedDS   dataset whose column `a` holds the input strings
    * @param typedCol  the frameless column expression under test
    * @param sparkFunc the Spark function used as reference
    */
  def dateTimeStringProp(typedDS: TypedDataset[X1[String]])
    (typedCol: TypedColumn[X1[String], Option[Int]], sparkFunc: Column => Column): Prop = {
    val spark = session
    import spark.implicits._

    val sparkResult = typedDS.dataset
      .select(sparkFunc($"a"))
      .map(DateTimeStringBehaviourUtils.nullHandler)
      .collect()
      .toList

    val typed = typedDS
      .select(typedCol)
      .collect()
      .run()
      .toList

    typed ?= sparkResult
  }

  // Each date/time test below runs twice: once with strings produced by
  // `dateTimeStringGen`, once with arbitrary strings.

  test("year") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(year(ds[String]('a)), sparkFunctions.year)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("quarter") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(quarter(ds[String]('a)), sparkFunctions.quarter)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("month") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(month(ds[String]('a)), sparkFunctions.month)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("dayofweek") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(dayofweek(ds[String]('a)), sparkFunctions.dayofweek)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("dayofmonth") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(dayofmonth(ds[String]('a)), sparkFunctions.dayofmonth)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("dayofyear") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(dayofyear(ds[String]('a)), sparkFunctions.dayofyear)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("hour") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(hour(ds[String]('a)), sparkFunctions.hour)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("minute") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(minute(ds[String]('a)), sparkFunctions.minute)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("second") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(second(ds[String]('a)), sparkFunctions.second)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("weekofyear") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(weekofyear(ds[String]('a)), sparkFunctions.weekofyear)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/functions/UdfTests.scala
================================================
package frameless
package functions

import org.scalacheck.Prop
import org.scalacheck.Prop._

class UdfTests extends TypedDatasetSuite {

  test("one argument udf") {
    def prop[A: TypedEncoder, B: TypedEncoder](data: Vector[X1[A]], f1: A => B): Prop = {
      val dataset: TypedDataset[X1[A]] = TypedDataset.create(data)
      val u1 = udf[X1[A], A, B](f1)
      val u2 = dataset.makeUDF(f1)
      val A = dataset.col[A]('a)

      // filter forces whole codegen
      val codegen = dataset.deserialized.filter((_:X1[A]) => true).select(u1(A)).collect().run().toVector

      // otherwise it uses local relation
      val local = dataset.select(u2(A)).collect().run().toVector

      val d = data.map(x => f1(x.a))

      (codegen ?= d) && (local ?= d)
    }

    check(forAll(prop[Int, Int] _))
    check(forAll(prop[String, String] _))
    check(forAll(prop[Option[Int], Option[Int]] _))
    check(forAll(prop[X1[Int], X1[Int]] _))
    check(forAll(prop[X1[Option[Int]], X1[Option[Int]]] _))

    // TODO doesn't work for the same reason as `collect`
    // check(forAll(prop[X1[Option[X1[Int]]], X1[Option[X1[Option[Int]]]]] _))

    check(forAll(prop[Option[Vector[String]], Option[Vector[String]]] _))

    def prop2[A: TypedEncoder, B: TypedEncoder](f: A => B)(a: A): Prop = prop(Vector(X1(a)), f)

    check(forAll(prop2[Int, Option[Int]](x => if (x % 2 == 0) Some(x) else None) _))
    check(forAll(prop2[Option[Int], Int](x => x getOrElse 0) _))
  }

  test("multiple one argument udf") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder]
      (data: Vector[X3[A, B, C]], f1: A => A, f2: B => B, f3: C => C): Prop = {
      val dataset = TypedDataset.create(data)
      val u11 = udf[X3[A, B, C], A, A](f1)
      val u21 = udf[X3[A, B, C], B, B](f2)
      val u31 = udf[X3[A, B, C], C, C](f3)
      val u12 = dataset.makeUDF(f1)
      val u22 = dataset.makeUDF(f2)
      val u32 = dataset.makeUDF(f3)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val dataset21 = dataset.select(u11(A), u21(B), u31(C)).collect().run().toVector
      val dataset22 = dataset.select(u12(A), u22(B), u32(C)).collect().run().toVector
      val d = data.map(x => (f1(x.a), f2(x.b), f3(x.c)))

      (dataset21 ?= d) && (dataset22 ?= d)
    }

    check(forAll(prop[Int, Int, Int] _))
    check(forAll(prop[String, Int, Int] _))
    check(forAll(prop[X3[Int, String, Boolean], Int, Int] _))
check(forAll(prop[X3U[Int, String, Boolean], Int, Int] _)) } test("two argument udf") { def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder] (data: Vector[X3[A, B, C]], f1: (A, B) => C): Prop = { val dataset = TypedDataset.create(data) val u1 = udf[X3[A, B, C], A, B, C](f1) val u2 = dataset.makeUDF(f1) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val dataset21 = dataset.select(u1(A, B)).collect().run().toVector val dataset22 = dataset.select(u2(A, B)).collect().run().toVector val d = data.map(x => f1(x.a, x.b)) (dataset21 ?= d) && (dataset22 ?= d) } check(forAll(prop[Int, Int, Int] _)) check(forAll(prop[String, Int, Int] _)) } test("multiple two argument udf") { def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder] (data: Vector[X3[A, B, C]], f1: (A, B) => C, f2: (B, C) => A): Prop = { val dataset = TypedDataset.create(data) val u11 = udf[X3[A, B, C], A, B, C](f1) val u12 = dataset.makeUDF(f1) val u21 = udf[X3[A, B, C], B, C, A](f2) val u22 = dataset.makeUDF(f2) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = dataset.col[C]('c) val dataset21 = dataset.select(u11(A, B), u21(B, C)).collect().run().toVector val dataset22 = dataset.select(u12(A, B), u22(B, C)).collect().run().toVector val d = data.map(x => (f1(x.a, x.b), f2(x.b, x.c))) (dataset21 ?= d) && (dataset22 ?= d) } check(forAll(prop[Int, Int, Int] _)) check(forAll(prop[String, Int, Int] _)) } test("three argument udf") { def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder] (data: Vector[X3[A, B, C]], f: (A, B, C) => C): Prop = { val dataset = TypedDataset.create(data) val u1 = udf[X3[A, B, C], A, B, C, C](f) val u2 = dataset.makeUDF(f) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = dataset.col[C]('c) val dataset21 = dataset.select(u1(A, B, C)).collect().run().toVector val dataset22 = dataset.select(u2(A, B, C)).collect().run().toVector val d = data.map(x => f(x.a, x.b, x.c)) (dataset21 ?= d) && (dataset22 ?= d) } check(forAll(prop[Int, Int, Int] _)) 
check(forAll(prop[String, Int, Int] _)) } test("four argument udf") { def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder, D: TypedEncoder] (data: Vector[X4[A, B, C, D]], f: (A, B, C, D) => C): Prop = { val dataset = TypedDataset.create(data) val u1 = udf[X4[A, B, C, D], A, B, C, D, C](f) val u2 = dataset.makeUDF(f) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = dataset.col[C]('c) val D = dataset.col[D]('d) val dataset21 = dataset.select(u1(A, B, C, D)).collect().run().toVector val dataset22 = dataset.select(u2(A, B, C, D)).collect().run().toVector val d = data.map(x => f(x.a, x.b, x.c, x.d)) (dataset21 ?= d) && (dataset22 ?= d) } check(forAll(prop[Int, Int, Int, Int] _)) check(forAll(prop[String, Int, Int, String] _)) check(forAll(prop[String, String, String, String] _)) check(forAll(prop[String, Long, String, String] _)) check(forAll(prop[String, Boolean, Boolean, String] _)) } test("five argument udf") { def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder, D: TypedEncoder, E: TypedEncoder] (data: Vector[X5[A, B, C, D, E]], f: (A, B, C, D, E) => C): Prop = { val dataset = TypedDataset.create(data) val u1 = udf[X5[A, B, C, D, E], A, B, C, D, E, C](f) val u2 = dataset.makeUDF(f) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = dataset.col[C]('c) val D = dataset.col[D]('d) val E = dataset.col[E]('e) val dataset21 = dataset.select(u1(A, B, C, D, E)).collect().run().toVector val dataset22 = dataset.select(u2(A, B, C, D, E)).collect().run().toVector val d = data.map(x => f(x.a, x.b, x.c, x.d, x.e)) (dataset21 ?= d) && (dataset22 ?= d) } check(forAll(prop[Int, Int, Int, Int, Int] _)) } } ================================================ FILE: dataset/src/test/scala/frameless/functions/UnaryFunctionsTest.scala ================================================ package frameless package functions import org.scalacheck.{ Arbitrary, Prop } import org.scalacheck.Prop._ import scala.collection.SeqLike import scala.math.Ordering import 
scala.reflect.ClassTag

/**
 * Property tests for the unary collection functions `size`, `sortAscending` and
 * `sortDescending`. Each property builds a `TypedDataset` from generated data, applies the
 * frameless function through `select`, and compares the collected result with the same
 * operation done on the plain Scala collections.
 */
class UnaryFunctionsTest extends TypedDatasetSuite {

  // `size` on Vector / List columns must agree with the Scala collection's own `.size`.
  test("size tests") {
    def prop[F[X] <: Traversable[X] : CatalystSizableCollection, A](xs: List[X1[F[A]]])(implicit arb: Arbitrary[F[A]], enc: TypedEncoder[F[A]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(size(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.size).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Vector, Long] _))
    check(forAll(prop[List, Long] _))
    check(forAll(prop[Vector, Char] _))
    check(forAll(prop[List, Char] _))
    check(forAll(prop[Vector, X2[Int, Option[Long]]] _))
    check(forAll(prop[List, X2[Int, Option[Long]]] _))
  }

  // Same as above, for Array-typed columns.
  test("size on array test") {
    def prop[A: TypedEncoder: ClassTag](xs: List[X1[Array[A]]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(size(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.size).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[String] _))
    check(forAll(prop[X2[Int, Option[Long]]] _))
  }

  // Same as above, for Map-typed columns (key and value share one type).
  test("size on Map") {
    def prop[A](xs: List[X1[Map[A, A]]])(implicit arb: Arbitrary[Map[A, A]], enc: TypedEncoder[Map[A, A]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(size(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.size).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Char] _))
  }

  // `sortAscending` on Vector / List columns must agree with Scala's `.sorted`.
  test("sort in ascending order") {
    def prop[F[X] <: SeqLike[X, F[X]] : CatalystSortableCollection, A: Ordering](xs: List[X1[F[A]]])(implicit enc: TypedEncoder[F[A]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(sortAscending(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.sorted).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Vector, Long] _))
    check(forAll(prop[Vector, Int] _))
    check(forAll(prop[Vector, Char] _))
    check(forAll(prop[Vector, String] _))
    check(forAll(prop[List, Long] _))
    check(forAll(prop[List, Int] _))
    check(forAll(prop[List, Char] _))
    check(forAll(prop[List, String] _))
  }

  // `sortDescending` on Vector / List columns must agree with Scala's `.sorted.reverse`.
  test("sort in descending order") {
    def prop[F[X] <: SeqLike[X, F[X]] : CatalystSortableCollection, A: Ordering](xs: List[X1[F[A]]])(implicit enc: TypedEncoder[F[A]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(sortDescending(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.sorted.reverse).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Vector, Long] _))
    check(forAll(prop[Vector, Int] _))
    check(forAll(prop[Vector, Char] _))
    check(forAll(prop[Vector, String] _))
    check(forAll(prop[List, Long] _))
    check(forAll(prop[List, Int] _))
    check(forAll(prop[List, Char] _))
    check(forAll(prop[List, String] _))
  }

  // Array variant of the ascending sort test. Arrays are compared with `sameElements`
  // because `==` on arrays is reference equality.
  test("sort on array test: ascending order") {
    def prop[A: TypedEncoder : Ordering : ClassTag](xs: List[X1[Array[A]]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(sortAscending(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.sorted).toVector

      // `zip` silently drops the unmatched tail of the longer side, so the row counts are
      // compared explicitly; otherwise a truncated (or empty) result would pass vacuously.
      (framelessResults.size ?= scalaResults.size) && Prop {
        framelessResults
          .zip(scalaResults)
          .forall {
            case (a, b) => a sameElements b
          }
      }
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }

  // Array variant of the descending sort test; see the ascending test for the comparison.
  test("sort on array test: descending order") {
    def prop[A: TypedEncoder : Ordering : ClassTag](xs: List[X1[Array[A]]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(sortDescending(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.sorted.reverse).toVector

      // Guard against `zip` truncation by checking the row counts first.
      (framelessResults.size ?= scalaResults.size) && Prop {
        framelessResults
          .zip(scalaResults)
          .forall {
            case (a, b) => a sameElements b
          }
      }
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/ops/ColumnTypesTest.scala
================================================
package frameless
package ops

import org.scalacheck.Prop
import org.scalacheck.Prop.forAll
import shapeless.HNil
import shapeless.::

/** Compile-time check that a `ColumnTypes.Aux` instance can be summoned for an HList of typed columns. */
class ColumnTypesTest extends TypedDatasetSuite {
  test("test summoning") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder, D: TypedEncoder](data: Vector[X4[A, B, C, D]]): Prop = {
      val d: TypedDataset[X4[A, B, C, D]] = TypedDataset.create(data)
      val hlist = d('a) :: d('b) :: d('c) :: d('d) :: HNil

      type TC[N] = TypedColumn[X4[A,B,C,D], N]

      type IN = TC[A] :: TC[B] :: TC[C] :: TC[D] :: HNil
      type OUT = A :: B :: C :: D :: HNil

      implicitly[ColumnTypes.Aux[X4[A,B,C,D], IN, OUT]]
      Prop.passed // successful compilation implies test correctness
    }

    check(forAll(prop[Int, String, X1[String], Boolean] _))
    check(forAll(prop[Vector[Int], Vector[Vector[String]], X1[String], Option[String]] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/ops/CubeTests.scala
================================================
package frameless
package ops

import frameless.functions.aggregate._
import org.scalacheck.Prop
import org.scalacheck.Prop._

/**
 * Property tests for `cube` / `cubeMany` on `TypedDataset`.
 *
 * The aggregation tests run the same cube through frameless and through the untyped Spark
 * API on the underlying dataset (`dataset.dataset`), then compare the sorted results.
 * Grouping values read from Spark rows are wrapped in `Option(...)` so that a null grouping
 * value becomes `None` before the comparison. The `mapGroups` / `flatMapGroups` tests compare
 * against a plain Scala `groupBy` over the generated input instead.
 */
class CubeTests extends TypedDatasetSuite {

  // Row count per cube group on a single grouping column.
  test("cube('a).agg(count())") {
    def prop[A: TypedEncoder : Ordering, Out: TypedEncoder : Numeric]
    (data: List[X1[A]])(implicit summable: CatalystSummable[A, Out]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val received = dataset.cube(A).agg(count()).collect().run().toVector.sortBy(_._2)
      val expected = dataset.dataset.cube("a").count().collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[Long](1))).sortBy(_._2)

      received ?= expected
    }

    check(forAll(prop[Int, Long] _))
  }

  // Row count per cube group on two grouping columns.
  test("cube('a, 'b).agg(count())") {
    def prop[A: TypedEncoder : Ordering, B: TypedEncoder, Out: TypedEncoder : Numeric]
    (data: List[X2[A, B]])(implicit summable: CatalystSummable[B, Out]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val received = dataset.cube(A, B).agg(count()).collect().run().toVector.sortBy(_._3)
      val expected = dataset.dataset.cube("a", "b").count().collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[Long](2))).sortBy(_._3)

      received ?= expected
    }

    check(forAll(prop[Int, Long, Long] _))
  }

  // Sum of 'b per cube group on 'a.
  test("cube('a).agg(sum('b))") {
    def prop[A: TypedEncoder : Ordering, B: TypedEncoder, Out: TypedEncoder : Numeric]
    (data: List[X2[A, B]])(implicit summable: CatalystSummable[B, Out]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val received = dataset.cube(A).agg(sum(B)).collect().run().toVector.sortBy(_._2)
      val expected = dataset.dataset.cube("a").sum("b").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[Out](1))).sortBy(_._2)

      received ?= expected
    }

    check(forAll(prop[Int, Long, Long] _))
  }

  // Deserialized mapGroups: the expected value is a plain Scala groupBy over the input.
  test("cube('a).mapGroups('a, sum('b))") {
    def prop[A: TypedEncoder : Ordering, B: TypedEncoder : Numeric]
    (data: List[X2[A, B]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val received = dataset.cube(A)
        .deserialized.mapGroups { case (a, xs) => (a, xs.map(_.b).sum) }
        .collect().run().toVector.sortBy(_._1)
      val expected = data.groupBy(_.a).mapValues(_.map(_.b).sum).toVector.sortBy(_._1)

      received ?= expected
    }

    check(forAll(prop[Int, Long] _))
  }

  // Exercises agg with two to five aggregate columns, alternating sum('b) and sum('c).
  test("cube('a).agg(sum('b), sum('c)) to cube('a).agg(sum('b), sum('c), sum('b), sum('c), sum('b))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder,
    C: TypedEncoder,
    OutB: TypedEncoder : Numeric,
    OutC: TypedEncoder : Numeric
    ](data: List[X3[A, B, C]])(
      implicit
      summableB: CatalystSummable[B, OutB],
      summableC: CatalystSummable[C, OutC]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val framelessSumBC = dataset
        .cube(A)
        .agg(sum(B), sum(C))
        .collect().run().toVector.sortBy(_._1)

      val sparkSumBC = dataset.dataset.cube("a").sum("b", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2)))
        .sortBy(_._1)

      val framelessSumBCB = dataset
        .cube(A)
        .agg(sum(B), sum(C), sum(B))
        .collect().run().toVector.sortBy(_._1)

      val sparkSumBCB = dataset.dataset.cube("a").sum("b", "c", "b").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2), row.getAs[OutB](3)))
        .sortBy(_._1)

      val framelessSumBCBC = dataset
        .cube(A)
        .agg(sum(B), sum(C), sum(B), sum(C))
        .collect().run().toVector.sortBy(_._1)

      val sparkSumBCBC = dataset.dataset.cube("a").sum("b", "c", "b", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2), row.getAs[OutB](3), row.getAs[OutC](4)))
        .sortBy(_._1)

      val framelessSumBCBCB = dataset
        .cube(A)
        .agg(sum(B), sum(C), sum(B), sum(C), sum(B))
        .collect().run().toVector.sortBy(_._1)

      val sparkSumBCBCB = dataset.dataset.cube("a").sum("b", "c", "b", "c", "b").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2), row.getAs[OutB](3), row.getAs[OutC](4), row.getAs[OutB](5)))
        .sortBy(_._1)

      (framelessSumBC ?= sparkSumBC)
        .&&(framelessSumBCB ?= sparkSumBCB)
        .&&(framelessSumBCBC ?= sparkSumBCBC)
        .&&(framelessSumBCBCB ?= sparkSumBCBCB)
    }

    check(forAll(prop[String, Long, Double, Long, Double] _))
  }

  // Two grouping columns, two summed columns.
  test("cube('a, 'b).agg(sum('c), sum('d))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder,
    D: TypedEncoder,
    OutC: TypedEncoder : Numeric,
    OutD: TypedEncoder : Numeric
    ](data: List[X4[A, B, C, D]])(
      implicit
      summableC: CatalystSummable[C, OutC],
      summableD: CatalystSummable[D, OutD]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)
      val D = dataset.col[D]('d)

      val framelessSumByAB = dataset
        .cube(A, B)
        .agg(sum(C), sum(D))
        .collect().run().toVector.sortBy(x => (x._1, x._2))

      val sparkSumByAB = dataset.dataset
        .cube("a", "b").sum("c", "d").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutD](3)))
        .sortBy(x => (x._1, x._2))

      framelessSumByAB ?= sparkSumByAB
    }

    check(forAll(prop[Byte, Int, Long, Double, Long, Double] _))
  }

  // Exercises agg with one to five copies of sum('c) on a two-column cube.
  test("cube('a, 'b).agg(sum('c)) to cube('a, 'b).agg(sum('c),sum('c),sum('c),sum('c),sum('c))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder,
    OutC: TypedEncoder: Numeric
    ](data: List[X3[A, B, C]])(implicit summableC: CatalystSummable[C, OutC]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val framelessSumC = dataset
        .cube(A, B)
        .agg(sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumC = dataset.dataset
        .cube("a", "b").sum("c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2)))
        .sortBy(_._2)

      val framelessSumCC = dataset
        .cube(A, B)
        .agg(sum(C), sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumCC = dataset.dataset
        .cube("a", "b").sum("c", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3)))
        .sortBy(_._2)

      val framelessSumCCC = dataset
        .cube(A, B)
        .agg(sum(C), sum(C), sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumCCC = dataset.dataset
        .cube("a", "b").sum("c", "c", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3), row.getAs[OutC](4)))
        .sortBy(_._2)

      val framelessSumCCCC = dataset
        .cube(A, B)
        .agg(sum(C), sum(C), sum(C), sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumCCCC = dataset.dataset
        .cube("a", "b").sum("c", "c", "c", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3), row.getAs[OutC](4), row.getAs[OutC](5)))
        .sortBy(_._2)

      val framelessSumCCCCC = dataset
        .cube(A, B)
        .agg(sum(C), sum(C), sum(C), sum(C), sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumCCCCC = dataset.dataset
        .cube("a", "b").sum("c", "c", "c", "c", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3), row.getAs[OutC](4), row.getAs[OutC](5), row.getAs[OutC](6)))
        .sortBy(_._2)

      (framelessSumC ?= sparkSumC) &&
        (framelessSumCC ?= sparkSumCC) &&
        (framelessSumCCC ?= sparkSumCCC) &&
        (framelessSumCCCC ?= sparkSumCCCC) &&
        (framelessSumCCCCC ?= sparkSumCCCCC)
    }

    check(forAll(prop[String, Long, Double, Double] _))
  }

  // Deserialized mapGroups on a two-column key, compared with a Scala groupBy on (a, b).
  test("cube('a, 'b).mapGroups('a, 'b, sum('c))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder : Numeric
    ](data: List[X3[A, B, C]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val framelessSumByAB = dataset
        .cube(A, B)
        .deserialized.mapGroups { case ((a, b), xs) => (a, b, xs.map(_.c).sum) }
        .collect().run().toVector.sortBy(x => (x._1, x._2))

      val sumByAB = data.groupBy(x => (x.a, x.b))
        .mapValues { xs => xs.map(_.c).sum }
        .toVector.map { case ((a, b), c) => (a, b, c) }.sortBy(x => (x._1, x._2))

      framelessSumByAB ?= sumByAB
    }

    check(forAll(prop[Byte, Int, Long] _))
  }

  // mapGroups that keeps every row of a group; groups are sorted so that order is irrelevant.
  test("cube('a).mapGroups(('a, toVector(('a, 'b))") {
    def prop[
    A: TypedEncoder: Ordering,
    B: TypedEncoder: Ordering
    ](data: Vector[X2[A, B]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val datasetGrouped = dataset
        .cube(A)
        .deserialized.mapGroups((a, xs) => (a, xs.toVector.sorted))
        .collect().run().toMap

      val dataGrouped = data.groupBy(_.a).map { case (k, v) => k -> v.sorted }

      datasetGrouped ?= dataGrouped
    }

    check(forAll(prop[Short, Option[Short]] _))
    check(forAll(prop[Option[Short], Short] _))
    check(forAll(prop[X1[Option[Short]], Short] _))
  }

  // flatMapGroups emitting one (key, row) pair per input row.
  test("cube('a).flatMapGroups(('a, toVector(('a, 'b))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering
    ](data: Vector[X2[A, B]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val datasetGrouped = dataset
        .cube(A)
        .deserialized.flatMapGroups((a, xs) => xs.map(x => (a, x)))
        .collect().run()
        .sorted

      val dataGrouped = data
        .groupBy(_.a).toSeq
        .flatMap { case (a, xs) => xs.map(x => (a, x)) }
        .sorted

      datasetGrouped ?= dataGrouped
    }

    check(forAll(prop[Short, Option[Short]] _))
    check(forAll(prop[Option[Short], Short] _))
    check(forAll(prop[X1[Option[Short]], Short] _))
  }

  // Same as the previous test with a two-column grouping key.
  test("cube('a, 'b).flatMapGroups((('a,'b) toVector((('a,'b), 'c))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder : Ordering
    ](data: Vector[X3[A, B, C]]): Prop = {
      val dataset = TypedDataset.create(data)
      val cA = dataset.col[A]('a)
      val cB = dataset.col[B]('b)

      val datasetGrouped = dataset
        .cube(cA, cB)
        .deserialized.flatMapGroups((a, xs) => xs.map(x => (a, x)))
        .collect().run()
        .sorted

      val dataGrouped = data
        .groupBy(t => (t.a, t.b)).toSeq
        .flatMap { case (a, xs) => xs.map(x => (a, x)) }
        .sorted

      datasetGrouped ?= dataGrouped
    }

    check(forAll(prop[Short, Option[Short], Long] _))
    check(forAll(prop[Option[Short], Short, Int] _))
    check(forAll(prop[X1[Option[Short]], Short, Byte] _))
  }

  // `cubeMany` entry point; the aggregate used here is a row count, not a sum.
  test("cubeMany('a).agg(count())") {
    def prop[A: TypedEncoder : Ordering, Out: TypedEncoder : Numeric]
    (data: List[X1[A]])(implicit summable: CatalystSummable[A, Out]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val received = dataset.cubeMany(A).agg(count[X1[A]]()).collect().run().toVector.sortBy(_._2)
      val expected = dataset.dataset.cube("a").count().collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[Long](1))).sortBy(_._2)

      received ?= expected
    }

    check(forAll(prop[Int, Long] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/ops/PivotTest.scala
================================================
package frameless
package ops

import frameless.functions.aggregate._
import org.apache.spark.sql.{functions => sparkFunctions}
import
org.scalacheck.Arbitrary.arbitrary import org.scalacheck.Prop._ import org.scalacheck.{Gen, Prop} class PivotTest extends TypedDatasetSuite { def withCustomGenX4: Gen[Vector[X4[String, String, Int, Boolean]]] = { val kvPairGen: Gen[X4[String, String, Int, Boolean]] = for { a <- Gen.oneOf(Seq("1", "2", "3", "4")) b <- Gen.oneOf(Seq("a", "b", "c")) c <- arbitrary[Int] d <- arbitrary[Boolean] } yield X4(a, b, c, d) Gen.listOfN(4, kvPairGen).map(_.toVector) } test("X4[Boolean, String, Int, Boolean] pivot on String") { def prop(data: Vector[X4[String, String, Int, Boolean]]): Prop = { val d = TypedDataset.create(data) val frameless = d.groupBy(d('a)). pivot(d('b)).on("a", "b", "c"). agg(sum(d('c)), first(d('d))).collect().run().toVector val spark = d.dataset.groupBy("a") .pivot("b", Seq("a", "b", "c")) .agg(sparkFunctions.sum("c"), sparkFunctions.first("d")).collect().toVector (frameless.map(_._1) ?= spark.map(x => x.getAs[String](0))).&&( frameless.map(_._2) ?= spark.map(x => Option(x.getAs[Long](1)))).&&( frameless.map(_._3) ?= spark.map(x => Option(x.getAs[Boolean](2)))).&&( frameless.map(_._4) ?= spark.map(x => Option(x.getAs[Long](3)))).&&( frameless.map(_._5) ?= spark.map(x => Option(x.getAs[Boolean](4)))).&&( frameless.map(_._6) ?= spark.map(x => Option(x.getAs[Long](5)))).&&( frameless.map(_._7) ?= spark.map(x => Option(x.getAs[Boolean](6)))) } check(forAll(withCustomGenX4)(prop)) } test("Pivot on Boolean") { val x: Seq[X3[String, Boolean, Boolean]] = Seq(X3("a", true, true), X3("a", true, true), X3("a", true, false)) val d = TypedDataset.create(x) d.groupByMany(d('a)). pivot(d('c)).on(true, false). agg(count[X3[String, Boolean, Boolean]]()). collect().run().toVector ?= Vector(("a", Some(2L), Some(1L))) // two true one false } test("Pivot with groupBy on two columns, pivot on Long") { val x: Seq[X3[String, String, Long]] = Seq(X3("a", "x", 1), X3("a", "x", 1), X3("a", "c", 20)) val d = TypedDataset.create(x) d.groupBy(d('a), d('b)). pivot(d('c)).on(1L, 20L). 
agg(count[X3[String, String, Long]]()). collect().run().toSet ?= Set(("a", "x", Some(2L), None), ("a", "c", None, Some(1L))) } test("Pivot with cube on two columns, pivot on Long") { val x: Seq[X3[String, String, Long]] = Seq(X3("a", "x", 1), X3("a", "x", 1), X3("a", "c", 20)) val d = TypedDataset.create(x) d.cube(d('a), d('b)) .pivot(d('c)).on(1L, 20L) .agg(count[X3[String, String, Long]]()) .collect().run().toSet ?= Set(("a", "x", Some(2L), None), ("a", "c", None, Some(1L))) } test("Pivot with cube on Boolean") { val x: Seq[X3[String, Boolean, Boolean]] = Seq(X3("a", true, true), X3("a", true, true), X3("a", true, false)) val d = TypedDataset.create(x) d.cube(d('a)). pivot(d('c)).on(true, false). agg(count[X3[String, Boolean, Boolean]]()). collect().run().toVector ?= Vector(("a", Some(2L), Some(1L))) } test("Pivot with rollup on two columns, pivot on Long") { val x: Seq[X3[String, String, Long]] = Seq(X3("a", "x", 1), X3("a", "x", 1), X3("a", "c", 20)) val d = TypedDataset.create(x) d.rollup(d('a), d('b)) .pivot(d('c)).on(1L, 20L) .agg(count[X3[String, String, Long]]()) .collect().run().toSet ?= Set(("a", "x", Some(2L), None), ("a", "c", None, Some(1L))) } test("Pivot with rollup on Boolean") { val x: Seq[X3[String, Boolean, Boolean]] = Seq(X3("a", true, true), X3("a", true, true), X3("a", true, false)) val d = TypedDataset.create(x) d.rollupMany(d('a)). pivot(d('c)).on(true, false). agg(count[X3[String, Boolean, Boolean]]()). 
collect().run().toVector ?= Vector(("a", Some(2L), Some(1L))) } } ================================================ FILE: dataset/src/test/scala/frameless/ops/RepeatTest.scala ================================================ package frameless package ops import shapeless.test.illTyped import shapeless.{::, HNil, Nat} class RepeatTest extends TypedDatasetSuite { test("summoning with implicitly") { implicitly[Repeat.Aux[Int::Boolean::HNil, Nat._1, Int::Boolean::HNil]] implicitly[Repeat.Aux[Int::Boolean::HNil, Nat._2, Int::Boolean::Int::Boolean::HNil]] implicitly[Repeat.Aux[Int::Boolean::HNil, Nat._3, Int::Boolean::Int::Boolean::Int::Boolean::HNil]] implicitly[Repeat.Aux[String::HNil, Nat._5, String::String::String::String::String::HNil]] } test("ill typed") { illTyped("""implicitly[Repeat.Aux[String::HNil, Nat._5, String::String::String::String::HNil]]""") } } ================================================ FILE: dataset/src/test/scala/frameless/ops/RollupTests.scala ================================================ package frameless package ops import frameless.functions.aggregate._ import org.scalacheck.Prop import org.scalacheck.Prop._ class RollupTests extends TypedDatasetSuite { test("rollup('a).agg(count())") { def prop[A: TypedEncoder : Ordering, Out: TypedEncoder : Numeric] (data: List[X1[A]])(implicit summable: CatalystSummable[A, Out]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val received = dataset.rollup(A).agg(count()).collect().run().toVector.sortBy(_._2) val expected = dataset.dataset.rollup("a").count().collect().toVector .map(row => (Option(row.getAs[A](0)), row.getAs[Long](1))).sortBy(_._2) received ?= expected } check(forAll(prop[Int, Long] _)) } test("rollup('a, 'b).agg(count())") { def prop[A: TypedEncoder : Ordering, B: TypedEncoder, Out: TypedEncoder : Numeric] (data: List[X2[A, B]])(implicit summable: CatalystSummable[B, Out]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = 
dataset.col[B]('b) val received = dataset.rollup(A, B).agg(count()).collect().run().toVector.sortBy(_._3) val expected = dataset.dataset.rollup("a", "b").count().collect().toVector .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[Long](2))).sortBy(_._3) received ?= expected } check(forAll(prop[Int, Long, Long] _)) } test("rollup('a).agg(sum('b)") { def prop[A: TypedEncoder : Ordering, B: TypedEncoder, Out: TypedEncoder : Numeric] (data: List[X2[A, B]])(implicit summable: CatalystSummable[B, Out]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val received = dataset.rollup(A).agg(sum(B)).collect().run().toVector.sortBy(_._2) val expected = dataset.dataset.rollup("a").sum("b").collect().toVector .map(row => (Option(row.getAs[A](0)), row.getAs[Out](1))).sortBy(_._2) received ?= expected } check(forAll(prop[Int, Long, Long] _)) } test("rollup('a).mapGroups('a, sum('b))") { def prop[A: TypedEncoder : Ordering, B: TypedEncoder : Numeric] (data: List[X2[A, B]]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val received = dataset.rollup(A) .deserialized.mapGroups { case (a, xs) => (a, xs.map(_.b).sum) } .collect().run().toVector.sortBy(_._1) val expected = data.groupBy(_.a).mapValues(_.map(_.b).sum).toVector.sortBy(_._1) received ?= expected } check(forAll(prop[Int, Long] _)) } test("rollup('a).agg(sum('b), sum('c)) to rollup('a).agg(sum('a), sum('b), sum('a), sum('b), sum('a))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder, C: TypedEncoder, OutB: TypedEncoder : Numeric, OutC: TypedEncoder : Numeric ](data: List[X3[A, B, C]])( implicit summableB: CatalystSummable[B, OutB], summableC: CatalystSummable[C, OutC] ): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = dataset.col[C]('c) val framelessSumBC = dataset .rollup(A) .agg(sum(B), sum(C)) .collect().run().toVector.sortBy(_._1) val sparkSumBC 
= dataset.dataset.rollup("a").sum("b", "c").collect().toVector .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2))) .sortBy(_._1) val framelessSumBCB = dataset .rollup(A) .agg(sum(B), sum(C), sum(B)) .collect().run().toVector.sortBy(_._1) val sparkSumBCB = dataset.dataset.rollup("a").sum("b", "c", "b").collect().toVector .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2), row.getAs[OutB](3))) .sortBy(_._1) val framelessSumBCBC = dataset .rollup(A) .agg(sum(B), sum(C), sum(B), sum(C)) .collect().run().toVector.sortBy(_._1) val sparkSumBCBC = dataset.dataset.rollup("a").sum("b", "c", "b", "c").collect().toVector .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2), row.getAs[OutB](3), row.getAs[OutC](4))) .sortBy(_._1) val framelessSumBCBCB = dataset .rollup(A) .agg(sum(B), sum(C), sum(B), sum(C), sum(B)) .collect().run().toVector.sortBy(_._1) val sparkSumBCBCB = dataset.dataset.rollup("a").sum("b", "c", "b", "c", "b").collect().toVector .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2), row.getAs[OutB](3), row.getAs[OutC](4), row.getAs[OutB](5))) .sortBy(_._1) (framelessSumBC ?= sparkSumBC) .&&(framelessSumBCB ?= sparkSumBCB) .&&(framelessSumBCBC ?= sparkSumBCBC) .&&(framelessSumBCBCB ?= sparkSumBCBCB) } check(forAll(prop[String, Long, Double, Long, Double] _)) } test("rollup('a, 'b).agg(sum('c), sum('d))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder : Ordering, C: TypedEncoder, D: TypedEncoder, OutC: TypedEncoder : Numeric, OutD: TypedEncoder : Numeric ](data: List[X4[A, B, C, D]])( implicit summableC: CatalystSummable[C, OutC], summableD: CatalystSummable[D, OutD] ): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = dataset.col[C]('c) val D = dataset.col[D]('d) val framelessSumByAB = dataset .rollup(A, B) .agg(sum(C), sum(D)) .collect().run().toVector.sortBy(_._2) val sparkSumByAB = 
dataset.dataset .rollup("a", "b").sum("c", "d").collect().toVector .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutD](3))) .sortBy(_._2) framelessSumByAB ?= sparkSumByAB } check(forAll(prop[Byte, Int, Long, Double, Long, Double] _)) } test("rollup('a, 'b).agg(sum('c)) to rollup('a, 'b).agg(sum('c),sum('c),sum('c),sum('c),sum('c))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder : Ordering, C: TypedEncoder, OutC: TypedEncoder: Numeric ](data: List[X3[A, B, C]])(implicit summableC: CatalystSummable[C, OutC]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val C = dataset.col[C]('c) val framelessSumC = dataset .rollup(A, B) .agg(sum(C)) .collect().run().toVector .sortBy(_._2) val sparkSumC = dataset.dataset .rollup("a", "b").sum("c").collect().toVector .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2))) .sortBy(_._2) val framelessSumCC = dataset .rollup(A, B) .agg(sum(C), sum(C)) .collect().run().toVector .sortBy(_._2) val sparkSumCC = dataset.dataset .rollup("a", "b").sum("c", "c").collect().toVector .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3))) .sortBy(_._2) val framelessSumCCC = dataset .rollup(A, B) .agg(sum(C), sum(C), sum(C)) .collect().run().toVector .sortBy(_._2) val sparkSumCCC = dataset.dataset .rollup("a", "b").sum("c", "c", "c").collect().toVector .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3), row.getAs[OutC](4))) .sortBy(_._2) val framelessSumCCCC = dataset .rollup(A, B) .agg(sum(C), sum(C), sum(C), sum(C)) .collect().run().toVector .sortBy(_._2) val sparkSumCCCC = dataset.dataset .rollup("a", "b").sum("c", "c", "c", "c").collect().toVector .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3), row.getAs[OutC](4), row.getAs[OutC](5))) .sortBy(_._2) val 
framelessSumCCCCC = dataset .rollup(A, B) .agg(sum(C), sum(C), sum(C), sum(C), sum(C)) .collect().run().toVector .sortBy(_._2) val sparkSumCCCCC = dataset.dataset .rollup("a", "b").sum("c", "c", "c", "c", "c").collect().toVector .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3), row.getAs[OutC](4), row.getAs[OutC](5), row.getAs[OutC](6))) .sortBy(_._2) (framelessSumC ?= sparkSumC) && (framelessSumCC ?= sparkSumCC) && (framelessSumCCC ?= sparkSumCCC) && (framelessSumCCCC ?= sparkSumCCCC) && (framelessSumCCCCC ?= sparkSumCCCCC) } check(forAll(prop[String, Long, Double, Double] _)) } test("rollup('a, 'b).mapGroups('a, 'b, sum('c))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder : Ordering, C: TypedEncoder : Numeric ](data: List[X3[A, B, C]]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val B = dataset.col[B]('b) val framelessSumByAB = dataset .rollup(A, B) .deserialized.mapGroups { case ((a, b), xs) => (a, b, xs.map(_.c).sum) } .collect().run().toVector.sortBy(x => (x._1, x._2)) val sumByAB = data.groupBy(x => (x.a, x.b)) .mapValues { xs => xs.map(_.c).sum } .toVector.map { case ((a, b), c) => (a, b, c) }.sortBy(x => (x._1, x._2)) framelessSumByAB ?= sumByAB } check(forAll(prop[Byte, Int, Long] _)) } test("rollup('a).mapGroups(('a, toVector(('a, 'b))") { def prop[ A: TypedEncoder: Ordering, B: TypedEncoder: Ordering ](data: Vector[X2[A, B]]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val datasetGrouped = dataset .rollup(A) .deserialized.mapGroups((a, xs) => (a, xs.toVector.sorted)) .collect().run().toMap val dataGrouped = data.groupBy(_.a).map { case (k, v) => k -> v.sorted } datasetGrouped ?= dataGrouped } check(forAll(prop[Short, Option[Short]] _)) check(forAll(prop[Option[Short], Short] _)) check(forAll(prop[X1[Option[Short]], Short] _)) } test("rollup('a).flatMapGroups(('a, toVector(('a, 'b))") { def prop[ A: TypedEncoder : Ordering, B: 
TypedEncoder : Ordering ](data: Vector[X2[A, B]]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val datasetGrouped = dataset .rollup(A) .deserialized.flatMapGroups((a, xs) => xs.map(x => (a, x))) .collect().run() .sorted val dataGrouped = data .groupBy(_.a).toSeq .flatMap { case (a, xs) => xs.map(x => (a, x)) } .sorted datasetGrouped ?= dataGrouped } check(forAll(prop[Short, Option[Short]] _)) check(forAll(prop[Option[Short], Short] _)) check(forAll(prop[X1[Option[Short]], Short] _)) } test("rollup('a, 'b).flatMapGroups((('a,'b) toVector((('a,'b), 'c))") { def prop[ A: TypedEncoder : Ordering, B: TypedEncoder : Ordering, C: TypedEncoder : Ordering ](data: Vector[X3[A, B, C]]): Prop = { val dataset = TypedDataset.create(data) val cA = dataset.col[A]('a) val cB = dataset.col[B]('b) val datasetGrouped = dataset .rollup(cA, cB) .deserialized.flatMapGroups((a, xs) => xs.map(x => (a, x))) .collect().run() .sorted val dataGrouped = data .groupBy(t => (t.a, t.b)).toSeq .flatMap { case (a, xs) => xs.map(x => (a, x)) } .sorted datasetGrouped ?= dataGrouped } check(forAll(prop[Short, Option[Short], Long] _)) check(forAll(prop[Option[Short], Short, Int] _)) check(forAll(prop[X1[Option[Short]], Short, Byte] _)) } test("rollupMany('a).agg(sum('b))") { def prop[A: TypedEncoder : Ordering, Out: TypedEncoder : Numeric] (data: List[X1[A]])(implicit summable: CatalystSummable[A, Out]): Prop = { val dataset = TypedDataset.create(data) val A = dataset.col[A]('a) val received = dataset.rollupMany(A).agg(count[X1[A]]()).collect().run().toVector.sortBy(_._2) val expected = dataset.dataset.rollup("a").count().collect().toVector .map(row => (Option(row.getAs[A](0)), row.getAs[Long](1))).sortBy(_._2) received ?= expected } check(forAll(prop[Int, Long] _)) } } ================================================ FILE: dataset/src/test/scala/frameless/ops/SmartProjectTest.scala ================================================ package frameless package ops import 
org.scalacheck.Prop import org.scalacheck.Prop._ import shapeless.test.illTyped case class Foo(i: Int, j: Int, x: String) case class Bar(i: Int, x: String) case class InvalidFooProjectionType(i: Int, x: Boolean) case class InvalidFooProjectionName(i: Int, xerr: String)

/**
 * Tests for `TypedDataset.project`: projecting a dataset onto a case class made of a subset of its
 * columns, and rejecting (at compile time) targets whose field names or types do not match.
 */
class SmartProjectTest extends TypedDatasetSuite {
  // Lazy needed to prevent initialization anterior to the `beforeAll` hook
  lazy val dataset = TypedDataset.create(Foo(1, 2, "hi") :: Foo(2, 3, "there") :: Nil)

  test("project Foo to Bar") {
    assert(dataset.project[Bar].count().run() === 2)
  }

  test("project to InvalidFooProjection should not type check") {
    illTyped("dataset.project[InvalidFooProjectionType]")
    illTyped("dataset.project[InvalidFooProjectionName]")
  }

  test("X4 to X1,X2,X3,X4 projections") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder, D: TypedEncoder](data: Vector[X4[A, B, C, D]]): Prop = {
      val dataset = TypedDataset.create(data)

      // All comparisons are conjoined on purpose: a `?=` written as a bare statement is evaluated
      // and then thrown away, so only the last expression of the block would be checked.
      (dataset.project[X4[A, B, C, D]].collect().run().toVector ?= data) &&
        (dataset.project[X3[A, B, C]].collect().run().toVector ?= data.map(x => X3(x.a, x.b, x.c))) &&
        (dataset.project[X2[A, B]].collect().run().toVector ?= data.map(x => X2(x.a, x.b))) &&
        (dataset.project[X1[A]].collect().run().toVector ?= data.map(x => X1(x.a)))
    }

    check(forAll(prop[Int, String, X1[String], Boolean] _))
    check(forAll(prop[Short, Long, String, Boolean] _))
    check(forAll(prop[Short, (Boolean, Boolean), String, (Int, Int)] _))
    check(forAll(prop[X2[String, Boolean], (Boolean, Boolean), String, Boolean] _))
    check(forAll(prop[X2[String, Boolean], X3[Boolean, Boolean, Long], String, String] _))
  }

  test("X3U to X1,X2,X3 projections") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder](data: Vector[X3U[A, B, C]]): Prop = {
      val dataset = TypedDataset.create(data)

      // Conjoined for the same reason as above: every projection must contribute to the result.
      (dataset.project[X3[A, B, C]].collect().run().toVector ?= data.map(x => X3(x.a, x.b, x.c))) &&
        (dataset.project[X2[A, B]].collect().run().toVector ?= data.map(x => X2(x.a, x.b))) &&
        (dataset.project[X1[A]].collect().run().toVector ?= data.map(x => X1(x.a)))
    }

    check(forAll(prop[Int, String, X1[String]] _))
    check(forAll(prop[Short, Long, String] _))
    check(forAll(prop[Short, (Boolean, Boolean), String] _))
    check(forAll(prop[X2[String, Boolean], (Boolean, Boolean), String] _))
    check(forAll(prop[X2[String, Boolean], X3[Boolean, Boolean, Long], String] _))
  }
}
================================================ FILE: dataset/src/test/scala/frameless/ops/deserialized/FilterTests.scala ================================================ package frameless package ops package deserialized import org.scalacheck.Prop import org.scalacheck.Prop._ class FilterTests extends TypedDatasetSuite { test("filter") { def prop[A: TypedEncoder](filterFunction: A => Boolean, data: Vector[A]): Prop = TypedDataset.create(data). deserialized. filter(filterFunction). collect().run().toVector =? data.filter(filterFunction) check(forAll(prop[Int] _)) check(forAll(prop[String] _)) } } ================================================ FILE: dataset/src/test/scala/frameless/ops/deserialized/FlatMapTests.scala ================================================ package frameless package ops package deserialized import org.scalacheck.Prop import org.scalacheck.Prop._ class FlatMapTests extends TypedDatasetSuite { test("flatMap") { def prop[A: TypedEncoder, B: TypedEncoder](flatMapFunction: A => Vector[B], data: Vector[A]): Prop = TypedDataset.create(data). deserialized. flatMap(flatMapFunction). collect().run().toVector =?
data.flatMap(flatMapFunction) check(forAll(prop[Int, Int] _)) check(forAll(prop[Int, String] _)) check(forAll(prop[String, Int] _)) } } ================================================ FILE: dataset/src/test/scala/frameless/ops/deserialized/MapPartitionsTests.scala ================================================ package frameless package ops package deserialized import org.scalacheck.Prop import org.scalacheck.Prop._ class MapPartitionsTests extends TypedDatasetSuite { test("mapPartitions") { def prop[A: TypedEncoder, B: TypedEncoder](mapFunction: A => B, data: Vector[A]): Prop = { val lifted: Iterator[A] => Iterator[B] = _.map(mapFunction) TypedDataset.create(data). deserialized. mapPartitions(lifted). collect().run().toVector =? data.map(mapFunction) } check(forAll(prop[Int, Int] _)) check(forAll(prop[Int, String] _)) check(forAll(prop[String, Int] _)) } } ================================================ FILE: dataset/src/test/scala/frameless/ops/deserialized/MapTests.scala ================================================ package frameless package ops package deserialized import org.scalacheck.Prop import org.scalacheck.Prop._ class MapTests extends TypedDatasetSuite { test("map") { def prop[A: TypedEncoder, B: TypedEncoder](mapFunction: A => B, data: Vector[A]): Prop = TypedDataset.create(data). deserialized. map(mapFunction). collect().run().toVector =? data.map(mapFunction) check(forAll(prop[Int, Int] _)) check(forAll(prop[Int, String] _)) check(forAll(prop[String, Int] _)) check(forAll(prop[X1[Int], X1[Int]] _)) } } ================================================ FILE: dataset/src/test/scala/frameless/ops/deserialized/ReduceTests.scala ================================================ package frameless package ops package deserialized import org.scalacheck.Prop import org.scalacheck.Prop._ class ReduceTests extends TypedDatasetSuite { def prop[A: TypedEncoder](reduceFunction: (A, A) => A)(data: Vector[A]): Prop = TypedDataset.create(data). deserialized. 
reduceOption(reduceFunction).run() =? data.reduceOption(reduceFunction) test("reduce Int") { check(forAll(prop[Int](_ + _) _)) check(forAll(prop[Int](_ * _) _)) } test("reduce String") { def reduce(s1: String, s2: String): String = (s1 ++ s2).sorted check(forAll(prop[String](reduce) _)) } } ================================================ FILE: dataset/src/test/scala/frameless/package.scala ================================================ import java.time.format.DateTimeFormatter import java.time.{LocalDateTime => JavaLocalDateTime} import org.scalacheck.{Arbitrary, Gen} package object frameless { /** Fixed decimal point to avoid precision problems specific to Spark */ implicit val arbBigDecimal: Arbitrary[BigDecimal] = Arbitrary { for { x <- Gen.chooseNum(-1000, 1000) y <- Gen.chooseNum(0, 1000000) } yield BigDecimal(s"$x.$y") } /** Fixed decimal point to avoid precision problems specific to Spark */ implicit val arbDouble: Arbitrary[Double] = Arbitrary { arbBigDecimal.arbitrary.map(_.toDouble) } implicit val arbSqlDate = Arbitrary { Arbitrary.arbitrary[Int].map(SQLDate) } implicit val arbSqlTimestamp = Arbitrary { Arbitrary.arbitrary[Long].map(SQLTimestamp) } implicit def arbTuple1[A: Arbitrary] = Arbitrary { Arbitrary.arbitrary[A].map(Tuple1(_)) } // see issue with scalacheck non serializable Vector: https://github.com/rickynils/scalacheck/issues/315 implicit def arbVector[A](implicit A: Arbitrary[A]): Arbitrary[Vector[A]] = Arbitrary(Gen.listOf(A.arbitrary).map(_.toVector)) def vectorGen[A: Arbitrary]: Gen[Vector[A]] = arbVector[A].arbitrary implicit val arbUdtEncodedClass: Arbitrary[UdtEncodedClass] = Arbitrary { for { int <- Arbitrary.arbitrary[Int] doubles <- Gen.listOf(arbDouble.arbitrary) } yield new UdtEncodedClass(int, doubles.toArray) } val dateTimeFormatter: DateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm") implicit val localDateArb: Arbitrary[JavaLocalDateTime] = Arbitrary { for { year <- Gen.chooseNum(1900, 2027) month <- 
Gen.chooseNum(1, 12) dayOfMonth <- Gen.chooseNum(1, 28) hour <- Gen.chooseNum(1, 23) minute <- Gen.chooseNum(1, 59) } yield JavaLocalDateTime.of(year, month, dayOfMonth, hour, minute) } /** LocalDateTime String Generator to test time related Spark functions */ val dateTimeStringGen: Gen[List[String]] = for { listOfDates <- Gen.listOf(localDateArb.arbitrary) localDate <- listOfDates } yield localDate.format(dateTimeFormatter) val TEST_OUTPUT_DIR = "target/test-output" /** * Will dive down causes until either the cause is true or there are no more causes * @param t * @param f * @return */ def anyCauseHas(t: Throwable, f: Throwable => Boolean): Boolean = if (f(t)) true else if (t.getCause ne null) anyCauseHas(t.getCause, f) else false /** * Runs up to maxRuns and outputs the number of failures (times thrown) * @param maxRuns * @param thunk * @tparam T * @return the last passing thunk, or null */ def runLoads[T](maxRuns: Int = 1000)(thunk: => T): T ={ var i = 0 var r = null.asInstanceOf[T] var passed = 0 while(i < maxRuns){ i += 1 try { r = thunk passed += 1 if (i % 20 == 0) { println(s"run $i successful") } } catch { case t: Throwable => System.err.println(s"failed unexpectedly on run $i - ${t.getMessage}") } } if (passed != maxRuns) { System.err.println(s"had ${maxRuns - passed} failures out of $maxRuns runs") } r }

  /**
   * Runs a given thunk up to maxRuns times, restarting the thunk whenever the thrown Throwable
   * (or any Throwable in its cause chain, see `anyCauseHas`) satisfies tolerantOf.
   *
   * A Throwable that is not tolerated is propagated immediately, without further attempts.
   *
   * @param tolerantOf decides whether a failure may be retried
   * @param maxRuns maximum number of attempts, must be positive; default of 20
   * @param thunk the computation to evaluate; re-evaluated on every attempt
   * @tparam T the result type of the thunk
   * @return the result of the first successful run; if every run fails with a tolerated error the
   *         last of those errors is re-thrown
   * @throws java.lang.IllegalArgumentException if maxRuns is not positive
   */
  def tolerantRun[T](tolerantOf: Throwable => Boolean, maxRuns: Int = 20)(thunk: => T): T = {
    // Without this guard no attempt would be made and there would be no captured error to
    // re-throw; the previous implementation ended up in `throw null` (a NullPointerException).
    require(maxRuns > 0, s"maxRuns must be positive, but was $maxRuns")

    @scala.annotation.tailrec
    def attempt(run: Int): T = {
      val outcome: Either[Throwable, T] =
        try Right(thunk)
        catch {
          // Only tolerated failures are captured; anything else escapes to the caller untouched.
          case t: Throwable if anyCauseHas(t, tolerantOf) => Left(t)
        }

      outcome match {
        case Right(result) => result
        case Left(_) if run < maxRuns => attempt(run + 1) // rinse and repeat
        case Left(last) =>
          System.err.println(s"Despite being tolerant each of the $maxRuns runs failed, re-throwing the last")
          throw last
      }
    }

    attempt(1)
  }
} ================================================ FILE: dataset/src/test/scala/frameless/sql/package.scala ================================================ package frameless import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.expressions.{And, Or} package object sql { implicit class ExpressionOps(val self: Expression) extends AnyVal { def toList: List[Expression] = { def rec(expr: Expression, acc: List[Expression]): List[Expression] = { expr match { case And(left, right) => rec(left, rec(right, acc)) case Or(left, right) => rec(left, rec(right, acc)) case e => e +: acc } } rec(self, Nil) } } } ================================================ FILE: dataset/src/test/scala/frameless/sql/rules/SQLRulesSuite.scala ================================================ package frameless.sql.rules import frameless._ import frameless.sql._ import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec import org.scalatest.Assertion import org.scalatest.matchers.should.Matchers trait SQLRulesSuite extends TypedDatasetSuite with Matchers { self => protected lazy val path: String = { val tmpDir = System.getProperty("java.io.tmpdir") s"$tmpDir/${self.getClass.getName}" } def withDataset[A: TypedEncoder: CatalystOrdered](payload: A)(f: TypedDataset[A] => Assertion): Assertion = { TypedDataset.create(Seq(payload)).write.mode("overwrite").parquet(path) f(TypedDataset.createUnsafe[A](session.read.parquet(path))) } def predicatePushDownTest[A: TypedEncoder: CatalystOrdered]( expected: X1[A], expectedPushDownFilters: List[Filter], planShouldNotContain: PartialFunction[Expression, Expression], op:
TypedColumn[X1[A], A] => TypedColumn[X1[A], Boolean] ): Assertion = { withDataset(expected) { dataset => val ds = dataset.filter(op(dataset('a))) val actualPushDownFilters = pushDownFilters(ds) val optimizedPlan = ds.queryExecution.optimizedPlan.collect { case logical.Filter(condition, _) => condition }.flatMap(_.toList) // check the optimized plan optimizedPlan.collectFirst(planShouldNotContain) should be (empty) // compare filters actualPushDownFilters shouldBe expectedPushDownFilters val actual = ds.collect().run().toVector.headOption // ensure serialization is not broken actual should be(Some(expected)) } } protected def pushDownFilters[T](ds: TypedDataset[T]): List[Filter] = { val sparkPlan = ds.queryExecution.executedPlan val initialPlan = if (sparkPlan.children.isEmpty) // assume it's AQE sparkPlan match { case aq: AdaptiveSparkPlanExec => aq.initialPlan case _ => sparkPlan } else sparkPlan initialPlan.collect { case fs: FileSourceScanExec => import scala.reflect.runtime.{universe => ru} val runtimeMirror = ru.runtimeMirror(getClass.getClassLoader) val instanceMirror = runtimeMirror.reflect(fs) val getter = ru.typeOf[FileSourceScanExec].member(ru.TermName("pushedDownFilters")).asTerm.getter val m = instanceMirror.reflectMethod(getter.asMethod) val res = m.apply(fs).asInstanceOf[Seq[Filter]] res }.flatten.toList } } ================================================ FILE: dataset/src/test/scala/frameless/syntax/FramelessSyntaxTests.scala ================================================ package frameless package syntax import org.scalacheck.Prop import org.scalacheck.Prop._ import frameless.functions.aggregate._ class FramelessSyntaxTests extends TypedDatasetSuite { // Hide the implicit SparkDelay[Job] on TypedDatasetSuite to avoid ambiguous implicits override val sparkDelay = null def prop[A, B](data: Vector[X2[A, B]])( implicit ev: TypedEncoder[X2[A, B]] ): Prop = { val dataset = TypedDataset.create(data).dataset val dataframe = dataset.toDF() val typedDataset 
= dataset.typed val typedDatasetFromDataFrame = dataframe.unsafeTyped[X2[A, B]] typedDataset.collect().run().toVector ?= typedDatasetFromDataFrame.collect().run().toVector } test("dataset typed - toTyped") { def prop[A, B](data: Vector[X2[A, B]])( implicit ev: TypedEncoder[X2[A, B]] ): Prop = { val dataset = session.createDataset(data)(TypedExpressionEncoder(ev)).typed val dataframe = dataset.toDF() dataset.collect().run().toVector ?= dataframe.unsafeTyped[X2[A, B]].collect().run().toVector } check(forAll(prop[Int, String] _)) check(forAll(prop[X1[Long], String] _)) } test("frameless typed column and aggregate") { def prop[A: TypedEncoder](a: A, b: A): Prop = { val d = TypedDataset.create((a, b) :: Nil) (d.select(d('_1).untyped.typedColumn).collect().run ?= d.select(d('_1)).collect().run).&&( d.agg(first(d('_1))).collect().run() ?= d.agg(first(d('_1)).untyped.typedAggregate).collect().run() ) } check(forAll(prop[Int] _)) check(forAll(prop[X1[Long]] _)) } } ================================================ FILE: dataset/src/test/scala/org/apache/hadoop/fs/local/StreamingFS.scala ================================================ package org.apache.hadoop.fs.local import com.globalmentor.apache.hadoop.fs.BareLocalFileSystem import org.apache.hadoop.fs.DelegateToFileSystem class StreamingFS(uri: java.net.URI, conf: org.apache.hadoop.conf.Configuration) extends DelegateToFileSystem(uri, new BareLocalFileSystem(), conf, "file", false) {} ================================================ FILE: dataset/src/test/spark-3.2/frameless/sql/rules/FramelessLitPushDownTests.scala ================================================ package frameless.sql.rules import frameless._ import frameless.sql._ import frameless.functions.Lit import org.apache.spark.sql.catalyst.util.DateTimeUtils.{currentTimestamp, microsToInstant} import org.apache.spark.sql.sources.{Filter, IsNotNull} import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions.{Cast, 
Expression, GenericRowWithSchema} import java.time.Instant import org.apache.spark.sql.catalyst.plans.logical import org.scalatest.Assertion //Note as InvokeLike and "ConditionalExpression" don't have SPARK-40380 and SPARK-39106 no predicate pushdowns can happen in 3.2.4 class FramelessLitPushDownTests extends SQLRulesSuite { private val now: Long = currentTimestamp() test("java.sql.Timestamp push-down") { val expected = java.sql.Timestamp.from(microsToInstant(now)) val expectedStructure = X1(SQLTimestamp(now)) val expectedPushDownFilters = List(IsNotNull("a")) predicatePushDownTest[SQLTimestamp]( expectedStructure, expectedPushDownFilters, { case e @ expressions.GreaterThanOrEqual(_, _: Lit[_]) => e }, _ >= expectedStructure.a ) } test("java.time.Instant push-down") { val expected = java.sql.Timestamp.from(microsToInstant(now)) val expectedStructure = X1(microsToInstant(now)) val expectedPushDownFilters = List(IsNotNull("a")) predicatePushDownTest[Instant]( expectedStructure, expectedPushDownFilters, { case e @ expressions.GreaterThanOrEqual(_, _: Lit[_]) => e }, _ >= expectedStructure.a ) } test("struct push-down") { type Payload = X4[Int, Int, Int, Int] val expectedStructure = X1(X4(1, 2, 3, 4)) val expected = new GenericRowWithSchema(Array(1, 2, 3, 4), TypedExpressionEncoder[Payload].schema) val expectedPushDownFilters = List(IsNotNull("a")) predicatePushDownTest[Payload]( expectedStructure, expectedPushDownFilters, // Cast not Lit because of SPARK-40380 { case e @ expressions.EqualTo(_, _: Cast) => e }, _ === expectedStructure.a ) } override def predicatePushDownTest[A: TypedEncoder: CatalystOrdered]( expected: X1[A], expectedPushDownFilters: List[Filter], planShouldContain: PartialFunction[Expression, Expression], op: TypedColumn[X1[A], A] => TypedColumn[X1[A], Boolean] ): Assertion = { withDataset(expected) { dataset => val ds = dataset.filter(op(dataset('a))) val actualPushDownFilters = pushDownFilters(ds) val optimizedPlan = 
ds.queryExecution.optimizedPlan.collect { case logical.Filter(condition, _) => condition }.flatMap(_.toList) // check the optimized plan optimizedPlan.collectFirst(planShouldContain) should not be (empty) // compare filters actualPushDownFilters shouldBe expectedPushDownFilters val actual = ds.collect().run().toVector.headOption // ensure serialization is not broken actual should be(Some(expected)) } } } ================================================ FILE: dataset/src/test/spark-3.3+/frameless/sql/rules/FramelessLitPushDownTests.scala ================================================ package frameless.sql.rules import frameless._ import frameless.functions.Lit import org.apache.spark.sql.catalyst.util.DateTimeUtils.{currentTimestamp, microsToInstant} import org.apache.spark.sql.sources.{EqualTo, GreaterThanOrEqual, IsNotNull} import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import java.time.Instant class FramelessLitPushDownTests extends SQLRulesSuite { private val now: Long = currentTimestamp() test("java.sql.Timestamp push-down") { val expected = java.sql.Timestamp.from(microsToInstant(now)) val expectedStructure = X1(SQLTimestamp(now)) val expectedPushDownFilters = List(IsNotNull("a"), GreaterThanOrEqual("a", expected)) predicatePushDownTest[SQLTimestamp]( expectedStructure, expectedPushDownFilters, { case e @ expressions.GreaterThanOrEqual(_, _: Lit[_]) => e }, _ >= expectedStructure.a ) } test("java.time.Instant push-down") { val expected = java.sql.Timestamp.from(microsToInstant(now)) val expectedStructure = X1(microsToInstant(now)) val expectedPushDownFilters = List(IsNotNull("a"), GreaterThanOrEqual("a", expected)) predicatePushDownTest[Instant]( expectedStructure, expectedPushDownFilters, { case e @ expressions.GreaterThanOrEqual(_, _: Lit[_]) => e }, _ >= expectedStructure.a ) } test("struct push-down") { type Payload = X4[Int, Int, Int, Int] val expectedStructure = X1(X4(1, 2, 3, 4)) 
val expected = new GenericRowWithSchema(Array(1, 2, 3, 4), TypedExpressionEncoder[Payload].schema) val expectedPushDownFilters = List(IsNotNull("a"), EqualTo("a", expected)) predicatePushDownTest[Payload]( expectedStructure, expectedPushDownFilters, { case e @ expressions.EqualTo(_, _: Lit[_]) => e }, _ === expectedStructure.a ) } } ================================================ FILE: docs/Cats.md ================================================ # Using Cats with Frameless ```scala mdoc:invisible import org.apache.spark.{SparkConf, SparkContext => SC} import org.apache.spark.sql.SparkSession import org.apache.spark.rdd.RDD val conf: SparkConf = new SparkConf().setMaster("local[4]").setAppName("cats.bec test") implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate() val sc: SC = spark.sparkContext spark.sparkContext.setLogLevel("WARN") System.clearProperty("spark.master.port") System.clearProperty("spark.driver.port") System.clearProperty("spark.hostPort") System.setProperty("spark.cleaner.ttl", "300") import spark.implicits._ import cats.syntax.all._ import cats.effect.{IO, Sync} import cats.data.ReaderT ``` There are two main parts to the `cats` integration offered by Frameless: - effect suspension in `TypedDataset` using `cats-effect` and `cats-mtl` - `RDD` enhancements using algebraic typeclasses in `cats-kernel` All the examples below assume you have previously imported `cats.implicits` and `frameless.cats.implicits`. *Note that you should not import `frameless.syntax._` together with `frameless.cats.implicits._`.* ```scala mdoc import cats.syntax.all._ import frameless.cats.implicits._ ``` ## Effect Suspension in typed datasets As noted in the section about `Job`, all operations on `TypedDataset` are lazy. The results of operations that would normally block on plain Spark APIs are wrapped in a type constructor `F[_]`, for which there exists an instance of `SparkDelay[F]`. 
This typeclass represents the operation of delaying a computation and capturing an implicit `SparkSession`. In the `cats` module, we utilize the typeclasses from `cats-effect` for abstracting over these effect types - namely, we provide an implicit `SparkDelay` instance for all `F[_]` for which there exists an instance of `cats.effect.Sync[F]`. This allows one to run operations on `TypedDataset` in an existing monad stack. For example, given this pre-existing monad stack: ```scala mdoc import frameless.TypedDataset import cats.data.ReaderT import cats.effect.IO import cats.effect.implicits._ type Action[T] = ReaderT[IO, SparkSession, T] ``` We can then request that values from `TypedDataset` be suspended in this stack: ```scala mdoc val typedDs = TypedDataset.create(Seq((1, "string"), (2, "another"))) val result: Action[(Seq[(Int, String)], Long)] = for { sample <- typedDs.take[Action](1) count <- typedDs.count[Action]() } yield (sample, count) ``` As with `Job`, note that nothing has been run yet. The effect has been properly suspended. To run our program, we must first supply the `SparkSession` to the `ReaderT` layer and then run the `IO` effect: ```scala mdoc import cats.effect.unsafe.implicits.global result.run(spark).unsafeRunSync() ``` ### Convenience methods for modifying Spark thread-local variables The `frameless.cats.implicits._` import also provides some syntax enrichments for any monad stack that has the same capabilities as `Action` above: namely, the ability to provide an instance of `SparkSession` and the ability to suspend effects.
For these to work, we will need to import the implicit machinery from the `cats-mtl` library: ```scala mdoc import cats.mtl.implicits._ ``` And now, we can set the description for the computation being run: ```scala mdoc val resultWithDescription: Action[(Seq[(Int, String)], Long)] = for { r <- result.withDescription("fancy cats") session <- ReaderT.ask[IO, SparkSession] _ <- ReaderT.liftF { IO { println(s"Description: ${session.sparkContext.getLocalProperty("spark.job.description")}") } } } yield r resultWithDescription.run(spark).unsafeRunSync() ``` ## Using algebraic typeclasses from Cats with RDDs Data aggregation is one of the most important operations when working with Spark (and data in general). For example, we often have to compute the `min`, `max`, `avg`, etc. from a set of columns grouped by different predicates. This section shows how **cats** simplifies these tasks in Spark by leveraging a large collection of Type Classes for ordering and aggregating data. Cats offers ways to sort and aggregate tuples of arbitrary arity. ```scala mdoc import frameless.cats.implicits._ val data: RDD[(Int, Int, Int)] = sc.makeRDD((1, 2, 3) :: (1, 5, 3) :: (8, 2, 3) :: Nil) println(data.csum) println(data.cmax) println(data.cmin) ``` In case the RDD is empty, the `csum`, `cmax` and `cmin` will use the default values for the type of elements inside the RDD. There are counterpart operations to those that have an `Option` return type to deal with the case of an empty RDD: ```scala mdoc:nest val data: RDD[(Int, Int, Int)] = sc.emptyRDD println(data.csum) println(data.csumOption) println(data.cmax) println(data.cmaxOption) println(data.cmin) println(data.cminOption) ``` The following example aggregates all the elements with a common key. 
```scala mdoc type User = String type TransactionCount = Int val allData: RDD[(User,TransactionCount)] = sc.makeRDD(("Bob", 12) :: ("Joe", 1) :: ("Anna", 100) :: ("Bob", 20) :: ("Joe", 2) :: Nil) val totalPerUser = allData.csumByKey totalPerUser.collectAsMap ``` The same example would work for more complex values. ```scala mdoc import scala.collection.immutable.SortedMap val allDataComplexKeu = sc.makeRDD( ("Bob", SortedMap("task1" -> 10)) :: ("Joe", SortedMap("task1" -> 1, "task2" -> 3)) :: ("Bob", SortedMap("task1" -> 10, "task2" -> 1)) :: ("Joe", SortedMap("task3" -> 4)) :: Nil ) val overalTasksPerUser = allDataComplexKeu.csumByKey overalTasksPerUser.collectAsMap ``` #### Joins ```scala mdoc // Type aliases for meaningful types type TimeSeries = Map[Int,Int] type UserName = String ``` Example: Using the implicit full-outer-join operator ```scala mdoc import frameless.cats.outer._ val day1: RDD[(UserName,TimeSeries)] = sc.makeRDD( ("John", Map(0 -> 2, 1 -> 4)) :: ("Chris", Map(0 -> 1, 1 -> 2)) :: ("Sam", Map(0 -> 1)) :: Nil ) val day2: RDD[(UserName,TimeSeries)] = sc.makeRDD( ("John", Map(0 -> 10, 1 -> 11)) :: ("Chris", Map(0 -> 1, 1 -> 2)) :: ("Joe", Map(0 -> 1, 1 -> 2)) :: Nil ) val daysCombined = day1 |+| day2 daysCombined.collect() ``` Note how each user's time series from the different days have been aggregated together. The `|+|` (Semigroup) operator for key-value pair RDDs executes a full-outer-join on the key and combines the values using the default Semigroup for the value type. For comparison, this is how `|+|` behaves on plain `Map`s in `cats`: ```scala mdoc Map(1 -> 2, 2 -> 3) |+| Map(1 -> 4, 2 -> -1) ``` ```scala mdoc:invisible spark.stop() ``` ================================================ FILE: docs/FeatureOverview.md ================================================ # TypedDataset: Feature Overview This tutorial introduces `TypedDataset` using a simple example. The following imports are needed to make all code examples compile.
```scala mdoc:silent:reset-object import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession import frameless.functions.aggregate._ import frameless.TypedDataset val conf = new SparkConf().setMaster("local[*]").setAppName("Frameless repl").set("spark.ui.enabled", "false") implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate() spark.sparkContext.setLogLevel("WARN") import spark.implicits._ ``` ## Creating TypedDataset instances We start by defining a case class: ```scala mdoc:silent case class Apartment(city: String, surface: Int, price: Double, bedrooms: Int) ``` And few `Apartment` instances: ```scala mdoc:silent val apartments = Seq( Apartment("Paris", 50, 300000.0, 2), Apartment("Paris", 100, 450000.0, 3), Apartment("Paris", 25, 250000.0, 1), Apartment("Lyon", 83, 200000.0, 2), Apartment("Lyon", 45, 133000.0, 1), Apartment("Nice", 74, 325000.0, 3) ) ``` We are now ready to instantiate a `TypedDataset[Apartment]`: ```scala mdoc val aptTypedDs = TypedDataset.create(apartments) ``` We can also create one from an existing Spark `Dataset`: ```scala mdoc:nest val aptDs = spark.createDataset(apartments) val aptTypedDs = TypedDataset.create(aptDs) ``` Or use the Frameless syntax: ```scala mdoc import frameless.syntax._ val aptTypedDs2 = aptDs.typed ``` ## Typesafe column referencing This is how we select a particular column from a `TypedDataset`: ```scala mdoc val cities: TypedDataset[String] = aptTypedDs.select(aptTypedDs('city)) ``` This is completely type-safe, for instance suppose we misspell `city` as `citi`: ```scala mdoc:fail aptTypedDs.select(aptTypedDs('citi)) ``` This gets raised at compile time, whereas with the standard `Dataset` API the error appears at runtime (enjoy the stack trace): ```scala mdoc:crash aptDs.select('citi) ``` `select()` supports arbitrary column operations: ```scala mdoc aptTypedDs.select(aptTypedDs('surface) * 10, aptTypedDs('surface) + 2).show().run() ``` Note that unlike 
the standard Spark API, where some operations are lazy and some are not, **all `TypedDataset` operations are lazy.** In the above example, `show()` is lazy. You need to call `run()` for the `show` job to materialize. A more detailed explanation of `Job` is given [here](Job.md). Next we compute the price by surface unit: ```scala mdoc:fail val priceBySurfaceUnit = aptTypedDs.select(aptTypedDs('price) / aptTypedDs('surface)) ``` As the error suggests, we can't divide a `TypedColumn` of `Double` by a `TypedColumn` of `Int`. For safety, in Frameless only math operations between the same types are allowed: ```scala mdoc val priceBySurfaceUnit = aptTypedDs.select(aptTypedDs('price) / aptTypedDs('surface).cast[Double]) priceBySurfaceUnit.collect().run() ``` Looks like it worked, but that `cast` seems unsafe, right? Actually it is safe. Let's try to cast a `TypedColumn` of `String` to `Double`: ```scala mdoc:fail aptTypedDs('city).cast[Double] ``` The compile-time error tells us that to perform the cast, evidence (in the form of `CatalystCast[String, Double]`) must be available. Since casting from `String` to `Double` is not allowed, this results in a compilation error. Check [here](https://github.com/typelevel/frameless/blob/master/core/src/main/scala/frameless/CatalystCast.scala) for the set of available `CatalystCast` instances. ## Working with Optional columns When working with real data we have to deal with imperfections, such as missing fields. Columns that may have missing data should be represented using `Option`. For this example, let's assume that the Apartments dataset may have missing values.
```scala mdoc:silent case class ApartmentOpt(city: Option[String], surface: Option[Int], price: Option[Double], bedrooms: Option[Int]) ``` ```scala mdoc:silent val apartmentsOpt = Seq( ApartmentOpt(Some("Paris"), Some(50), Some(300000.0), None), ApartmentOpt(None, None, Some(450000.0), Some(3)) ) ``` ```scala mdoc val aptTypedDsOpt = TypedDataset.create(apartmentsOpt) aptTypedDsOpt.show().run() ``` Unfortunately the syntax used above with `select()` will not work here: ```scala mdoc:fail aptTypedDsOpt.select(aptTypedDsOpt('surface) * 10, aptTypedDsOpt('surface) + 2).show().run() ``` This is because we cannot multiply an `Option` by an `Int`. In Scala, `Option` has a `map()` method to help address exactly this (e.g., `Some(10).map(c => c * 2)`). Frameless follows a similar convention. By applying the `opt` method on any `Option[X]` column you can then use `map()` to provide a function that works with the unwrapped type `X`. This is best shown in the example below: ```scala mdoc aptTypedDsOpt.select(aptTypedDsOpt('surface).opt.map(c => c * 10), aptTypedDsOpt('surface).opt.map(_ + 2)).show().run() ``` **Known issue**: `map()` will throw a runtime exception when the applied function includes a `udf()`. If you want to apply a `udf()` to an optional column, we recommend changing your `udf` to work directly with `Option` fields. ## Casting and projections In the general case, `select()` returns a TypedDataset of type `TypedDataset[TupleN[...]]` (with N in `[1...10]`). For example, if we select three columns with types `String`, `Int`, and `Boolean` the result will have type `TypedDataset[(String, Int, Boolean)]`. We often want to give more expressive types to the result of our computations. `as[T]` allows us to safely cast a `TypedDataset[U]` to another of type `TypedDataset[T]` as long as the types in `U` and `T` align.
When the cast is valid the expression compiles: ```scala mdoc case class UpdatedSurface(city: String, surface: Int) val updated = aptTypedDs.select(aptTypedDs('city), aptTypedDs('surface) + 2).as[UpdatedSurface] updated.show(2).run() ``` Next we try to cast a `(String, String)` to an `UpdatedSurface` (which has types `String`, `Int`). The cast is not valid and the expression does not compile: ```scala mdoc:fail aptTypedDs.select(aptTypedDs('city), aptTypedDs('city)).as[UpdatedSurface] ``` ### Advanced topics with `select()` When you `select()` a single column that has type `A`, the resulting type is `TypedDataset[A]` and not `TypedDataset[Tuple1[A]]`. This behavior makes working with nested schema easier (i.e., in the case where `A` is a complex data type) and simplifies type-checking column operations (e.g., verify that two columns can be added, divided, etc.). However, when `A` is scalar, say a `Long`, it makes it harder to select and work with the resulting `TypedDataset[Long]`. For instance, it's harder to reference this single scalar column using `select()`. If this becomes an issue, you can bypass this behavior by using the `selectMany()` method instead of `select()`. In the previous example, `selectMany()` will return `TypedDataset[Tuple1[Long]]` and you can reference its single column using the name `_1`. `selectMany()` should also be used when you need to select more than 10 columns. `select()` has better IDE support and compiles faster than the macro based `selectMany()`, so prefer `select()` for the most common use cases. When you are handed a single scalar column TypedDataset (e.g., `TypedDataset[Double]`) the best way to reference its single column is using the `asCol` (short for "as a column") method. This is best shown in the example below. We will see more usages of `asCol` later in this tutorial. 
```scala mdoc:nest val priceBySurfaceUnit = aptTypedDs.select(aptTypedDs('price) / aptTypedDs('surface).cast[Double]) priceBySurfaceUnit.select(priceBySurfaceUnit.asCol * 2).show(2).run() ``` ### Projections We often want to work with a subset of the fields in a dataset. Projections allow us to easily select our fields of interest while preserving their initial names and types for extra safety. Here is an example using the `TypedDataset[Apartment]` with an additional column: ```scala mdoc val aptds = aptTypedDs // For shorter expressions case class ApartmentDetails(city: String, price: Double, surface: Int, ratio: Double) val aptWithRatio = aptds.select( aptds('city), aptds('price), aptds('surface), aptds('price) / aptds('surface).cast[Double] ).as[ApartmentDetails] ``` Suppose we only want to work with `city` and `ratio`: ```scala mdoc case class CityInfo(city: String, ratio: Double) val cityRatio = aptWithRatio.project[CityInfo] cityRatio.show(2).run() ``` Suppose we only want to work with `price` and `ratio`: ```scala mdoc case class PriceInfo(ratio: Double, price: Double) val priceInfo = aptWithRatio.project[PriceInfo] priceInfo.show(2).run() ``` We see that the order of the fields does not matter as long as the names and the corresponding types agree. However, if we make a mistake in any of the names and/or their types, then we get a compilation error. Say we make a typo in a field name: ```scala mdoc:silent case class PriceInfo2(ratio: Double, pricEE: Double) ``` ```scala mdoc:fail aptWithRatio.project[PriceInfo2] ``` Say we make a mistake in the corresponding type: ```scala mdoc:silent case class PriceInfo3(ratio: Int, price: Double) // ratio should be Double ``` ```scala mdoc:fail aptWithRatio.project[PriceInfo3] ``` ### Union of TypedDatasets Lets create a projection of our original dataset with a subset of the fields. 
```scala mdoc:nest:silent case class ApartmentShortInfo(city: String, price: Double, bedrooms: Int) val aptTypedDs2: TypedDataset[ApartmentShortInfo] = aptTypedDs.project[ApartmentShortInfo] ``` The union of `aptTypedDs2` with `aptTypedDs` uses all the fields of the caller (`aptTypedDs2`) and expects the other dataset (`aptTypedDs`) to include all those fields. If field names/types do not match you get a compilation error. ```scala mdoc aptTypedDs2.union(aptTypedDs).show().run ``` The other way around will not compile, since `aptTypedDs2` has only a subset of the fields. ```scala mdoc:fail aptTypedDs.union(aptTypedDs2).show().run ``` Finally, as with `project`, `union` will align fields that have same names/types, so fields do not have to be in the same order. ## TypedDataset functions and transformations Frameless supports many of Spark's functions and transformations. However, whenever a Spark function does not exist in Frameless, calling `.dataset` will expose the underlying `Dataset` (from org.apache.spark.sql, the original Spark APIs), where you can use anything that would be missing from the Frameless' API. These are the main imports for Frameless' aggregate and non-aggregate functions. ```scala import frameless.functions._ // For literals import frameless.functions.nonAggregate._ // e.g., concat, abs import frameless.functions.aggregate._ // e.g., count, sum, avg ``` ### Drop/Replace/Add fields `dropTupled()` drops a single column and results in a tuple-based schema. ```scala mdoc aptTypedDs2.dropTupled('price): TypedDataset[(String,Int)] ``` To drop a column and specify a new schema use `drop()`. ```scala mdoc case class CityBeds(city: String, bedrooms: Int) val cityBeds: TypedDataset[CityBeds] = aptTypedDs2.drop[CityBeds] ``` Often, you want to replace an existing column with a new value. ```scala mdoc val inflation = aptTypedDs2.withColumnReplaced('price, aptTypedDs2('price) * 2) inflation.show(2).run() ``` Or use a literal instead. 
```scala mdoc import frameless.functions.lit aptTypedDs2.withColumnReplaced('price, lit(0.001)) ``` Adding a column using `withColumnTupled()` results in a tuple-based schema. ```scala mdoc aptTypedDs2.withColumnTupled(lit(Array("a","b","c"))).show(2).run() ``` Similarly, `withColumn()` adds a column and explicitly expects a schema for the result. ```scala mdoc case class CityBedsOther(city: String, bedrooms: Int, other: List[String]) cityBeds. withColumn[CityBedsOther](lit(List("a","b","c"))). show(1).run() ``` To conditionally change a column use the `when/otherwise` operation. ```scala mdoc import frameless.functions.nonAggregate.when aptTypedDs2.withColumnTupled( when(aptTypedDs2('city) === "Paris", aptTypedDs2('price)). when(aptTypedDs2('city) === "Lyon", lit(1.1)). otherwise(lit(0.0))).show(8).run() ``` A simple way to add a column without losing important schema information is to project the entire source schema into a single column using the `asCol()` method. ```scala mdoc val c = cityBeds.select(cityBeds.asCol, lit(List("a","b","c"))) c.show(1).run() ``` When working with Spark's `DataFrame`s, you often select all columns using `.select($"*", ...)`. In a way, `asCol()` is a typed equivalent of `$"*"`. To access nested columns, use the `colMany()` method. ```scala mdoc c.select(c.colMany('_1, 'city), c('_2)).show(2).run() ``` ### Working with collections ```scala mdoc import frameless.functions._ import frameless.functions.nonAggregate._ ``` ```scala mdoc val t = cityRatio.select(cityRatio('city), lit(List("abc","c","d"))) t.withColumnTupled( arrayContains(t('_2), "abc") ).show(1).run() ``` If you accidentally apply a collection function on a column that is not a collection, you get a compilation error. ```scala mdoc:fail t.withColumnTupled( arrayContains(t('_1), "abc") ) ``` Flattening columns in Spark is done with the `explode()` method. Unlike vanilla Spark, in Frameless `explode()` is part of `TypedDataset` and not a function of a column.
This provides additional safety since more than one `explode()` applied in a single statement results in a runtime error in vanilla Spark. ```scala mdoc val t2 = cityRatio.select(cityRatio('city), lit(List(1,2,3,4))) val flattened = t2.explode('_2): TypedDataset[(String, Int)] flattened.show(4).run() ``` Here is an example of how `explode()` may fail in vanilla Spark. The Frameless implementation does not suffer from this problem since, by design, it can only be applied to a single column at a time. ```scala mdoc:fail { import org.apache.spark.sql.functions.{explode => sparkExplode} t2.dataset.toDF().select(sparkExplode($"_2"), sparkExplode($"_2")) } ``` ### Collecting data to the driver In Frameless all Spark actions (such as `collect()`) are safe. Take the first element from a dataset (if the dataset is empty return `None`). ```scala mdoc cityBeds.headOption.run() ``` Take the first `n` elements. ```scala mdoc cityBeds.take(2).run() ``` ```scala mdoc cityBeds.head(3).run() ``` ```scala mdoc cityBeds.limit(4).collect().run() ``` ## Sorting columns Only column types that can be sorted are allowed to be selected for sorting. ```scala mdoc aptTypedDs.orderBy(aptTypedDs('city).asc).show(2).run() ``` The ordering can be changed by selecting `.asc` or `.desc`.
```scala mdoc aptTypedDs.orderBy( aptTypedDs('city).asc, aptTypedDs('price).desc ).show(2).run() ``` ## User Defined Functions Frameless supports lifting any Scala function (up to five arguments) to the context of a particular `TypedDataset`: ```scala mdoc:nest // The function we want to use as UDF val priceModifier = (name: String, price:Double) => if(name == "Paris") price * 2.0 else price val udf = aptTypedDs.makeUDF(priceModifier) val aptds = aptTypedDs // For shorter expressions val adjustedPrice = aptds.select(aptds('city), udf(aptds('city), aptds('price))) adjustedPrice.show().run() ``` ## GroupBy and Aggregations Let's suppose we wanted to retrieve the average apartment price in each city ```scala mdoc val priceByCity = aptTypedDs.groupBy(aptTypedDs('city)).agg(avg(aptTypedDs('price))) priceByCity.collect().run() ``` Again if we try to aggregate a column that can't be aggregated, we get a compilation error ```scala mdoc:fail aptTypedDs.groupBy(aptTypedDs('city)).agg(avg(aptTypedDs('city))) ``` Next, we combine `select` and `groupBy` to calculate the average price/surface ratio per city: ```scala mdoc:nest val aptds = aptTypedDs // For shorter expressions val cityPriceRatio = aptds.select(aptds('city), aptds('price) / aptds('surface).cast[Double]) cityPriceRatio.groupBy(cityPriceRatio('_1)).agg(avg(cityPriceRatio('_2))).show().run() ``` We can also use `pivot` to further group data on a secondary column. For example, we can compare the average price across cities by number of bedrooms. ```scala mdoc case class BedroomStats( city: String, AvgPriceBeds1: Option[Double], // Pivot values may be missing, so we encode them using Options AvgPriceBeds2: Option[Double], AvgPriceBeds3: Option[Double], AvgPriceBeds4: Option[Double]) val bedroomStats = aptds. groupBy(aptds('city)). pivot(aptds('bedrooms)). on(1,2,3,4). // We only care for up to 4 bedrooms agg(avg(aptds('price))). 
as[BedroomStats] // Typesafe casting bedroomStats.show().run() ``` With pivot, collecting data preserves typesafety by encoding potentially missing columns with `Option`. ```scala mdoc bedroomStats.collect().run().foreach(println) ``` #### Working with Optional fields Optional fields can be converted to non-optional using `getOrElse()`. ```scala mdoc val sampleStats = bedroomStats.select( bedroomStats('AvgPriceBeds2).getOrElse(0.0), bedroomStats('AvgPriceBeds3).getOrElse(0.0)) sampleStats.show().run() ``` In addition, optional columns can be flattened using the `.flattenOption` method on `TypedDataset`. The result contains the rows for which the flattened column is not `None` (or null). The schema is automatically adapted to reflect this change. ```scala mdoc val flattenStats = bedroomStats.flattenOption('AvgPriceBeds2) // The second Option[Double] is now of type Double, since all 'null' values are removed flattenStats: TypedDataset[(String, Option[Double], Double, Option[Double], Option[Double])] ``` In a DataFrame, if you just ignore types, this would equivalently be written as: ```scala mdoc bedroomStats.dataset.toDF().filter($"AvgPriceBeds2".isNotNull) ``` ### Entire TypedDataset Aggregation We often want to aggregate the entire `TypedDataset` and skip the `groupBy()` clause. In Frameless you can do this using the `agg()` operator directly on the `TypedDataset`. In the following example, we compute the average price, the average surface, the minimum surface, and the set of cities for the entire dataset. ```scala mdoc case class Stats( avgPrice: Double, avgSurface: Double, minSurface: Int, allCities: Vector[String]) aptds.agg( avg(aptds('price)), avg(aptds('surface)), min(aptds('surface)), collectSet(aptds('city)) ).as[Stats].show().run() ``` You may apply any `TypedColumn` operation to a `TypedAggregate` column as well.
```scala mdoc import frameless.functions._ aptds.agg( avg(aptds('price)) * min(aptds('surface)).cast[Double], avg(aptds('surface)) * 0.2, litAggr("Hello World") ).show().run() ``` ## Joins ```scala mdoc:silent case class CityPopulationInfo(name: String, population: Int) val cityInfo = Seq( CityPopulationInfo("Paris", 2229621), CityPopulationInfo("Lyon", 500715), CityPopulationInfo("Nice", 343629) ) val citiInfoTypedDS = TypedDataset.create(cityInfo) ``` Here is how to join the population information to the apartment's dataset: ```scala mdoc val withCityInfo = aptTypedDs.joinInner(citiInfoTypedDS) { aptTypedDs('city) === citiInfoTypedDS('name) } withCityInfo.show().run() ``` The joined TypedDataset has type `TypedDataset[(Apartment, CityPopulationInfo)]`. We can then select which information we want to continue to work with: ```scala mdoc case class AptPriceCity(city: String, aptPrice: Double, cityPopulation: Int) withCityInfo.select( withCityInfo.colMany('_2, 'name), withCityInfo.colMany('_1, 'price), withCityInfo.colMany('_2, 'population) ).as[AptPriceCity].show().run ``` ### Chained Joins Joins, or any similar operation, may be chained using a thrush combinator removing the need for intermediate values. 
Instead of: ```scala mdoc val withBedroomInfoInterim = aptTypedDs.joinInner(citiInfoTypedDS)( aptTypedDs('city) === citiInfoTypedDS('name) ) val withBedroomInfo = withBedroomInfoInterim .joinLeft(bedroomStats)( withBedroomInfoInterim.col('_1).field('city) === bedroomStats('city) ) withBedroomInfo.show().run() ``` You can use thrush from [mouse](https://github.com/typelevel/mouse): ```scala libraryDependencies += "org.typelevel" %% "mouse" % "1.2.1" ``` ```scala mdoc import mouse.all._ val withBedroomInfoChained = aptTypedDs.joinInner(citiInfoTypedDS)( aptTypedDs('city) === citiInfoTypedDS('name) ) .thrush( interim => interim.joinLeft(bedroomStats)( interim.col('_1).field('city) === bedroomStats('city) ) ) withBedroomInfoChained.show().run() ``` ```scala mdoc:invisible spark.stop() ``` ================================================ FILE: docs/Injection.md ================================================ # Injection: Creating Custom Encoders ```scala mdoc:invisible:reset-object import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession import frameless.functions.aggregate._ import frameless.TypedDataset val conf = new SparkConf().setMaster("local[*]").setAppName("frameless repl").set("spark.ui.enabled", "false") implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate() spark.sparkContext.setLogLevel("WARN") import spark.implicits._ ``` Injection lets us define encoders for types that do not have one by injecting `A` into an encodable type `B`. 
This is the definition of the injection typeclass: ```scala trait Injection[A, B] extends Serializable { def apply(a: A): B def invert(b: B): A } ``` ## Example Let's define a simple case class: ```scala mdoc case class Person(age: Int, birthday: java.util.Calendar) val people = Seq(Person(42, new java.util.GregorianCalendar())) ``` And try to create an instance of a `TypedDataset`: ```scala mdoc:fail:nest val personDS = TypedDataset.create(people) ``` Looks like we can't: a `TypedEncoder` instance for `Person` is not available or, more precisely, there is none for `java.util.Calendar`. But we can define an injection from `java.util.Calendar` to an encodable type, like `Long`: ```scala mdoc import java.util.Calendar import frameless._ implicit val calendarToLongInjection = new Injection[Calendar, Long] { def apply(d: Calendar): Long = d.getTime.getTime def invert(l: Long): Calendar = { val cal = new java.util.GregorianCalendar() cal.setTime(new java.util.Date(l)) cal } } ``` We can be less verbose using the `Injection.apply` function: ```scala mdoc:nest import frameless._ import java.util.Calendar implicit val calendarToLongInjection = Injection[Calendar, Long]( (_: Calendar).getTime.getTime, { (l: Long) => val cal = new java.util.GregorianCalendar() cal.setTime(new java.util.Date(l)) cal }) ``` Now we can create our `TypedDataset`: ```scala mdoc val personDS = TypedDataset.create(people) ``` ```scala mdoc:invisible spark.stop() ``` ## Another example ```scala mdoc:invisible:reset-object import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession import frameless.functions.aggregate._ import frameless.TypedDataset val conf = new SparkConf(). setMaster("local[*]"). setAppName("frameless repl"). set("spark.ui.enabled", "false") implicit val spark = SparkSession.builder().
config(conf).appName("REPL").getOrCreate() spark.sparkContext.setLogLevel("WARN") import spark.implicits._ ``` Let's define a sealed family: ```scala mdoc sealed trait Gender case object Male extends Gender case object Female extends Gender case object Other extends Gender ``` And a simple case class: ```scala mdoc case class Person(age: Int, gender: Gender) val people = Seq(Person(42, Male)) ``` Again if we try to create a `TypedDataset`, we get a compilation error. ```scala mdoc:fail:nest val personDS = TypedDataset.create(people) ``` Let's define an injection instance for `Gender`: ```scala mdoc import frameless._ implicit val genderToInt: Injection[Gender, Int] = Injection( { case Male => 1 case Female => 2 case Other => 3 }, { case 1 => Male case 2 => Female case 3 => Other }) ``` And now we can create our `TypedDataset`: ```scala mdoc val personDS = TypedDataset.create(people) ``` ```scala mdoc:invisible spark.stop() ``` Alternatively, an injection instance can be derived for sealed families such as `Gender` using the following import, `import frameless.TypedEncoder.injections._`. This will encode the data constructors as strings. **Known issue**: An invalid injection instance will be derived if there are data constructors with the same name. For example, consider the following sealed family: ```scala mdoc sealed trait Foo object A { case object Bar extends Foo } object B { case object Bar extends Foo } ``` `A.Bar` and `B.Bar` will both be encoded as `"Bar"` thereby breaking the law that `invert(apply(x)) == x`. ================================================ FILE: docs/Job.md ================================================ # Job\[A\] All operations on `TypedDataset` are lazy. An operation either returns a new transformed `TypedDataset` or an `F[A]`, where `F[_]` is a type constructor with an instance of the `SparkDelay` typeclass and `A` is the result of running a non-lazy computation in Spark. 
A default type constructor of this kind, called `Job`, is provided by Frameless. `Job` serves several functions: - Makes all operations on a `TypedDataset` lazy, which makes them more predictable compared to having some operations lazy and others strict - Allows the programmer to make expensive blocking operations explicit - Allows for Spark jobs to be lazily sequenced using monadic composition via for-comprehension - Provides an obvious place where you can annotate/name your Spark jobs to make it easier to track different parts of your application in the Spark UI The following toy example showcases the use of a for-comprehension to explicitly sequence Spark jobs. First we calculate the size of the `TypedDataset` and then we collect to the driver exactly 20% of its elements: ```scala mdoc:invisible import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession import frameless.functions.aggregate._ import frameless.TypedDataset val conf = new SparkConf().setMaster("local[*]").setAppName("frameless repl").set("spark.ui.enabled", "false") implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate() spark.sparkContext.setLogLevel("WARN") import spark.implicits._ ``` ```scala mdoc import frameless.syntax._ val ds = TypedDataset.create(1 to 20) val countAndTakeJob = for { count <- ds.count() sample <- ds.take((count/5).toInt) } yield sample countAndTakeJob.run() ``` The `countAndTakeJob` can either be executed using `run()` (as we show above) or it can be passed along to other parts of the program to be further composed into more complex sequences of Spark jobs. ```scala mdoc import frameless.Job def computeMinOfSample(sample: Job[Seq[Int]]): Job[Int] = sample.map(_.min) val finalJob = computeMinOfSample(countAndTakeJob) ``` Now we can execute this new job by specifying a [group-id][group-id] and a description. This allows the programmer to see this information on the Spark UI and helps track, say, performance issues.
```scala mdoc finalJob. withGroupId("samplingJob"). withDescription("Samples 20% of elements and computes the min"). run() ``` ```scala mdoc:invisible spark.stop() ``` [group-id]: https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.SparkContext@setJobGroup(groupId:String,description:String,interruptOnCancel:Boolean):Unit ## More on `SparkDelay` As mentioned above, `SparkDelay[F[_]]` is a typeclass required for suspending effects by Spark computations. This typeclass represents the ability to suspend an `=> A` thunk into an `F[A]` value, while implicitly capturing a `SparkSession`. As it is a typeclass, it is open for implementation by the user in order to use other data types for suspension of effects. The `cats` module, for example, uses this typeclass to support suspending Spark computations in any effect type that has a `cats.effect.Sync` instance. ================================================ FILE: docs/TypedDataFrame.md ================================================ # Proof of Concept: TypedDataFrame `TypedDataFrame` is the API developed in the early stages of Frameless to manipulate Spark `DataFrame`s in a type-safe manner. With the introduction of `Dataset` in Spark 1.6, `DataFrame` seems deprecated and won't be the focus of future development of Frameless. However, the design is interesting enough to document. To safely manipulate `DataFrame`s we use a technique called a *shadow type*, which consists in storing additional information about a value in a "dummy" type. Mirroring value-level computation at the type-level lets us leverage the type system to catch common mistakes at compile time. ### Diving in In `TypedDataFrame`, we use a single `Schema <: Product` to model the number, the types and the names of columns. 
Here is what the definition of `TypedDataFrame` looks like, with simplified type signatures: ```scala import org.apache.spark.sql.DataFrame import shapeless.HList class TDataFrame[Schema <: Product](df: DataFrame) { def filter(predicate: Schema => Boolean): TDataFrame[Schema] = ??? def select[C <: HList, Out <: Product](columns: C): TDataFrame[Out] = ??? def innerJoin[OtherS <: Product, Out <: Product] (other: TDataFrame[OtherS]): TDataFrame[Out] = ??? // Followed by equivalent of every DataFrame method with improved signature } ``` As you can see, instead of the `def filter(conditionExpr: String): DataFrame` defined in Spark, the `TypedDataFrame` version expects a function from `Schema` to `Boolean`, and models the fact that the resulting `DataFrame` will still hold elements of type `Schema`. ### Type-level column referencing For Spark's `DataFrame`s, column referencing is done directly by `String`s or using the `Column` type which provides no additional type safety. `TypedDataFrame` improves on that by catching invalid column references at compile time. When everything goes well, Frameless select is very similar to vanilla select, except that it keeps track of the selected column types: ```scala import frameless.TypedDataFrame case class Foo(s: String, d: Double, i: Int) def selectIntString(tf: TypedDataFrame[Foo]): TypedDataFrame[(Int, String)] = tf.select('i, 's) ``` However, in case of a typo, it gets caught right away: ```scala def selectIntStringTypo(tf: TypedDataFrame[Foo]): TypedDataFrame[(Int, String)] = tf.select('j, 's) ``` ### Type-level joins Joins are available with two different syntaxes.
The first lets you reference different columns on each `TypedDataFrame`, and ensures that they all exist and have compatible types: ```scala case class Bar(i: Int, j: String, b: Boolean) def join1(tf1: TypedDataFrame[Foo], tf2: TypedDataFrame[Bar]) : TypedDataFrame[(String, Double, Int, Int, String, Boolean)] = tf1.innerJoin(tf2).on('s).and('j) ``` The second syntax brings some convenience when the joining columns have identical names in both tables: ```scala def join2(tf1: TypedDataFrame[Foo], tf2: TypedDataFrame[Bar]) : TypedDataFrame[(String, Double, Int, String, Boolean)] = tf1.innerJoin(tf2).using('i) ``` Further examples are available in the [TypedDataFrame join tests](https://github.com/typelevel/frameless/blob/17194d2172e75f8994e9481181e85b4c8dcc0f69/dataframe/src/test/scala/JoinTests.scala). ### Complete example We now consider a complete example to see how the Frameless types can improve not only correctness but also the readability of Spark jobs. Consider the following domain of phonebooks, city maps and neighborhoods: ```scala mdoc:silent type Neighborhood = String type Address = String case class PhoneBookEntry( address: Address, residents: String, phoneNumber: Double ) case class CityMapEntry( address: Address, neighborhood: Neighborhood ) ``` Our goal will be to find the neighborhood with the most unique names, approximating "unique" with names containing less common letters in the alphabet: 'x', 'q', and 'z'. We are going to need a natural language processing library at some point, so let's use the following for the example: ```scala mdoc:silent object NLPLib { def uniqueName(name: String): Boolean = name.exists(Set('x', 'q', 'z')) } ``` Suppose we manage to obtain public data for a `TypedDataFrame[PhoneBookEntry]` and `TypedDataFrame[CityMapEntry]`.
Here is what our Spark job could look like with Frameless: ```scala import org.apache.spark.sql.SQLContext // These case classes are used to hold intermediate results case class Family(residents: String, neighborhood: Neighborhood) case class Person(name: String, neighborhood: Neighborhood) case class NeighborhoodCount(neighborhood: Neighborhood, count: Long) def bestNeighborhood (phoneBookTF: TypedDataFrame[PhoneBookEntry], cityMapTF: TypedDataFrame[CityMapEntry]) (implicit c: SQLContext): String = { ((((((((( phoneBookTF .innerJoin(cityMapTF).using('address) :TypedDataFrame[(Address, String, Double, String)]) .select('_2, '_4) :TypedDataFrame[(String, String)]) .as[Family]() :TypedDataFrame[Family]) .flatMap { f => f.residents.split(' ').map(r => Person(r, f.neighborhood)) } :TypedDataFrame[Person]) .filter { p => NLPLib.uniqueName(p.name) } :TypedDataFrame[Person]) .groupBy('neighborhood).count() :TypedDataFrame[(String, Long)]) .as[NeighborhoodCount]() :TypedDataFrame[NeighborhoodCount]) .sortDesc('count) :TypedDataFrame[NeighborhoodCount]) .select('neighborhood) :TypedDataFrame[Tuple1[String]]) .head._1 } ``` If you compare this version to vanilla Spark where every line is a `DataFrame`, you see how much types can improve readability. An executable version of this example is available in the [BestNeighborhood test](https://github.com/typelevel/frameless/blob/17194d2172e75f8994e9481181e85b4c8dcc0f69/dataframe/src/test/scala/BestNeighborhood.scala). ### Limitations The main limitation of this approach comes from Scala 2.10, which limits the arity of case classes to 22. Because of the way `DataFrame` models joins, joining two tables with more than 11 fields results in a `DataFrame` which is not representable with a `Schema` of type `Product`. In the `Dataset` API introduced in Spark 1.6, the way joins are handled was rethought to return a pair of both schemas instead of a flat table, which moderates the trouble caused by case class limitations.
Alternatively, since Scala 2.11, it is possible to define Tuple23 and onward. Sadly, due to the way Spark is commonly packaged in various systems, the number of Spark users having access to Scala 2.11 and *not* to Spark 1.6 is essentially zero. For these reasons, further development in Frameless will target Spark 1.6+, deprecating the early work on `TypedDataFrame`. ================================================ FILE: docs/TypedDatasetVsSparkDataset.md ================================================ # Comparing TypedDatasets with Spark's Datasets ```scala mdoc:invisible:reset-object import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession val conf = new SparkConf().setMaster("local[*]").setAppName("test").set("spark.ui.enabled", "false").set("spark.app.id", "tut-dataset") implicit val spark = SparkSession.builder().config(conf).getOrCreate() System.clearProperty("spark.master.port") System.clearProperty("spark.driver.port") System.clearProperty("spark.hostPort") System.setProperty("spark.cleaner.ttl", "300") // We are using this directory so let's make sure it is clean first org.apache.commons.io.FileUtils.deleteDirectory(new java.io.File("/tmp/foo/")) ``` **Goal:** This tutorial compares the standard Spark Datasets API with the one provided by Frameless' `TypedDataset`. It shows how `TypedDataset`s allow for an expressive and type-safe API with no compromises on performance. For this tutorial we first create a simple dataset and save it on disk as a parquet file. [Parquet](https://parquet.apache.org/) is a popular columnar format and well supported by Spark. It's important to note that when operating on parquet datasets, Spark knows that each column is stored separately, so if we only need a subset of the columns Spark will optimize for this and avoid reading the entire dataset. This is a rather simplistic view of how Spark and parquet work together but it will serve us well for the context of this discussion.
```scala mdoc import spark.implicits._ // Our example case class Foo acting here as a schema case class Foo(i: Long, j: String) // Assuming spark is loaded and SparkSession is bind to spark val initialDs = spark.createDataset( Foo(1, "Q") :: Foo(10, "W") :: Foo(100, "E") :: Nil ) // Assuming you are on Linux or Mac OS initialDs.write.parquet("/tmp/foo") val ds = spark.read.parquet("/tmp/foo").as[Foo] ds.show() ``` The value `ds` holds the content of the `initialDs` read from a parquet file. Let's try to only use field `i` from Foo and see how Spark's Catalyst (the query optimizer) optimizes this. ```scala mdoc // Using a standard Spark TypedColumn in select() val filteredDs = ds.filter($"i" === 10).select($"i".as[Long]) filteredDs.show() ``` The `filteredDs` is of type `Dataset[Long]`. Since we only access field `i` from `Foo` the type is correct. Unfortunately, this syntax requires handholding by explicitly setting the `TypedColumn` in the `select` statement to return type `Long` (look at the `as[Long]` statement). We will discuss this limitation next in more detail. Now, let's take a quick look at the optimized Physical Plan that Spark's Catalyst generated. ```scala mdoc filteredDs.explain() ``` The last line is very important (see `ReadSchema`). The schema read from the parquet file only required reading column `i` without needing to access column `j`. This is great! We have both an optimized query plan and type-safety! Unfortunately, this syntax is not bulletproof: it fails at run-time if we try to access a non existing column `x`: ```scala mdoc:crash ds.filter($"i" === 10).select($"x".as[Long]) ``` There are two things to improve here. First, we would want to avoid the `as[Long]` casting that we are required to type for type-safety. This is clearly an area where we may introduce a bug by casting to an incompatible type. Second, we want a solution where reference to a non existing column name fails at compilation time. 
The standard Spark Dataset can achieve this using the following syntax. ```scala mdoc ds.filter(_.i == 10).map(_.i).show() ``` This looks great! It reminds us of the familiar syntax from Scala. The two closures in filter and map are functions that operate on `Foo` and the compiler will help us capture all the mistakes we mentioned above. ```scala mdoc:fail ds.filter(_.i == 10).map(_.x).show() ``` Unfortunately, this syntax does not allow Spark to optimize the code. ```scala mdoc ds.filter(_.i == 10).map(_.i).explain() ``` As we see from the explained Physical Plan, Spark was not able to optimize our query as before. Reading the parquet file will require loading all the fields of `Foo`. This might be ok for small datasets or for datasets with few columns, but will be extremely slow for most practical applications. Intuitively, Spark currently does not have a way to look inside the code we pass in these two closures. It only knows that they both take one argument of type `Foo`, but it has no way of knowing if we use just one or all of `Foo`'s fields. The `TypedDataset` in Frameless solves this problem. It allows for a simple and type-safe syntax with a fully optimized query plan. ```scala mdoc import frameless.TypedDataset import frameless.syntax._ val fds = TypedDataset.create(ds) fds.filter(fds('i) === 10).select(fds('i)).show().run() ``` And the optimized Physical Plan: ```scala mdoc fds.filter(fds('i) === 10).select(fds('i)).explain() ``` And the compiler is our friend. ```scala mdoc:fail fds.filter(fds('i) === 10).select(fds('x)) ``` ## Differences in Encoders Encoders in Spark's `Datasets` are partially type-safe.
If you try to create a `Dataset` using a type that is not a Scala `Product` then you get a compilation error: ```scala mdoc class Bar(i: Int) ``` `Bar` is neither a case class nor a `Product`, so the following correctly gives a compilation error in Spark: ```scala mdoc:fail spark.createDataset(Seq(new Bar(1))) ``` However, the compile-time guards implemented in Spark are not sufficient to detect non-encodable members. For example, using the following case class leads to a runtime failure: ```scala mdoc case class MyDate(jday: java.util.Calendar) ``` ```scala mdoc:crash spark.createDataset(Seq(MyDate { val cal = new java.util.GregorianCalendar() cal.setTime(new java.util.Date(System.currentTimeMillis)) cal })) ``` In comparison, a `TypedDataset` will notify about the encoding problem at compile time: ```scala mdoc:fail TypedDataset.create(Seq(MyDate { val cal = new java.util.GregorianCalendar() cal.setTime(new java.util.Date(System.currentTimeMillis)) cal })) ``` ## Aggregate vs Projected columns Spark's `Dataset`s do not distinguish between columns created from aggregate operations, such as summing or averaging, and simple projections/selections. This is problematic when you start mixing the two. ```scala mdoc import org.apache.spark.sql.functions.sum ``` ```scala mdoc:crash ds.select(sum($"i"), $"i"*2) ``` In Frameless, mixing the two results in a compilation error. ```scala mdoc // To avoid confusing frameless' sum with the standard Spark's sum import frameless.functions.aggregate.{sum => fsum} ``` ```scala mdoc:fail fds.select(fsum(fds('i))) ``` As the error suggests, we expected a `TypedColumn` but we got a `TypedAggregate` instead. Here is how you apply an aggregation method in Frameless: ```scala mdoc fds.agg(fsum(fds('i))+22).show().run() ``` Similarly, mixing projections while aggregating does not make sense, and in Frameless you get a compilation error.
```scala mdoc:fail fds.agg(fsum(fds('i)), fds('i)).show().run() ``` ```scala mdoc:invisible org.apache.commons.io.FileUtils.deleteDirectory(new java.io.File("/tmp/foo/")) spark.stop() ``` ================================================ FILE: docs/TypedEncoder.md ================================================ # Typed Encoders in Frameless ```scala mdoc:invisible:reset-object import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession import frameless.functions.aggregate._ val conf = new SparkConf().setMaster("local[*]").setAppName("frameless repl").set("spark.ui.enabled", "false") implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate() System.setProperty("spark.cleaner.ttl", "300") ``` Spark uses Reflection to derive its `Encoder`s, which is why they can fail at run time. For example, because Spark does not support `java.util.Calendar`, the following leads to an error: ```scala mdoc:silent import java.util.Calendar import org.apache.spark.sql.Dataset import spark.implicits._ case class DateRange(s: Calendar, e: Calendar) ``` ```scala mdoc:crash def now = new java.util.GregorianCalendar() val ds: Dataset[DateRange] = Seq(DateRange(now, now)).toDS() ``` As shown by the stack trace, this runtime error goes through [ScalaReflection](https://github.com/apache/spark/blob/19cf208063f035d793d2306295a251a9af7e32f6/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala) to try to derive an `Encoder` for the `Dataset` schema. Besides the annoyance of not detecting this error at compile time, a more important limitation of the reflection-based approach is its inability to be extended for custom types. See this Stack Overflow question for a summary of the current situation (as of 2.0) in vanilla Spark: [How to store custom objects in a Dataset?](http://stackoverflow.com/a/39442829/2311362). Frameless introduces a new type class called `TypedEncoder` to solve these issues.
`TypedEncoder`s are passed around as implicit parameters to every Frameless method to ensure that the data being manipulated is encodable. It uses a standard implicit resolution coupled with shapeless' type class derivation mechanism to ensure that all compiling code manipulates encodable data. For example, the `java.util.Calendar` example won't compile with Frameless: ```scala mdoc:silent import frameless.TypedDataset import frameless.syntax._ ``` ```scala mdoc:fail def now = new java.util.GregorianCalendar() val ds: TypedDataset[DateRange] = TypedDataset.create(Seq(DateRange(now, now))) ``` Type class derivation takes care of recursively constructing (and proving the existence of) `TypedEncoder`s for case classes. The following works as expected: ```scala mdoc case class Bar(d: Double, s: String) case class Foo(i: Int, b: Bar) val ds: TypedDataset[Foo] = TypedDataset.create(Seq(Foo(1, Bar(1.1, "s")))) ds.collect() ``` But any non-encodable member in the case class hierarchy will be detected at compile time: ```scala mdoc:silent case class BarDate(d: Double, s: String, t: java.util.Calendar) case class FooDate(i: Int, b: BarDate) ``` ```scala mdoc:fail val ds: TypedDataset[FooDate] = TypedDataset.create( Seq(FooDate(1, BarDate(1.1, "s", new java.util.GregorianCalendar)))) ``` It should be noted that once derived, reflection-based `Encoder`s and implicitly derived `TypedEncoder`s have identical performance. The derivation mechanism is different, but the objects generated to encode and decode JVM objects in Spark's internal representation behave the same at runtime. ```scala mdoc:invisible spark.stop() ``` ================================================ FILE: docs/TypedML.md ================================================ # Typed Spark ML The `frameless-ml` module provides a strongly typed Spark ML API leveraging `TypedDataset`s. It introduces `TypedTransformer`s and `TypedEstimator`s, the type-safe equivalents of Spark ML's `Transformer` and `Estimator`.
A `TypedEstimator` fits models to data, i.e trains a ML model based on an input `TypedDataset`. A `TypedTransformer` transforms one `TypedDataset` into another, usually by appending column(s) to it. By calling the `fit` method of a `TypedEstimator`, the `TypedEstimator` will train a ML model using the `TypedDataset` passed as input (representing the training data) and will return a `TypedTransformer` that represents the trained model. This `TypedTransformer`can then be used to make predictions on an input `TypedDataset` (representing the test data) using the `transform` method that will return a new `TypedDataset` with appended prediction column(s). Both `TypedEstimator` and `TypedTransformer` check at compile-time the correctness of their inputs field names and types, contrary to Spark ML API which only deals with DataFrames (the data structure with the lowest level of type-safety in Spark). `frameless-ml` adds type-safety to Spark ML API but stays very close to it in terms of abstractions and API calls, so please check [Spark ML documentation](https://spark.apache.org/docs/2.2.0/ml-pipeline.html) for more details on `Transformer`s and `Estimator`s. ```scala mdoc:invisible:reset-object import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession val conf = new SparkConf().setMaster("local[*]").setAppName("Frameless repl").set("spark.ui.enabled", "false") implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate() spark.sparkContext.setLogLevel("WARN") import spark.implicits._ ``` ## Example 1: predict a continuous value using a `TypedRandomForestRegressor` In this example, we want to predict the sale price of a house depending on its square footage and the fact that the house has a garden or not. We will use a `TypedRandomForestRegressor`. 
### Training As with the Spark ML API, we use a `TypedVectorAssembler` (the type-safe equivalent of `VectorAssembler`) to compute feature vectors: ```scala mdoc:silent import frameless._ import frameless.syntax._ import frameless.ml._ import frameless.ml.feature._ import frameless.ml.regression._ import org.apache.spark.ml.linalg.Vector ``` ```scala mdoc case class HouseData(squareFeet: Double, hasGarden: Boolean, price: Double) val trainingData = TypedDataset.create(Seq( HouseData(20, false, 100000), HouseData(50, false, 200000), HouseData(50, true, 250000), HouseData(100, true, 500000) )) case class Features(squareFeet: Double, hasGarden: Boolean) val assembler = TypedVectorAssembler[Features] case class HouseDataWithFeatures(squareFeet: Double, hasGarden: Boolean, price: Double, features: Vector) val trainingDataWithFeatures = assembler.transform(trainingData).as[HouseDataWithFeatures] ``` In the above code snippet, `.as[HouseDataWithFeatures]` is a `TypedDataset`'s type-safe cast (see [TypedDataset: Feature Overview](https://typelevel.org/frameless/FeatureOverview.html)): ```scala mdoc:silent case class WrongHouseFeatures( squareFeet: Double, hasGarden: Int, // hasGarden has wrong type price: Double, features: Vector ) ``` ```scala mdoc:fail assembler.transform(trainingData).as[WrongHouseFeatures] ``` Moreover, `TypedVectorAssembler[Features]` will compile only if `Features` contains exclusively fields of type Numeric or Boolean: ```scala mdoc:silent case class WrongFeatures(squareFeet: Double, hasGarden: Boolean, city: String) ``` ```scala mdoc:fail TypedVectorAssembler[WrongFeatures] ``` The subsequent call `assembler.transform(trainingData)` compiles only if `trainingData` contains all fields (names and types) of `Features`: ```scala mdoc case class WrongHouseData(squareFeet: Double, price: Double) // hasGarden is missing val wrongTrainingData = TypedDataset.create(Seq(WrongHouseData(20, 100000))) ``` ```scala mdoc:fail assembler.transform(wrongTrainingData) 
``` Then, we train the model. To train a Random Forest, one needs to feed it with features (what we predict from) and with a label (what we predict). In our example, `price` is the label, `features` are the features: ```scala mdoc case class RFInputs(price: Double, features: Vector) val rf = TypedRandomForestRegressor[RFInputs] val model = rf.fit(trainingDataWithFeatures).run() ``` `TypedRandomForestRegressor[RFInputs]` compiles only if `RFInputs` contains only one field of type Double (the label) and one field of type Vector (the features): ```scala mdoc:silent case class WrongRFInputs(labelOfWrongType: String, features: Vector) ``` ```scala mdoc:fail TypedRandomForestRegressor[WrongRFInputs] ``` The subsequent `rf.fit(trainingDataWithFeatures)` call compiles only if `trainingDataWithFeatures` contains the same fields (names and types) as RFInputs. ```scala mdoc val wrongTrainingDataWithFeatures = TypedDataset.create(Seq(HouseData(20, false, 100000))) // features are missing ``` ```scala mdoc:fail rf.fit(wrongTrainingDataWithFeatures) ``` ### Prediction We now want to predict `price` for `testData` using the previously trained model. Like the Spark ML API, `testData` has a default value for `price` (`0` in our case) that will be ignored at prediction time. We reuse our `assembler` to compute the feature vector of `testData`. 
```scala mdoc val testData = TypedDataset.create(Seq(HouseData(70, true, 0))) val testDataWithFeatures = assembler.transform(testData).as[HouseDataWithFeatures] case class HousePricePrediction( squareFeet: Double, hasGarden: Boolean, price: Double, features: Vector, predictedPrice: Double ) val predictions = model.transform(testDataWithFeatures).as[HousePricePrediction] predictions.select(predictions.col('predictedPrice)).collect.run() ``` `model.transform(testDataWithFeatures)` will only compile if `testDataWithFeatures` contains a field `price` of type Double and a field `features` of type Vector: ```scala mdoc:fail model.transform(testData) ``` ```scala mdoc:invisible spark.stop() ``` ## Example 2: predict a categorical value using a `TypedRandomForestClassifier` ```scala mdoc:invisible:reset-object import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession val conf = new SparkConf().setMaster("local[*]").setAppName("Frameless repl").set("spark.ui.enabled", "false") implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate() spark.sparkContext.setLogLevel("WARN") import spark.implicits._ import frameless._ import frameless.syntax._ import frameless.ml._ import frameless.ml.feature._ import frameless.ml.regression._ import org.apache.spark.ml.linalg.Vector ``` In this example, we want to predict in which city a house is located depending on its price and its square footage. We use a `TypedRandomForestClassifier`. 
### Training As with the Spark ML API, we use a `TypedVectorAssembler` to compute feature vectors and a `TypedStringIndexer` to index `city` values in order to be able to pass them to a `TypedRandomForestClassifier` (which only accepts Double values as label): ```scala mdoc:silent import frameless.ml.classification._ ``` ```scala mdoc case class HouseData(squareFeet: Double, city: String, price: Double) val trainingData = TypedDataset.create(Seq( HouseData(100, "lyon", 100000), HouseData(200, "lyon", 200000), HouseData(100, "san francisco", 500000), HouseData(150, "san francisco", 900000) )) case class Features(price: Double, squareFeet: Double) val vectorAssembler = TypedVectorAssembler[Features] case class HouseDataWithFeatures(squareFeet: Double, city: String, price: Double, features: Vector) val dataWithFeatures = vectorAssembler.transform(trainingData).as[HouseDataWithFeatures] case class StringIndexerInput(city: String) val indexer = TypedStringIndexer[StringIndexerInput] indexer.estimator.setHandleInvalid("keep") val indexerModel = indexer.fit(dataWithFeatures).run() case class HouseDataWithFeaturesAndIndex( squareFeet: Double, city: String, price: Double, features: Vector, cityIndexed: Double ) val indexedData = indexerModel.transform(dataWithFeatures).as[HouseDataWithFeaturesAndIndex] ``` Then, we train the model: ```scala mdoc case class RFInputs(cityIndexed: Double, features: Vector) val rf = TypedRandomForestClassifier[RFInputs] val model = rf.fit(indexedData).run() ``` ### Prediction We now want to predict `city` for `testData` using the previously trained model. Like the Spark ML API, `testData` has a default value for `city` (empty string in our case) that will be ignored at prediction time. We reuse our `vectorAssembler` to compute the feature vector of `testData` and our `indexerModel` to index `city`. 
```scala mdoc val testData = TypedDataset.create(Seq(HouseData(120, "", 800000))) val testDataWithFeatures = vectorAssembler.transform(testData).as[HouseDataWithFeatures] val indexedTestData = indexerModel.transform(testDataWithFeatures).as[HouseDataWithFeaturesAndIndex] case class HouseCityPredictionInputs(features: Vector, cityIndexed: Double) val testInput = indexedTestData.project[HouseCityPredictionInputs] case class HouseCityPredictionIndexed( features: Vector, cityIndexed: Double, rawPrediction: Vector, probability: Vector, predictedCityIndexed: Double ) val indexedPredictions = model.transform(testInput).as[HouseCityPredictionIndexed] ``` Then, we use a `TypedIndexToString` to get back a String value from `predictedCityIndexed`. `TypedIndexToString` takes as input the label array computed by our previous `indexerModel`: ```scala mdoc case class IndexToStringInput(predictedCityIndexed: Double) val indexToString = TypedIndexToString[IndexToStringInput](indexerModel.transformer.labels) case class HouseCityPrediction( features: Vector, cityIndexed: Double, rawPrediction: Vector, probability: Vector, predictedCityIndexed: Double, predictedCity: String ) val predictions = indexToString.transform(indexedPredictions).as[HouseCityPrediction] predictions.select(predictions.col('predictedCity)).collect.run() ``` ## List of currently implemented `TypedEstimator`s * `TypedRandomForestClassifier` * `TypedRandomForestRegressor` * ... [your contribution here](https://github.com/typelevel/frameless/issues/215) ... :) ## List of currently implemented `TypedTransformer`s * `TypedIndexToString` * `TypedStringIndexer` * `TypedVectorAssembler` * ... [your contribution here](https://github.com/typelevel/frameless/issues/215) ... 
:) ## Using Vector and Matrix with `TypedDataset` `frameless-ml` provides `TypedEncoder` instances for `org.apache.spark.ml.linalg.Vector` and `org.apache.spark.ml.linalg.Matrix`: ```scala mdoc:silent import frameless._ import frameless.ml._ import org.apache.spark.ml.linalg._ ``` ```scala mdoc val vector = Vectors.dense(1, 2, 3) val vectorDs = TypedDataset.create(Seq("label" -> vector)) val matrix = Matrices.dense(2, 1, Array(1, 2)) val matrixDs = TypedDataset.create(Seq("label" -> matrix)) ``` Under the hood, Vector and Matrix are encoded using `org.apache.spark.ml.linalg.VectorUDT` and `org.apache.spark.ml.linalg.MatrixUDT`. This is possible thanks to the implicit derivation from `org.apache.spark.sql.types.UserDefinedType[A]` to `TypedEncoder[A]` defined in `TypedEncoder` companion object. ```scala mdoc:invisible spark.stop() ``` ================================================ FILE: docs/WorkingWithCsvParquetJson.md ================================================ # Working with CSV and Parquet data ```scala mdoc:invisible:reset-object import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession val conf = new SparkConf().setMaster("local[*]").setAppName("Frameless repl").set("spark.ui.enabled", "false") implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate() spark.sparkContext.setLogLevel("WARN") import spark.implicits._ val testDataPath: String = "docs/iris.data" ``` You need these imports for most Frameless projects. ```scala mdoc:silent import frameless._ import frameless.syntax._ import frameless.functions.aggregate._ ``` ## Working with CSV We first load some CSV data and print the schema. ```scala mdoc val df = spark.read.format("csv").load(testDataPath) df.show(2) df.printSchema ``` The easiest way to read from CSV into a `TypedDataset` is to create a case class that follows the exact number, type, and order for the fields as they appear in the CSV file. 
This is shown in the example below with the use of the `Iris` case class. ```scala mdoc final case class Iris(sLength: Double, sWidth: Double, pLength: Double, pWidth: Double, kind: String) val testDataDf = spark.read.format("csv").schema(TypedExpressionEncoder[Iris].schema).load(testDataPath) val data: TypedDataset[Iris] = TypedDataset.createUnsafe[Iris](testDataDf) data.show(2).run() ``` If we do not explicitly define the schema of the CSV file then the types will not match, leading to runtime errors. ```scala mdoc:nest val testDataNoSchema = spark.read.format("csv").load(testDataPath) val data: TypedDataset[Iris] = TypedDataset.createUnsafe[Iris](testDataNoSchema) ``` ```scala mdoc:crash data.collect().run() ``` ### Dealing with CSV files with multiple columns When the dataset has many columns, it is impractical to define a case class that contains many columns we don't need. In such a case, we can project the columns we do need, cast them to the proper type, and then call `createUnsafe` using a case class that contains a much smaller subset of the columns.
```scala mdoc:nest import org.apache.spark.sql.types.DoubleType final case class IrisLight(kind: String, sLength: Double) val testDataDf = spark.read.format("csv").load(testDataPath) val projectedDf = testDataDf.select(testDataDf("_c4").as("kind"), testDataDf("_c1").cast(DoubleType).as("sLength")) val data = TypedDataset.createUnsafe[IrisLight](projectedDf) data.take(2).run() ``` ```scala mdoc:invisible spark.stop() ``` ## Working with Parquet ```scala mdoc:invisible:reset-object import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession val conf = new SparkConf().setMaster("local[*]").setAppName("Frameless repl").set("spark.ui.enabled", "false") implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate() spark.sparkContext.setLogLevel("WARN") import spark.implicits._ val testDataPathParquet: String = "docs/iris.parquet" import frameless._ import frameless.syntax._ import frameless.functions.aggregate._ final case class Iris(sLength: Double, sWidth: Double, pLength: Double, pWidth: Double, kind: String) ``` Spark is much better at reading the schema from parquet files. ```scala mdoc val testDataParquet = spark.read.format("parquet").load(testDataPathParquet) testDataParquet.printSchema ``` So as long as we use a type (case class) that reflects the same number, type, and order of the fields from the data everything works as expected. ```scala mdoc:nest val data: TypedDataset[Iris] = TypedDataset.createUnsafe[Iris](testDataParquet) data.take(2).run() ``` ### Dealing with Parquet files with multiple columns The main difference compared to CSV is that with Parquet Spark is better at inferring the types. This makes it simpler to project the columns we need without having to cast them to the proper type.
```scala mdoc:nest final case class IrisLight(kind: String, sLength: Double) val projectedDf = testDataParquet.select("kind", "sLength") val data = TypedDataset.createUnsafe[IrisLight](projectedDf) data.take(2).run() ``` ```scala mdoc:invisible spark.stop() ``` ================================================ FILE: docs/directory.conf ================================================ laika.title = frameless laika.navigationOrder = [ README.md FeatureOverview.md TypedDatasetVsSparkDataset.md WorkingWithCsvParquetJson.md Injection.md Job.md Cats.md TypedML.md TypedDataFrame.md ] ================================================ FILE: docs/iris.data ================================================ 5.1,3.5,1.4,0.2,Iris-setosa 4.9,3.0,1.4,0.2,Iris-setosa 4.7,3.2,1.3,0.2,Iris-setosa 4.6,3.1,1.5,0.2,Iris-setosa 5.0,3.6,1.4,0.2,Iris-setosa 5.4,3.9,1.7,0.4,Iris-setosa 4.6,3.4,1.4,0.3,Iris-setosa 5.0,3.4,1.5,0.2,Iris-setosa 4.4,2.9,1.4,0.2,Iris-setosa 4.9,3.1,1.5,0.1,Iris-setosa 5.4,3.7,1.5,0.2,Iris-setosa 4.8,3.4,1.6,0.2,Iris-setosa 4.8,3.0,1.4,0.1,Iris-setosa 4.3,3.0,1.1,0.1,Iris-setosa 5.8,4.0,1.2,0.2,Iris-setosa 5.7,4.4,1.5,0.4,Iris-setosa 5.4,3.9,1.3,0.4,Iris-setosa 5.1,3.5,1.4,0.3,Iris-setosa 5.7,3.8,1.7,0.3,Iris-setosa 5.1,3.8,1.5,0.3,Iris-setosa 5.4,3.4,1.7,0.2,Iris-setosa 5.1,3.7,1.5,0.4,Iris-setosa 4.6,3.6,1.0,0.2,Iris-setosa 5.1,3.3,1.7,0.5,Iris-setosa 4.8,3.4,1.9,0.2,Iris-setosa 5.0,3.0,1.6,0.2,Iris-setosa 5.0,3.4,1.6,0.4,Iris-setosa 5.2,3.5,1.5,0.2,Iris-setosa 5.2,3.4,1.4,0.2,Iris-setosa 4.7,3.2,1.6,0.2,Iris-setosa 4.8,3.1,1.6,0.2,Iris-setosa 5.4,3.4,1.5,0.4,Iris-setosa 5.2,4.1,1.5,0.1,Iris-setosa 5.5,4.2,1.4,0.2,Iris-setosa 4.9,3.1,1.5,0.1,Iris-setosa 5.0,3.2,1.2,0.2,Iris-setosa 5.5,3.5,1.3,0.2,Iris-setosa 4.9,3.1,1.5,0.1,Iris-setosa 4.4,3.0,1.3,0.2,Iris-setosa 5.1,3.4,1.5,0.2,Iris-setosa 5.0,3.5,1.3,0.3,Iris-setosa 4.5,2.3,1.3,0.3,Iris-setosa 4.4,3.2,1.3,0.2,Iris-setosa 5.0,3.5,1.6,0.6,Iris-setosa 5.1,3.8,1.9,0.4,Iris-setosa 4.8,3.0,1.4,0.3,Iris-setosa 
5.1,3.8,1.6,0.2,Iris-setosa 4.6,3.2,1.4,0.2,Iris-setosa 5.3,3.7,1.5,0.2,Iris-setosa 5.0,3.3,1.4,0.2,Iris-setosa 7.0,3.2,4.7,1.4,Iris-versicolor 6.4,3.2,4.5,1.5,Iris-versicolor 6.9,3.1,4.9,1.5,Iris-versicolor 5.5,2.3,4.0,1.3,Iris-versicolor 6.5,2.8,4.6,1.5,Iris-versicolor 5.7,2.8,4.5,1.3,Iris-versicolor 6.3,3.3,4.7,1.6,Iris-versicolor 4.9,2.4,3.3,1.0,Iris-versicolor 6.6,2.9,4.6,1.3,Iris-versicolor 5.2,2.7,3.9,1.4,Iris-versicolor 5.0,2.0,3.5,1.0,Iris-versicolor 5.9,3.0,4.2,1.5,Iris-versicolor 6.0,2.2,4.0,1.0,Iris-versicolor 6.1,2.9,4.7,1.4,Iris-versicolor 5.6,2.9,3.6,1.3,Iris-versicolor 6.7,3.1,4.4,1.4,Iris-versicolor 5.6,3.0,4.5,1.5,Iris-versicolor 5.8,2.7,4.1,1.0,Iris-versicolor 6.2,2.2,4.5,1.5,Iris-versicolor 5.6,2.5,3.9,1.1,Iris-versicolor 5.9,3.2,4.8,1.8,Iris-versicolor 6.1,2.8,4.0,1.3,Iris-versicolor 6.3,2.5,4.9,1.5,Iris-versicolor 6.1,2.8,4.7,1.2,Iris-versicolor 6.4,2.9,4.3,1.3,Iris-versicolor 6.6,3.0,4.4,1.4,Iris-versicolor 6.8,2.8,4.8,1.4,Iris-versicolor 6.7,3.0,5.0,1.7,Iris-versicolor 6.0,2.9,4.5,1.5,Iris-versicolor 5.7,2.6,3.5,1.0,Iris-versicolor 5.5,2.4,3.8,1.1,Iris-versicolor 5.5,2.4,3.7,1.0,Iris-versicolor 5.8,2.7,3.9,1.2,Iris-versicolor 6.0,2.7,5.1,1.6,Iris-versicolor 5.4,3.0,4.5,1.5,Iris-versicolor 6.0,3.4,4.5,1.6,Iris-versicolor 6.7,3.1,4.7,1.5,Iris-versicolor 6.3,2.3,4.4,1.3,Iris-versicolor 5.6,3.0,4.1,1.3,Iris-versicolor 5.5,2.5,4.0,1.3,Iris-versicolor 5.5,2.6,4.4,1.2,Iris-versicolor 6.1,3.0,4.6,1.4,Iris-versicolor 5.8,2.6,4.0,1.2,Iris-versicolor 5.0,2.3,3.3,1.0,Iris-versicolor 5.6,2.7,4.2,1.3,Iris-versicolor 5.7,3.0,4.2,1.2,Iris-versicolor 5.7,2.9,4.2,1.3,Iris-versicolor 6.2,2.9,4.3,1.3,Iris-versicolor 5.1,2.5,3.0,1.1,Iris-versicolor 5.7,2.8,4.1,1.3,Iris-versicolor 6.3,3.3,6.0,2.5,Iris-virginica 5.8,2.7,5.1,1.9,Iris-virginica 7.1,3.0,5.9,2.1,Iris-virginica 6.3,2.9,5.6,1.8,Iris-virginica 6.5,3.0,5.8,2.2,Iris-virginica 7.6,3.0,6.6,2.1,Iris-virginica 4.9,2.5,4.5,1.7,Iris-virginica 7.3,2.9,6.3,1.8,Iris-virginica 6.7,2.5,5.8,1.8,Iris-virginica 
7.2,3.6,6.1,2.5,Iris-virginica 6.5,3.2,5.1,2.0,Iris-virginica 6.4,2.7,5.3,1.9,Iris-virginica 6.8,3.0,5.5,2.1,Iris-virginica 5.7,2.5,5.0,2.0,Iris-virginica 5.8,2.8,5.1,2.4,Iris-virginica 6.4,3.2,5.3,2.3,Iris-virginica 6.5,3.0,5.5,1.8,Iris-virginica 7.7,3.8,6.7,2.2,Iris-virginica 7.7,2.6,6.9,2.3,Iris-virginica 6.0,2.2,5.0,1.5,Iris-virginica 6.9,3.2,5.7,2.3,Iris-virginica 5.6,2.8,4.9,2.0,Iris-virginica 7.7,2.8,6.7,2.0,Iris-virginica 6.3,2.7,4.9,1.8,Iris-virginica 6.7,3.3,5.7,2.1,Iris-virginica 7.2,3.2,6.0,1.8,Iris-virginica 6.2,2.8,4.8,1.8,Iris-virginica 6.1,3.0,4.9,1.8,Iris-virginica 6.4,2.8,5.6,2.1,Iris-virginica 7.2,3.0,5.8,1.6,Iris-virginica 7.4,2.8,6.1,1.9,Iris-virginica 7.9,3.8,6.4,2.0,Iris-virginica 6.4,2.8,5.6,2.2,Iris-virginica 6.3,2.8,5.1,1.5,Iris-virginica 6.1,2.6,5.6,1.4,Iris-virginica 7.7,3.0,6.1,2.3,Iris-virginica 6.3,3.4,5.6,2.4,Iris-virginica 6.4,3.1,5.5,1.8,Iris-virginica 6.0,3.0,4.8,1.8,Iris-virginica 6.9,3.1,5.4,2.1,Iris-virginica 6.7,3.1,5.6,2.4,Iris-virginica 6.9,3.1,5.1,2.3,Iris-virginica 5.8,2.7,5.1,1.9,Iris-virginica 6.8,3.2,5.9,2.3,Iris-virginica 6.7,3.3,5.7,2.5,Iris-virginica 6.7,3.0,5.2,2.3,Iris-virginica 6.3,2.5,5.0,1.9,Iris-virginica 6.5,3.0,5.2,2.0,Iris-virginica 6.2,3.4,5.4,2.3,Iris-virginica 5.9,3.0,5.1,1.8,Iris-virginica ================================================ FILE: github.sbt ================================================ ThisBuild / githubWorkflowArtifactUpload := false // doesn't work with scoverage ThisBuild / githubWorkflowEnv += "SPARK_LOCAL_IP" -> "localhost" ThisBuild / githubWorkflowArtifactDownloadExtraKeys += "project" ThisBuild / githubWorkflowBuildSbtStepPreamble += s"project $${{ matrix.project }}" ThisBuild / tlCiScalafmtCheck := true ThisBuild / githubWorkflowBuild ~= { steps => steps.map { // replace the test step case step: WorkflowStep.Sbt if step.commands == List("test") => WorkflowStep.Sbt( commands = List("coverage", "test", "test/coverageReport"), name = Some("Test & Compute Coverage") ) case step => 
step } }

ThisBuild / githubWorkflowBuildPostamble += WorkflowStep.Use(
  UseRef.Public(
    "codecov",
    "codecov-action",
    "v3"
  ),
  params = Map("flags" -> s"$${{ matrix.scala }}-$${{ matrix.project }}")
)

================================================
FILE: ml/src/main/scala/frameless/ml/TypedEstimator.scala
================================================
package frameless
package ml

import frameless.ops.SmartProject
import org.apache.spark.ml.{Estimator, Model}

/**
 * A TypedEstimator fits models to data.
 *
 * @tparam Inputs  type the training dataset is projected onto before fitting
 * @tparam Outputs type of the columns appended by the fitted transformer
 * @tparam M       concrete Spark model type produced by `estimator`
 */
trait TypedEstimator[Inputs, Outputs, M <: Model[M]] {
  /** The underlying Spark estimator that `fit` delegates to. */
  val estimator: Estimator[M]

  /**
   * Projects `ds` onto `Inputs`, fits `estimator` on the projected dataset and wraps the
   * resulting model in an `AppendTransformer`. The whole computation is handed to `F.delay`.
   *
   * @param ds           dataset containing (at least) the fields of `Inputs`
   * @param smartProject evidence used to project `T` onto `Inputs`
   * @param F            effect in which the fitting is wrapped
   * @return the fitted model, wrapped as a typed transformer, inside `F`
   */
  def fit[T, F[_]](ds: TypedDataset[T])(
    implicit
    smartProject: SmartProject[T, Inputs],
    F: SparkDelay[F]
  ): F[AppendTransformer[Inputs, Outputs, M]] = {
    // Implicit definitions carry an explicit type: inference of implicit types is fragile and
    // is rejected by Scala 3. Presumably consumed by `F.delay` below.
    implicit val sparkSession: org.apache.spark.sql.SparkSession = ds.dataset.sparkSession

    F.delay {
      val inputDs = smartProject.apply(ds)
      val model = estimator.fit(inputDs.dataset)

      new AppendTransformer[Inputs, Outputs, M] {
        val transformer: M = model
      }
    }
  }
}

================================================
FILE: ml/src/main/scala/frameless/ml/TypedTransformer.scala
================================================
package frameless
package ml

import frameless.ops.SmartProject
import org.apache.spark.ml.Transformer
import shapeless.{Generic, HList}
import shapeless.ops.hlist.{Prepend, Tupler}

/**
 * A TypedTransformer transforms one TypedDataset into another.
 */
sealed trait TypedTransformer

/**
 * An AppendTransformer `transform` method takes as input a TypedDataset containing `Inputs` and
 * returns a TypedDataset with `Outputs` columns appended to the input TypedDataset.
*/ trait AppendTransformer[Inputs, Outputs, InnerTransformer <: Transformer] extends TypedTransformer { val transformer: InnerTransformer def transform[T, TVals <: HList, OutputsVals <: HList, OutVals <: HList, Out](ds: TypedDataset[T])( implicit i0: SmartProject[T, Inputs], i1: Generic.Aux[T, TVals], i2: Generic.Aux[Outputs, OutputsVals], i3: Prepend.Aux[TVals, OutputsVals, OutVals], i4: Tupler.Aux[OutVals, Out], i5: TypedEncoder[Out] ): TypedDataset[Out] = { val transformed = transformer.transform(ds.dataset).as[Out](TypedExpressionEncoder[Out]) TypedDataset.create[Out](transformed) } } object AppendTransformer { // Random name to a temp column added by a TypedTransformer (the proper name will be given by the Tuple-based encoder) private[ml] val tempColumnName = "I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMI" private[ml] val tempColumnName2 = "I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMJ" private[ml] val tempColumnName3 = "I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMK" } ================================================ FILE: ml/src/main/scala/frameless/ml/classification/TypedRandomForestClassifier.scala ================================================ package frameless package ml package classification import frameless.ml.internals.TreesInputsChecker import frameless.ml.params.trees.FeatureSubsetStrategy import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} import org.apache.spark.ml.linalg.Vector /** * Random Forest learning algorithm for * classification. * It supports both binary and multiclass labels, as well as both continuous and categorical * features. 
*/ final class TypedRandomForestClassifier[Inputs] private[ml]( rf: RandomForestClassifier, labelCol: String, featuresCol: String ) extends TypedEstimator[Inputs, TypedRandomForestClassifier.Outputs, RandomForestClassificationModel] { val estimator: RandomForestClassifier = rf .setLabelCol(labelCol) .setFeaturesCol(featuresCol) .setPredictionCol(AppendTransformer.tempColumnName) .setRawPredictionCol(AppendTransformer.tempColumnName2) .setProbabilityCol(AppendTransformer.tempColumnName3) def setNumTrees(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setNumTrees(value)) def setMaxDepth(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setMaxDepth(value)) def setMinInfoGain(value: Double): TypedRandomForestClassifier[Inputs] = copy(rf.setMinInfoGain(value)) def setMinInstancesPerNode(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setMinInstancesPerNode(value)) def setMaxMemoryInMB(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setMaxMemoryInMB(value)) def setSubsamplingRate(value: Double): TypedRandomForestClassifier[Inputs] = copy(rf.setSubsamplingRate(value)) def setFeatureSubsetStrategy(value: FeatureSubsetStrategy): TypedRandomForestClassifier[Inputs] = copy(rf.setFeatureSubsetStrategy(value.sparkValue)) def setMaxBins(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setMaxBins(value)) private def copy(newRf: RandomForestClassifier): TypedRandomForestClassifier[Inputs] = new TypedRandomForestClassifier[Inputs](newRf, labelCol, featuresCol) } object TypedRandomForestClassifier { case class Outputs(rawPrediction: Vector, probability: Vector, prediction: Double) def apply[Inputs](implicit inputsChecker: TreesInputsChecker[Inputs]): TypedRandomForestClassifier[Inputs] = { new TypedRandomForestClassifier(new RandomForestClassifier(), inputsChecker.labelCol, inputsChecker.featuresCol) } } ================================================ FILE: ml/src/main/scala/frameless/ml/clustering/TypedBisectingKMeans.scala 
================================================
package frameless
package ml
package classification

import frameless.ml.internals.VectorInputsChecker
import org.apache.spark.ml.clustering.{BisectingKMeans, BisectingKMeansModel}

/**
 * A bisecting k-means algorithm based on the paper "A comparison of document clustering techniques"
 * by Steinbach, Karypis, and Kumar, with modification to fit Spark.
 * The algorithm starts from a single cluster that contains all points.
 * Iteratively it finds divisible clusters on the bottom level and bisects each of them using
 * k-means, until there are `k` leaf clusters in total or no leaf clusters are divisible.
 * The bisecting steps of clusters on the same level are grouped together to increase parallelism.
 * If bisecting all divisible clusters on the bottom level would result in more than `k` leaf
 * clusters, larger clusters get higher priority.
 *
 * @see
 * Steinbach, Karypis, and Kumar, A comparison of document clustering techniques,
 * KDD Workshop on Text Mining, 2000.
 */
class TypedBisectingKMeans[Inputs] private[ml] (
  bkm: BisectingKMeans,
  featuresCol: String
) extends TypedEstimator[Inputs,TypedBisectingKMeans.Output, BisectingKMeansModel]{

  /** The wrapped Spark estimator, wired to the features column and the temporary prediction column. */
  val estimator: BisectingKMeans =
    bkm
      .setFeaturesCol(featuresCol)
      .setPredictionCol(AppendTransformer.tempColumnName)

  // Each setter returns a new typed estimator and leaves the receiver untouched.
  def setK(value: Int): TypedBisectingKMeans[Inputs] = copy(detached.setK(value))

  def setMaxIter(value: Int): TypedBisectingKMeans[Inputs] = copy(detached.setMaxIter(value))

  def setMinDivisibleClusterSize(value: Double): TypedBisectingKMeans[Inputs] =
    copy(detached.setMinDivisibleClusterSize(value))

  def setSeed(value: Long): TypedBisectingKMeans[Inputs] = copy(detached.setSeed(value))

  // Spark's param setters mutate the estimator in place and return it. Setters therefore work
  // on a private copy so that wrappers derived from this one do not share a single estimator.
  private def detached: BisectingKMeans =
    bkm.copy(org.apache.spark.ml.param.ParamMap.empty)

  private def copy(newBkm: BisectingKMeans): TypedBisectingKMeans[Inputs] =
    new TypedBisectingKMeans[Inputs](newBkm, featuresCol)
}

object TypedBisectingKMeans {
  /** Column appended by the fitted model. */
  case class Output(prediction: Int)

  /** Builds a clusterer with Spark defaults; the features column comes from the `Inputs` evidence. */
  def apply[Inputs]()(implicit inputsChecker: VectorInputsChecker[Inputs]): TypedBisectingKMeans[Inputs] =
    new TypedBisectingKMeans(new BisectingKMeans(), inputsChecker.featuresCol)
}

================================================
FILE: ml/src/main/scala/frameless/ml/clustering/TypedKMeans.scala
================================================
package frameless
package ml
package classification

import frameless.ml.internals.VectorInputsChecker
import frameless.ml.params.kmeans.KMeansInitMode
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}

/**
 * K-means clustering with support for k-means|| initialization proposed by Bahmani et al.
 *
 * @see Bahmani et al., Scalable k-means++.
*/ class TypedKMeans[Inputs] private[ml] ( km: KMeans, featuresCol: String ) extends TypedEstimator[Inputs,TypedKMeans.Output,KMeansModel] { val estimator: KMeans = km .setFeaturesCol(featuresCol) .setPredictionCol(AppendTransformer.tempColumnName) def setK(value: Int): TypedKMeans[Inputs] = copy(km.setK(value)) def setInitMode(value: KMeansInitMode): TypedKMeans[Inputs] = copy(km.setInitMode(value.sparkValue)) def setInitSteps(value: Int): TypedKMeans[Inputs] = copy(km.setInitSteps(value)) def setMaxIter(value: Int): TypedKMeans[Inputs] = copy(km.setMaxIter(value)) def setTol(value: Double): TypedKMeans[Inputs] = copy(km.setTol(value)) def setSeed(value: Long): TypedKMeans[Inputs] = copy(km.setSeed(value)) private def copy(newKmeans: KMeans): TypedKMeans[Inputs] = new TypedKMeans[Inputs](newKmeans, featuresCol) } object TypedKMeans{ case class Output(prediction: Int) def apply[Inputs](implicit inputsChecker: VectorInputsChecker[Inputs]): TypedKMeans[Inputs] = { new TypedKMeans(new KMeans(), inputsChecker.featuresCol) } } ================================================ FILE: ml/src/main/scala/frameless/ml/feature/TypedIndexToString.scala ================================================ package frameless package ml package feature import frameless.ml.internals.UnaryInputsChecker import org.apache.spark.ml.feature.IndexToString /** * A `TypedTransformer` that maps a column of indices back to a new column of corresponding * string values. * The index-string mapping must be supplied when creating the `TypedIndexToString`. 
* * @see `TypedStringIndexer` for converting strings into indices */ final class TypedIndexToString[Inputs] private[ml](indexToString: IndexToString, inputCol: String) extends AppendTransformer[Inputs, TypedIndexToString.Outputs, IndexToString] { val transformer: IndexToString = indexToString .setInputCol(inputCol) .setOutputCol(AppendTransformer.tempColumnName) } object TypedIndexToString { case class Outputs(originalOutput: String) def apply[Inputs](labels: Array[String]) (implicit inputsChecker: UnaryInputsChecker[Inputs, Double]): TypedIndexToString[Inputs] = { new TypedIndexToString[Inputs](new IndexToString().setLabels(labels), inputsChecker.inputCol) } } ================================================ FILE: ml/src/main/scala/frameless/ml/feature/TypedStringIndexer.scala ================================================ package frameless package ml package feature import frameless.ml.feature.TypedStringIndexer.HandleInvalid import frameless.ml.internals.UnaryInputsChecker import org.apache.spark.ml.feature.{StringIndexer, StringIndexerModel} /** * A label indexer that maps a string column of labels to an ML column of label indices. * The indices are in [0, numLabels), ordered by label frequencies. * So the most frequent label gets index 0. 
* * @see `TypedIndexToString` for the inverse transformation */ final class TypedStringIndexer[Inputs] private[ml](stringIndexer: StringIndexer, inputCol: String) extends TypedEstimator[Inputs, TypedStringIndexer.Outputs, StringIndexerModel] { val estimator: StringIndexer = stringIndexer .setInputCol(inputCol) .setOutputCol(AppendTransformer.tempColumnName) def setHandleInvalid(value: HandleInvalid): TypedStringIndexer[Inputs] = copy(stringIndexer.setHandleInvalid(value.sparkValue)) private def copy(newStringIndexer: StringIndexer): TypedStringIndexer[Inputs] = new TypedStringIndexer[Inputs](newStringIndexer, inputCol) } object TypedStringIndexer { case class Outputs(indexedOutput: Double) sealed abstract class HandleInvalid(val sparkValue: String) object HandleInvalid { case object Error extends HandleInvalid("error") case object Skip extends HandleInvalid("skip") case object Keep extends HandleInvalid("keep") } def apply[Inputs](implicit inputsChecker: UnaryInputsChecker[Inputs, String]): TypedStringIndexer[Inputs] = { new TypedStringIndexer[Inputs](new StringIndexer(), inputsChecker.inputCol) } } ================================================ FILE: ml/src/main/scala/frameless/ml/feature/TypedVectorAssembler.scala ================================================ package frameless package ml package feature import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.Vector import shapeless.{HList, HNil, LabelledGeneric} import shapeless.ops.hlist.ToTraversable import shapeless.ops.record.{Keys, Values} import shapeless._ import scala.annotation.implicitNotFound /** * A feature transformer that merges multiple columns into a vector column. 
*/ final class TypedVectorAssembler[Inputs] private[ml](vectorAssembler: VectorAssembler, inputCols: Array[String]) extends AppendTransformer[Inputs, TypedVectorAssembler.Output, VectorAssembler] { val transformer: VectorAssembler = vectorAssembler .setInputCols(inputCols) .setOutputCol(AppendTransformer.tempColumnName) } object TypedVectorAssembler { case class Output(vector: Vector) def apply[Inputs](implicit inputsChecker: TypedVectorAssemblerInputsChecker[Inputs]): TypedVectorAssembler[Inputs] = { new TypedVectorAssembler(new VectorAssembler(), inputsChecker.inputCols.toArray) } } @implicitNotFound( msg = "Cannot prove that ${Inputs} is a valid input type. Input type must only contain fields of numeric or boolean types." ) private[ml] trait TypedVectorAssemblerInputsChecker[Inputs] { val inputCols: Seq[String] } private[ml] object TypedVectorAssemblerInputsChecker { implicit def checkInputs[Inputs, InputsRec <: HList, InputsKeys <: HList, InputsVals <: HList]( implicit inputsGen: LabelledGeneric.Aux[Inputs, InputsRec], inputsKeys: Keys.Aux[InputsRec, InputsKeys], inputsKeysTraverse: ToTraversable.Aux[InputsKeys, Seq, Symbol], inputsValues: Values.Aux[InputsRec, InputsVals], inputsTypeCheck: TypedVectorAssemblerInputsValueChecker[InputsVals] ): TypedVectorAssemblerInputsChecker[Inputs] = new TypedVectorAssemblerInputsChecker[Inputs] { val inputCols: Seq[String] = inputsKeys.apply().to[Seq].map(_.name) } } private[ml] trait TypedVectorAssemblerInputsValueChecker[InputsVals] private[ml] object TypedVectorAssemblerInputsValueChecker { implicit def hnilCheckInputsValue: TypedVectorAssemblerInputsValueChecker[HNil] = new TypedVectorAssemblerInputsValueChecker[HNil] {} implicit def hlistCheckInputsValueNumeric[H, T <: HList]( implicit ch: CatalystNumeric[H], tt: TypedVectorAssemblerInputsValueChecker[T] ): TypedVectorAssemblerInputsValueChecker[H :: T] = new TypedVectorAssemblerInputsValueChecker[H :: T] {} implicit def hlistCheckInputsValueBoolean[T <: HList]( 
implicit tt: TypedVectorAssemblerInputsValueChecker[T] ): TypedVectorAssemblerInputsValueChecker[Boolean :: T] = new TypedVectorAssemblerInputsValueChecker[Boolean :: T] {} } ================================================ FILE: ml/src/main/scala/frameless/ml/internals/LinearInputsChecker.scala ================================================ package frameless package ml package internals import org.apache.spark.ml.linalg._ import shapeless.ops.hlist.Length import shapeless.{HList, LabelledGeneric, Nat, Witness} import scala.annotation.implicitNotFound /** * Can be used for linear reg algorithm */ @implicitNotFound( msg = "Cannot prove that ${Inputs} is a valid input type. " + "Input type must only contain a field of type Double (the label) and a field of type " + "org.apache.spark.ml.linalg.Vector (the features) and optional field of float type (weight)." ) trait LinearInputsChecker[Inputs] { val featuresCol: String val labelCol: String val weightCol: Option[String] } object LinearInputsChecker { implicit def checkLinearInputs[ Inputs, InputsRec <: HList, LabelK <: Symbol, FeaturesK <: Symbol]( implicit i0: LabelledGeneric.Aux[Inputs, InputsRec], i1: Length.Aux[InputsRec, Nat._2], i2: SelectorByValue.Aux[InputsRec, Double, LabelK], i3: Witness.Aux[LabelK], i4: SelectorByValue.Aux[InputsRec, Vector, FeaturesK], i5: Witness.Aux[FeaturesK] ): LinearInputsChecker[Inputs] = { new LinearInputsChecker[Inputs] { val labelCol: String = implicitly[Witness.Aux[LabelK]].value.name val featuresCol: String = implicitly[Witness.Aux[FeaturesK]].value.name val weightCol: Option[String] = None } } implicit def checkLinearInputs2[ Inputs, InputsRec <: HList, LabelK <: Symbol, FeaturesK <: Symbol, WeightK <: Symbol]( implicit i0: LabelledGeneric.Aux[Inputs, InputsRec], i1: Length.Aux[InputsRec, Nat._3], i2: SelectorByValue.Aux[InputsRec, Vector, FeaturesK], i3: Witness.Aux[FeaturesK], i4: SelectorByValue.Aux[InputsRec, Double, LabelK], i5: Witness.Aux[LabelK], i6: 
SelectorByValue.Aux[InputsRec, Float, WeightK], i7: Witness.Aux[WeightK] ): LinearInputsChecker[Inputs] = { new LinearInputsChecker[Inputs] { val labelCol: String = implicitly[Witness.Aux[LabelK]].value.name val featuresCol: String = implicitly[Witness.Aux[FeaturesK]].value.name val weightCol: Option[String] = Some(implicitly[Witness.Aux[WeightK]].value.name) } } } ================================================ FILE: ml/src/main/scala/frameless/ml/internals/SelectorByValue.scala ================================================ package frameless package ml package internals import shapeless.labelled.FieldType import shapeless.{::, DepFn1, HList, Witness} /** * Typeclass supporting record selection by value type (returning the first key whose value is of type `Value`) */ trait SelectorByValue[L <: HList, Value] extends DepFn1[L] with Serializable { type Out <: Symbol } object SelectorByValue { type Aux[L <: HList, Value, Out0 <: Symbol] = SelectorByValue[L, Value] { type Out = Out0 } implicit def select[K <: Symbol, T <: HList, Value](implicit wk: Witness.Aux[K]): Aux[FieldType[K, Value] :: T, Value, K] = { new SelectorByValue[FieldType[K, Value] :: T, Value] { type Out = K def apply(l: FieldType[K, Value] :: T): Out = wk.value } } implicit def recurse[H, T <: HList, Value](implicit st: SelectorByValue[T, Value]): Aux[H :: T, Value, st.Out] = { new SelectorByValue[H :: T, Value] { type Out = st.Out def apply(l: H :: T): Out = st(l.tail) } } } ================================================ FILE: ml/src/main/scala/frameless/ml/internals/TreesInputsChecker.scala ================================================ package frameless package ml package internals import shapeless.ops.hlist.Length import shapeless.{HList, LabelledGeneric, Nat, Witness} import org.apache.spark.ml.linalg._ import scala.annotation.implicitNotFound /** * Can be used for all tree-based ML algorithm (decision tree, random forest, gradient-boosted trees) */ @implicitNotFound( msg = "Cannot prove 
that ${Inputs} is a valid input type. " +
    "Input type must only contain a field of type Double (the label) and a field of type " +
    "org.apache.spark.ml.linalg.Vector (the features)."
)
trait TreesInputsChecker[Inputs] {
  val featuresCol: String
  val labelCol: String
}

object TreesInputsChecker {

  // Valid inputs have exactly two fields: a Double (label) and a Vector (features).
  // The column names are the field names carried by the two witnesses.
  implicit def checkTreesInputs[
    Inputs,
    InputsRec <: HList,
    LabelK <: Symbol,
    FeaturesK <: Symbol](
    implicit
    i0: LabelledGeneric.Aux[Inputs, InputsRec],
    i1: Length.Aux[InputsRec, Nat._2],
    i2: SelectorByValue.Aux[InputsRec, Double, LabelK],
    i3: Witness.Aux[LabelK],
    i4: SelectorByValue.Aux[InputsRec, Vector, FeaturesK],
    i5: Witness.Aux[FeaturesK]
  ): TreesInputsChecker[Inputs] =
    new TreesInputsChecker[Inputs] {
      val labelCol: String = i3.value.name
      val featuresCol: String = i5.value.name
    }
}

================================================
FILE: ml/src/main/scala/frameless/ml/internals/UnaryInputsChecker.scala
================================================
package frameless
package ml
package internals

import shapeless.ops.hlist.Length
import shapeless.{HList, LabelledGeneric, Nat, Witness}

import scala.annotation.implicitNotFound

/**
 * Can be used for all unary transformers (i.e almost all of them)
 */
@implicitNotFound(
  msg = "Cannot prove that ${Inputs} is a valid input type.
Input type must have only one field of type ${Expected}"
)
trait UnaryInputsChecker[Inputs, Expected] {
  val inputCol: String
}

object UnaryInputsChecker {

  // Valid inputs have exactly one field, of type `Expected`; its name is the input column.
  implicit def checkUnaryInputs[Inputs, Expected, InputsRec <: HList, InputK <: Symbol](
    implicit
    i0: LabelledGeneric.Aux[Inputs, InputsRec],
    i1: Length.Aux[InputsRec, Nat._1],
    i2: SelectorByValue.Aux[InputsRec, Expected, InputK],
    i3: Witness.Aux[InputK]
  ): UnaryInputsChecker[Inputs, Expected] =
    new UnaryInputsChecker[Inputs, Expected] {
      val inputCol: String = i3.value.name
    }
}

================================================
FILE: ml/src/main/scala/frameless/ml/internals/VectorInputsChecker.scala
================================================
package frameless
package ml
package internals

import shapeless.ops.hlist.Length
import shapeless.{HList, LabelledGeneric, Nat, Witness}

import scala.annotation.implicitNotFound
import org.apache.spark.ml.linalg.Vector

/** Can be used whenever algorithm requires only vector */
@implicitNotFound(
  msg = "Cannot prove that ${Inputs} is a valid input type. " +
    "Input type must only contain a field of type org.apache.spark.ml.linalg.Vector (the features)."
)
trait VectorInputsChecker[Inputs] {
  val featuresCol: String
}

object VectorInputsChecker {

  // Valid inputs have exactly one field, of type Vector; its name is the features column.
  implicit def checkVectorInput[Inputs, InputsRec <: HList, FeaturesK <: Symbol](
    implicit
    i0: LabelledGeneric.Aux[Inputs, InputsRec],
    i1: Length.Aux[InputsRec, Nat._1],
    i2: SelectorByValue.Aux[InputsRec, Vector, FeaturesK],
    i3: Witness.Aux[FeaturesK]
  ): VectorInputsChecker[Inputs] =
    new VectorInputsChecker[Inputs] {
      val featuresCol: String = i3.value.name
    }
}

================================================
FILE: ml/src/main/scala/frameless/ml/package.scala
================================================
package frameless

import org.apache.spark.sql.FramelessInternals.UserDefinedType
import org.apache.spark.ml.FramelessInternals
import org.apache.spark.ml.linalg.{Matrix, Vector}

package object ml {

  // UDT instances for Spark ML's Vector and Matrix, exposed implicitly to the whole package.
  implicit val mlVectorUdt: UserDefinedType[Vector] = FramelessInternals.vectorUdt

  implicit val mlMatrixUdt: UserDefinedType[Matrix] = FramelessInternals.matrixUdt

}

================================================
FILE: ml/src/main/scala/frameless/ml/params/kmeans/KMeansInitMode.scala
================================================
package frameless
package ml
package params
package kmeans

/**
 * Param for the initialization algorithm.
 * This can be either "random" to choose random points as
 * initial cluster centers, or "k-means||" to use a parallel variant of k-means++
 * (Bahmani et al., Scalable K-Means++, VLDB 2012).
 * Default: k-means||.
 */
sealed abstract class KMeansInitMode private[ml](val sparkValue: String)

object KMeansInitMode {
  // `sparkValue` is the string handed to Spark's `setInitMode`.
  case object Random extends KMeansInitMode("random")
  case object KMeansPlusPlus extends KMeansInitMode("k-means||")
}

================================================
FILE: ml/src/main/scala/frameless/ml/params/linears/LossStrategy.scala
================================================
package frameless
package ml
package params
package linears

/**
 * Loss function to be optimized.
 *
 * SquaredError measures the average of the squares of the errors, that is,
 * the average squared difference between the estimated values and what is estimated.
 *
 * Huber is a loss function that is less sensitive to outliers in the data than the
 * squared error loss.
 */
sealed abstract class LossStrategy private[ml](val sparkValue: String)

object LossStrategy {
  // `sparkValue` is the string handed to Spark's `setLoss`.
  case object SquaredError extends LossStrategy("squaredError")
  case object Huber extends LossStrategy("huber")
}

================================================
FILE: ml/src/main/scala/frameless/ml/params/linears/Solver.scala
================================================
package frameless
package ml
package params
package linears

/**
 * Solver algorithm used for optimization.
 *  - "l-bfgs" denotes Limited-memory BFGS which is a limited-memory quasi-Newton
 *    optimization method.
 *  - "normal" denotes using Normal Equation as an analytical solution to the linear regression
 *    problem. This solver is limited to `LinearRegression.MAX_FEATURES_FOR_NORMAL_SOLVER`.
 *  - "auto" (default) means that the solver algorithm is selected automatically.
 *    The Normal Equations solver will be used when possible, but this will automatically fall
 *    back to iterative optimization methods when needed.
 *
 * NOTE(review): the original comment ended with a stray "spark"; presumably the description
 * above is taken from Spark's documentation. Confirm.
 */
sealed abstract class Solver private[ml](val sparkValue: String)

object Solver {
  // `sparkValue` is the string handed to Spark's `setSolver`.
  case object LBFGS extends Solver("l-bfgs")
  case object Auto extends Solver("auto")
  case object Normal extends Solver("normal")
}

================================================
FILE: ml/src/main/scala/frameless/ml/params/trees/FeatureSubsetStrategy.scala
================================================
package frameless
package ml
package params
package trees

/**
 * The number of features to consider for splits at each tree node.
 * Supported options:
 *  - Auto: Choose automatically for task:
 *          If numTrees == 1, set to All
 *          If numTrees > 1 (forest), set to Sqrt for classification and
 *          to OneThird for regression.
 *  - All: use all features
 *  - OneThird: use 1/3 of the features
 *  - Sqrt: use sqrt(number of features)
 *  - Log2: use log2(number of features)
 *  - Ratio: use (ratio * number of features) features
 *  - NumberOfFeatures: use numberOfFeatures features.
 * (default = Auto)
 *
 * These various settings are based on the following references:
 *  - log2: tested in Breiman (2001)
 *  - sqrt: recommended by Breiman manual for random forests
 *  - The defaults of sqrt (classification) and onethird (regression) match the R randomForest
 *    package.
* * @see Breiman (2001) * @see * Breiman manual for random forests */ sealed abstract class FeatureSubsetStrategy private[ml](val sparkValue: String) object FeatureSubsetStrategy { case object Auto extends FeatureSubsetStrategy("auto") case object All extends FeatureSubsetStrategy("all") case object OneThird extends FeatureSubsetStrategy("onethird") case object Sqrt extends FeatureSubsetStrategy("sqrt") case object Log2 extends FeatureSubsetStrategy("log2") case class Ratio(value: Double) extends FeatureSubsetStrategy(value.toString) case class NumberOfFeatures(value: Int) extends FeatureSubsetStrategy(value.toString) } ================================================ FILE: ml/src/main/scala/frameless/ml/regression/TypedLinearRegression.scala ================================================ package frameless package ml package regression import frameless.ml.internals.LinearInputsChecker import frameless.ml.params.linears.{LossStrategy, Solver} import frameless.ml.{AppendTransformer, TypedEstimator} import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel} /** * Linear Regression linear approach to modelling the relationship * between a scalar response (or dependent variable) and one or more explanatory variables */ final class TypedLinearRegression [Inputs] private[ml]( lr: LinearRegression, labelCol: String, featuresCol: String, weightCol: Option[String] ) extends TypedEstimator[Inputs, TypedLinearRegression.Outputs, LinearRegressionModel] { val estimatorWithoutWeight : LinearRegression = lr .setLabelCol(labelCol) .setFeaturesCol(featuresCol) .setPredictionCol(AppendTransformer.tempColumnName) val estimator = if (weightCol.isDefined) estimatorWithoutWeight.setWeightCol(weightCol.get) else estimatorWithoutWeight def setRegParam(value: Double): TypedLinearRegression[Inputs] = copy(lr.setRegParam(value)) def setFitIntercept(value: Boolean): TypedLinearRegression[Inputs] = copy(lr.setFitIntercept(value)) def setStandardization(value: Boolean): 
TypedLinearRegression[Inputs] = copy(lr.setStandardization(value)) def setElasticNetParam(value: Double): TypedLinearRegression[Inputs] = copy(lr.setElasticNetParam(value)) def setMaxIter(value: Int): TypedLinearRegression[Inputs] = copy(lr.setMaxIter(value)) def setTol(value: Double): TypedLinearRegression[Inputs] = copy(lr.setTol(value)) def setSolver(value: Solver): TypedLinearRegression[Inputs] = copy(lr.setSolver(value.sparkValue)) def setAggregationDepth(value: Int): TypedLinearRegression[Inputs] = copy(lr.setAggregationDepth(value)) def setLoss(value: LossStrategy): TypedLinearRegression[Inputs] = copy(lr.setLoss(value.sparkValue)) def setEpsilon(value: Double): TypedLinearRegression[Inputs] = copy(lr.setEpsilon(value)) private def copy(newLr: LinearRegression): TypedLinearRegression[Inputs] = new TypedLinearRegression[Inputs](newLr, labelCol, featuresCol, weightCol) } object TypedLinearRegression { case class Outputs(prediction: Double) case class Weight(weight: Double) def apply[Inputs](implicit inputsChecker: LinearInputsChecker[Inputs]): TypedLinearRegression[Inputs] = { new TypedLinearRegression(new LinearRegression(), inputsChecker.labelCol, inputsChecker.featuresCol, inputsChecker.weightCol) } } ================================================ FILE: ml/src/main/scala/frameless/ml/regression/TypedRandomForestRegressor.scala ================================================ package frameless package ml package regression import frameless.ml.internals.TreesInputsChecker import frameless.ml.params.trees.FeatureSubsetStrategy import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor} /** * Random Forest * learning algorithm for regression. * It supports both continuous and categorical features. 
*/ final class TypedRandomForestRegressor[Inputs] private[ml]( rf: RandomForestRegressor, labelCol: String, featuresCol: String ) extends TypedEstimator[Inputs, TypedRandomForestRegressor.Outputs, RandomForestRegressionModel] { val estimator: RandomForestRegressor = rf .setLabelCol(labelCol) .setFeaturesCol(featuresCol) .setPredictionCol(AppendTransformer.tempColumnName) def setNumTrees(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setNumTrees(value)) def setMaxDepth(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setMaxDepth(value)) def setMinInfoGain(value: Double): TypedRandomForestRegressor[Inputs] = copy(rf.setMinInfoGain(value)) def setMinInstancesPerNode(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setMinInstancesPerNode(value)) def setMaxMemoryInMB(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setMaxMemoryInMB(value)) def setSubsamplingRate(value: Double): TypedRandomForestRegressor[Inputs] = copy(rf.setSubsamplingRate(value)) def setFeatureSubsetStrategy(value: FeatureSubsetStrategy): TypedRandomForestRegressor[Inputs] = copy(rf.setFeatureSubsetStrategy(value.sparkValue)) def setMaxBins(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setMaxBins(value)) private def copy(newRf: RandomForestRegressor): TypedRandomForestRegressor[Inputs] = new TypedRandomForestRegressor[Inputs](newRf, labelCol, featuresCol) } object TypedRandomForestRegressor { case class Outputs(prediction: Double) def apply[Inputs](implicit inputsChecker: TreesInputsChecker[Inputs]) : TypedRandomForestRegressor[Inputs] = { new TypedRandomForestRegressor(new RandomForestRegressor(), inputsChecker.labelCol, inputsChecker.featuresCol) } } ================================================ FILE: ml/src/main/scala/org/apache/spark/ml/FramelessInternals.scala ================================================ package org.apache.spark.ml import org.apache.spark.ml.linalg.{MatrixUDT, VectorUDT} object FramelessInternals { // because 
// org.apache.spark.ml.linalg.VectorUDT is private[spark]
  val vectorUdt = new VectorUDT

  // because org.apache.spark.ml.linalg.MatrixUDT is private[spark]
  val matrixUdt = new MatrixUDT
}

================================================
FILE: ml/src/test/scala/frameless/ml/FramelessMlSuite.scala
================================================
package frameless
package ml

import org.scalactic.anyvals.PosZInt
import org.scalatest.BeforeAndAfterAll
import org.scalatestplus.scalacheck.Checkers
import org.scalatest.funsuite.AnyFunSuite

/** Base suite for the ml tests: ScalaCheck support plus the implicit `SparkDelay[Job]`. */
class FramelessMlSuite extends AnyFunSuite with Checkers with BeforeAndAfterAll with SparkTesting {
  // Limit size of generated collections and number of checks because Travis
  implicit override val generatorDrivenConfig: PropertyCheckConfiguration =
    PropertyCheckConfiguration(sizeRange = PosZInt(10), minSize = PosZInt(10))

  implicit val sparkDelay: SparkDelay[Job] = Job.framelessSparkDelayForJob
}

================================================
FILE: ml/src/test/scala/frameless/ml/Generators.scala
================================================
package frameless
package ml

import frameless.ml.params.linears.{LossStrategy, Solver}
import frameless.ml.params.trees.FeatureSubsetStrategy
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}
import org.scalacheck.{Arbitrary, Gen}

/** ScalaCheck generators for the Spark ML values and typed params used by the ml tests. */
object Generators {

  /** Non-empty vectors, either dense or the sparse form of a dense one. */
  implicit val arbVector: Arbitrary[Vector] = Arbitrary {
    val genDenseVector = Gen.listOf(arbDouble.arbitrary).suchThat(_.nonEmpty).map(doubles => Vectors.dense(doubles.toArray))
    val genSparseVector = genDenseVector.map(_.toSparse)

    Gen.oneOf(genDenseVector, genSparseVector)
  }

  /** Dense matrices with 0..size rows and 1..size columns. */
  implicit val arbMatrix: Arbitrary[Matrix] = Arbitrary {
    Gen.sized { size =>
      for {
        nbRows <- Gen.choose(0, size)
        nbCols <- Gen.choose(1, size)
        matrix <- {
          Gen.listOfN(nbRows * nbCols, arbDouble.arbitrary)
            .map(values => Matrices.dense(nbRows, nbCols, values.toArray))
        }
      } yield matrix
    }
  }

  /** Every `FeatureSubsetStrategy` variant, including `Auto`, ratios in (0, 1] and positive counts. */
  implicit val arbTreesFeaturesSubsetStrategy: Arbitrary[FeatureSubsetStrategy] = Arbitrary {
    val genRatio = Gen.choose(0D, 1D).suchThat(_ > 0D).map(FeatureSubsetStrategy.Ratio)
    val genNumberOfFeatures = Gen.choose(1, Int.MaxValue).map(FeatureSubsetStrategy.NumberOfFeatures)

    // `Auto` used to be missing here (with `All` listed twice), so it was never exercised.
    Gen.oneOf(Gen.const(FeatureSubsetStrategy.Auto),
      Gen.const(FeatureSubsetStrategy.All),
      Gen.const(FeatureSubsetStrategy.Log2),
      Gen.const(FeatureSubsetStrategy.OneThird),
      Gen.const(FeatureSubsetStrategy.Sqrt),
      genRatio,
      genNumberOfFeatures
    )
  }

  /** Only `SquaredError` is generated. */
  implicit val arbLossStrategy: Arbitrary[LossStrategy] = Arbitrary {
    Gen.const(LossStrategy.SquaredError)
  }

  /** Any of the three solvers. */
  implicit val arbSolver: Arbitrary[Solver] = Arbitrary {
    Gen.oneOf(
      Gen.const(Solver.LBFGS),
      Gen.const(Solver.Auto),
      Gen.const(Solver.Normal)
    )
  }

}

================================================
FILE: ml/src/test/scala/frameless/ml/TypedEncoderInstancesTests.scala
================================================
package frameless
package ml

import org.scalacheck.Prop._
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.regression.DecisionTreeRegressor
import Generators._
import scala.util.Random

class TypedEncoderInstancesTests extends FramelessMlSuite {

  test("Vector encoding is injective using collect()") {
    val prop = forAll { vector: Vector =>
      TypedDataset.create(Seq(vector)).collect().run() == Seq(vector)
    }
    check(prop)
  }

  test("Matrix encoding is injective using collect()") {
    val prop = forAll { matrix: Matrix =>
      TypedDataset.create(Seq(matrix)).collect().run() == Seq(matrix)
    }
    check(prop)
  }

  test("Vector is encoded as VectorUDT and thus can be run in a Spark ML model") {
    case class Input(features: Vector, label: Double)

    val prop = forAll { trainingData: Matrix =>
      (trainingData.numRows >= 1) ==> {
        val inputs = trainingData.rowIter.toVector.map(vector => Input(vector, 0D))
        val inputsDS = TypedDataset.create(inputs)

        val model = new DecisionTreeRegressor()

        // this line would throw a runtime exception if Vector was not encoded as VectorUDT
        val trainedModel = model.fit(inputsDS.dataset)

        val randomInput = inputs(Random.nextInt(inputs.length))
        val
/**
  * End-to-end test chaining the typed vector assembler, string indexer,
  * random forest classifier and index-to-string transformer.
  */
class ClassificationIntegrationTests extends FramelessMlSuite with Matchers {

  test("predict field3 from field1 and field2 using a RandomForestClassifier") {
    case class Data(field1: Double, field2: Int, field3: String)

    // Training

    // Ten identical rows, all labelled "foo".
    val trainingDataDs = TypedDataset.create(Seq.fill(10)(Data(0D, 10, "foo")))

    // Assemble field1 and field2 into a `features` vector column.
    case class Features(field1: Double, field2: Int)
    val vectorAssembler = TypedVectorAssembler[Features]

    case class DataWithFeatures(field1: Double, field2: Int, field3: String, features: Vector)
    val dataWithFeatures = vectorAssembler.transform(trainingDataDs).as[DataWithFeatures]()

    // Index the string label field3 into a Double column.
    case class StringIndexerInput(field3: String)
    val indexer = TypedStringIndexer[StringIndexerInput]
    val indexerModel = indexer.fit(dataWithFeatures).run()

    case class IndexedDataWithFeatures(field1: Double, field2: Int, field3: String, features: Vector, indexedField3: Double)
    val indexedData = indexerModel.transform(dataWithFeatures).as[IndexedDataWithFeatures]()

    // Train the classifier on the indexed label and the assembled features.
    case class RFInputs(indexedField3: Double, features: Vector)
    val rf = TypedRandomForestClassifier[RFInputs]

    val model = rf.fit(indexedData).run()

    // Prediction

    // A single row identical to the training rows.
    val testData = TypedDataset.create(Seq(
      Data(0D, 10, "foo")
    ))
    val testDataWithFeatures = vectorAssembler.transform(testData).as[DataWithFeatures]()
    val indexedTestData = indexerModel.transform(testDataWithFeatures).as[IndexedDataWithFeatures]()

    // Keep only the columns handed to the model.
    case class PredictionInputs(features: Vector, indexedField3: Double)
    val testInput = indexedTestData.project[PredictionInputs]

    case class PredictionResultIndexed(
      features: Vector,
      indexedField3: Double,
      rawPrediction: Vector,
      probability: Vector,
      predictedField3Indexed: Double
    )
    val predictionDs = model.transform(testInput).as[PredictionResultIndexed]()

    // Map the predicted index back to a string using the labels of the fitted indexer.
    case class IndexToStringInput(predictedField3Indexed: Double)
    val indexToString = TypedIndexToString[IndexToStringInput](indexerModel.transformer.labelsArray.flatten)

    case class PredictionResult(
      features: Vector,
      indexedField3: Double,
      rawPrediction: Vector,
      probability: Vector,
      predictedField3Indexed: Double,
      predictedField3: String
    )
    val stringPredictionDs = indexToString.transform(predictionDs).as[PredictionResult]()

    val prediction = stringPredictionDs.select(stringPredictionDs.col('predictedField3)).collect().run().toList

    prediction mustEqual List("foo")
  }
}
TypedDataset.create(Seq(x2)) val model = rf.fit(ds).run() val pDs = model.transform(ds).as[X5[Double, Vector, Vector, Vector, Double]]() pDs.select(pDs.col('a), pDs.col('b)).collect().run() == Seq(x2.a -> x2.b) } val prop2 = forAll { x2: X2[Vector, Double] => val rf = TypedRandomForestClassifier[X2[Vector, Double]] val ds = TypedDataset.create(Seq(x2)) val model = rf.fit(ds).run() val pDs = model.transform(ds).as[X5[Vector, Double, Vector, Vector, Double]]() pDs.select(pDs.col('a), pDs.col('b)).collect().run() == Seq(x2.a -> x2.b) } def prop3[A: TypedEncoder: Arbitrary] = forAll { x3: X3[Vector, Double, A] => val rf = TypedRandomForestClassifier[X2[Vector, Double]] val ds = TypedDataset.create(Seq(x3)) val model = rf.fit(ds).run() val pDs = model.transform(ds).as[X6[Vector, Double, A, Vector, Vector, Double]]() pDs.select(pDs.col('a), pDs.col('b), pDs.col('c)).collect().run() == Seq((x3.a, x3.b, x3.c)) } check(prop) check(prop2) check(prop3[String]) check(prop3[Double]) } test("param setting is retained") { val prop = forAll { featureSubsetStrategy: FeatureSubsetStrategy => val rf = TypedRandomForestClassifier[X2[Double, Vector]] .setNumTrees(10) .setMaxBins(100) .setFeatureSubsetStrategy(featureSubsetStrategy) .setMaxDepth(10) .setMaxMemoryInMB(100) .setMinInfoGain(0.1D) .setMinInstancesPerNode(2) .setSubsamplingRate(0.9D) val ds = TypedDataset.create(Seq(X2(0D, Vectors.dense(0D)))) val model = rf.fit(ds).run() model.transformer.getNumTrees == 10 && model.transformer.getMaxBins == 100 && model.transformer.getFeatureSubsetStrategy == featureSubsetStrategy.sparkValue && model.transformer.getMaxDepth == 10 && model.transformer.getMaxMemoryInMB == 100 && model.transformer.getMinInfoGain == 0.1D && model.transformer.getMinInstancesPerNode == 2 && model.transformer.getSubsamplingRate == 0.9D } check(prop) } test("create() compiles only with correct inputs") { illTyped("TypedRandomForestClassifier.create[Double]()") 
/** Tests for the typed wrapper around the bisecting k-means estimator. */
class BisectingKMeansTests extends FramelessMlSuite with Matchers {
  implicit val arbVector: Arbitrary[Vector] =
    Arbitrary(Generators.arbVector.arbitrary)

  test("fit() returns a correct TypedTransformer") {
    // The input column must come back unchanged after fit + transform.
    val prop = forAll { x1: X1[Vector] =>
      val km = TypedBisectingKMeans[X1[Vector]]()
      val ds = TypedDataset.create(Seq(x1))
      val model = km.fit(ds).run()
      val pDs = model.transform(ds).as[X2[Vector, Int]]()

      pDs.select(pDs.col('a)).collect().run().toList == Seq(x1.a)
    }

    // Same check with an extra column that the estimator does not consume.
    def prop3[A: TypedEncoder : Arbitrary] = forAll { x2: X2[Vector, A] =>
      val km = TypedBisectingKMeans[X1[Vector]]()
      val ds = TypedDataset.create(Seq(x2))
      val model = km.fit(ds).run()
      val pDs = model.transform(ds).as[X3[Vector, A, Int]]()

      pDs.select(pDs.col('a), pDs.col('b)).collect().run() == Seq((x2.a, x2.b))
    }

    check(prop)
    check(prop3[Double])
  }

  test("param setting is retained") {
    val rf = TypedBisectingKMeans[X1[Vector]]()
      .setK(10)
      .setMaxIter(10)
      .setMinDivisibleClusterSize(1)
      .setSeed(123332)

    val ds = TypedDataset.create(Seq(X2(Vectors.dense(Array(0D)),0)))
    val model = rf.fit(ds).run()

    // These comparisons used to form a bare Boolean expression whose value was
    // discarded, so the test could never fail. Assert each one explicitly.
    assert(model.transformer.getK == 10)
    assert(model.transformer.getMaxIter == 10)
    assert(model.transformer.getMinDivisibleClusterSize == 1)
    assert(model.transformer.getSeed == 123332)
  }
}
/** Tests for the typed wrapper around the k-means estimator. */
class KMeansTests extends FramelessMlSuite with Matchers {
  implicit val arbVector: Arbitrary[Vector] =
    Arbitrary(Generators.arbVector.arbitrary)

  implicit val arbKMeansInitMode: Arbitrary[KMeansInitMode] =
    Arbitrary {
      Gen.oneOf(
        Gen.const(KMeansInitMode.KMeansPlusPlus),
        Gen.const(KMeansInitMode.Random)
      )
    }

  /**
    * copies a vector as we need two rows of the right dimension for 3.4's alg
    */
  def newRowWithSameDimension(vect: Vector): Vector = {
    val dubs = vect.toArray.map(_ % 2) // k is two
    val dense = Vectors.dense(dubs)
    // Keep the representation (sparse or dense) of the source vector.
    vect match {
      case _: SparseVector => dense.toSparse
      case _ => dense
    }
  }

  test("fit() returns a correct TypedTransformer") {
    // The input column must come back unchanged after fit + transform.
    val prop = forAll { x1: X1[Vector] =>
      val x1a = X1(newRowWithSameDimension(x1.a))
      val km = TypedKMeans[X1[Vector]]
      val ds = TypedDataset.create(Seq(x1, x1a))

      val model = km.fit(ds).run()
      val pDs = model.transform(ds).as[X2[Vector, Int]]()

      pDs.select(pDs.col('a)).collect().run().toList == Seq(x1.a, x1a.a)
    }

    // Same check with an extra column that the estimator does not consume.
    def prop3[A: TypedEncoder : Arbitrary] = forAll { x2: X2[Vector, A] =>
      val x2a = x2.copy(a = newRowWithSameDimension(x2.a))
      val km = TypedKMeans[X1[Vector]]
      val ds = TypedDataset.create(Seq(x2, x2a))
      val model = km.fit(ds).run()
      val pDs = model.transform(ds).as[X3[Vector, A, Int]]()

      pDs.select(pDs.col('a), pDs.col('b)).collect().run().toList == Seq((x2.a, x2.b), (x2a.a, x2a.b))
    }

    tolerantRun( _.isInstanceOf[ArrayIndexOutOfBoundsException] ) {
      check(prop)
      check(prop3[Double])
    }
  }

  test("param setting is retained") {
    // The generated init mode is now the one that is set and checked. The test
    // previously hard-coded KMeansInitMode.Random and ignored `initMode`, so every
    // iteration exercised the same configuration.
    // NOTE(review): this now also fits with KMeansPlusPlus on a single row; confirm
    // it passes on every Spark version the build targets.
    val prop = forAll { initMode: KMeansInitMode =>
      val rf = TypedKMeans[X1[Vector]]
        .setInitMode(initMode)
        .setInitSteps(2)
        .setK(10)
        .setMaxIter(15)
        .setSeed(123223L)
        .setTol(12D)

      val ds = TypedDataset.create(Seq(X2(Vectors.dense(Array(0D)), 0)))
      val model = rf.fit(ds).run()

      model.transformer.getInitMode == initMode.sparkValue &&
        model.transformer.getInitSteps == 2 &&
        model.transformer.getK == 10 &&
        model.transformer.getMaxIter == 15 &&
        model.transformer.getSeed == 123223L &&
        model.transformer.getTol == 12D
    }

    check(prop)
  }
}
/** Tests for the typed wrapper around the string indexer. */
class TypedStringIndexerTests extends FramelessMlSuite with Matchers {

  test(".fit() returns a correct TypedTransformer") {
    // With a single row, the only label must be indexed as 0.
    def prop[A: TypedEncoder : Arbitrary] = forAll { x2: X2[String, A] =>
      val indexer = TypedStringIndexer[X1[String]]
      val ds = TypedDataset.create(Seq(x2))
      val model = indexer.fit(ds).run()
      val resultDs = model.transform(ds).as[X3[String, A, Double]]()

      resultDs.collect().run() == Seq(X3(x2.a, x2.b, 0D))
    }

    check(prop[Double])
    check(prop[String])
  }

  test("param setting is retained") {
    implicit val arbHandleInvalid: Arbitrary[HandleInvalid] = Arbitrary {
      Gen.oneOf(HandleInvalid.Keep, HandleInvalid.Error, HandleInvalid.Skip)
    }

    val prop = forAll { handleInvalid: HandleInvalid =>
      val indexer = TypedStringIndexer[X1[String]]
        .setHandleInvalid(handleInvalid)
      val ds = TypedDataset.create(Seq(X1("foo")))
      val model = indexer.fit(ds).run()

      model.transformer.getHandleInvalid == handleInvalid.sparkValue
    }

    check(prop)
  }

  test("create() compiles only with correct inputs") {
    // The snippets use the same `TypedStringIndexer[Inputs]` form as the tests above.
    // They previously called `.create[...]()`, a form used nowhere else in these tests,
    // so they were rejected whatever the input type was.
    // NOTE(review): assumes the inputs checker rejects each type below - confirm.
    illTyped("TypedStringIndexer[Double]")
    illTyped("TypedStringIndexer[X1[Double]]")
    illTyped("TypedStringIndexer[X2[String, Long]]")
  }
}
/** Tests for the typed wrapper around the vector assembler. */
class TypedVectorAssemblerTests extends FramelessMlSuite {

  test(".transform() returns a correct TypedTransformer") {
    // Int, Long, Double and Boolean inputs are assembled as doubles, in field order.
    def prop[A: TypedEncoder: Arbitrary] = forAll { x5: X5[Int, Long, Double, Boolean, A] =>
      val assembler = TypedVectorAssembler[X4[Int, Long, Double, Boolean]]
      val ds = TypedDataset.create(Seq(x5))
      val ds2 = assembler.transform(ds).as[X6[Int, Long, Double, Boolean, A, Vector]]()

      ds2.collect().run() ==
        Seq(X6(x5.a, x5.b, x5.c, x5.d, x5.e, Vectors.dense(x5.a.toDouble, x5.b.toDouble, x5.c, if (x5.d) 1D else 0D)))
    }

    // Same check for Boolean, BigDecimal, Byte and Short inputs.
    def prop2[A: TypedEncoder: Arbitrary] = forAll { x5: X5[Boolean, BigDecimal, Byte, Short, A] =>
      val assembler = TypedVectorAssembler[X4[Boolean, BigDecimal, Byte, Short]]
      val ds = TypedDataset.create(Seq(x5))
      val ds2 = assembler.transform(ds).as[X6[Boolean, BigDecimal, Byte, Short, A, Vector]]()

      ds2.collect().run() ==
        Seq(X6(x5.a, x5.b, x5.c, x5.d, x5.e, Vectors.dense(if (x5.a) 1D else 0D, x5.b.toDouble, x5.c.toDouble, x5.d.toDouble)))
    }

    check(prop[String])
    check(prop[Double])
    check(prop2[Long])
    check(prop2[Boolean])
  }

  test("create() compiles only with correct inputs") {
    // The snippets use the same `TypedVectorAssembler[Inputs]` form as the tests above.
    // They previously called `.create[...]()`, a form used nowhere else in these tests,
    // so they were rejected whatever the input type was.
    // NOTE(review): assumes the inputs checker rejects each type below - confirm.
    illTyped("TypedVectorAssembler[Double]")
    illTyped("TypedVectorAssembler[X1[String]]")
    illTyped("TypedVectorAssembler[X2[String, Double]]")
    illTyped("TypedVectorAssembler[X3[Int, String, Double]]")
  }
}
/** Tests for the typed wrapper around the linear regression estimator. */
class TypedLinearRegressionTests extends FramelessMlSuite with Matchers {

  implicit val arbVectorNonEmpty: Arbitrary[Vector] =
    Arbitrary(Generators.arbVector.arbitrary)

  test("fit() returns a correct TypedTransformer") {
    // Label first, features second.
    val prop = forAll { x2: X2[Double, Vector] =>
      val lr = TypedLinearRegression[X2[Double, Vector]]
      val ds = TypedDataset.create(Seq(x2))
      val model = lr.fit(ds).run()
      val pDs = model.transform(ds).as[X3[Double, Vector, Double]]()

      pDs.select(pDs.col('a), pDs.col('b)).collect().run() == Seq(x2.a -> x2.b)
    }

    // Features first, label second.
    val prop2 = forAll { x2: X2[Vector, Double] =>
      val lr = TypedLinearRegression[X2[Vector, Double]]
      val ds = TypedDataset.create(Seq(x2))
      val model = lr.fit(ds).run()
      val pDs = model.transform(ds).as[X3[Vector, Double, Double]]()

      pDs.select(pDs.col('a), pDs.col('b)).collect().run() == Seq(x2.a -> x2.b)
    }

    // An extra column that the estimator does not consume must be preserved.
    def prop3[A: TypedEncoder: Arbitrary] = forAll { x3: X3[Vector, Double, A] =>
      val lr = TypedLinearRegression[X2[Vector, Double]]
      val ds = TypedDataset.create(Seq(x3))
      val model = lr.fit(ds).run()
      val pDs = model.transform(ds).as[X4[Vector, Double, A, Double]]()

      pDs.select(pDs.col('a), pDs.col('b), pDs.col('c)).collect().run() == Seq((x3.a, x3.b, x3.c))
    }

    check(prop)
    check(prop2)
    check(prop3[String])
    check(prop3[Double])
  }

  test("param setting is retained") {
    import Generators.{arbLossStrategy, arbSolver}

    val prop = forAll { (lossStrategy: LossStrategy, solver: Solver) =>
      val lr = TypedLinearRegression[X2[Double, Vector]]
        .setAggregationDepth(10)
        .setEpsilon(4)
        .setFitIntercept(true)
        .setLoss(lossStrategy)
        .setMaxIter(23)
        .setRegParam(1.2)
        .setStandardization(true)
        .setTol(2.3)
        .setSolver(solver)

      val ds = TypedDataset.create(Seq(X2(0D, Vectors.dense(0D))))
      val model = lr.fit(ds).run()

      model.transformer.getAggregationDepth == 10 &&
        model.transformer.getEpsilon == 4.0 &&
        model.transformer.getLoss == lossStrategy.sparkValue &&
        model.transformer.getMaxIter == 23 &&
        model.transformer.getRegParam == 1.2 &&
        model.transformer.getTol == 2.3 &&
        model.transformer.getSolver == solver.sparkValue
    }

    check(prop)
  }

  test("create() compiles only with correct inputs") {
    // The snippets now target `TypedLinearRegression[Inputs]`, the form used by the
    // tests above. They previously referenced `TypedLinearRegressor.create`, a name
    // used nowhere else in this suite, so they were rejected whatever the input type was.
    // NOTE(review): assumes the inputs checker rejects each type below - confirm.
    illTyped("TypedLinearRegression[Double]")
    illTyped("TypedLinearRegression[X1[Double]]")
    illTyped("TypedLinearRegression[X2[Double, Double]]")
    illTyped("TypedLinearRegression[X3[Vector, Double, Int]]")
    illTyped("TypedLinearRegression[X2[Vector, String]]")
  }

  test("TypedLinearRegressor should fit straight line ") {
    // Points (1,1) .. (6,6) lie on y = x, so the fitted coefficient must be 1.
    val ds: Seq[X2[Vector, Double]] = (1 to 6).map { i =>
      X2(new DenseVector(Array(i.toDouble)): Vector, i.toDouble)
    }

    // Same points with a constant Float column between features and label.
    val ds2: Seq[X3[Vector, Float, Double]] = (1 to 6).map { i =>
      X3(new DenseVector(Array(i.toDouble)): Vector, 2F, i.toDouble)
    }

    val tds = TypedDataset.create(ds)

    val lr = TypedLinearRegression[X2[Vector, Double]]
      .setMaxIter(10)

    val model = lr.fit(tds).run()

    val tds2 = TypedDataset.create(ds2)

    val lr2 = TypedLinearRegression[X3[Vector, Float, Double]]
      .setMaxIter(10)

    val model2 = lr2.fit(tds2).run()

    model.transformer.coefficients shouldEqual new DenseVector(Array(1.0))
    model2.transformer.coefficients shouldEqual new DenseVector(Array(1.0))
  }
}
pDs.col('b)).collect().run() == Seq(x2.a -> x2.b) } val prop2 = forAll { x2: X2[Vector, Double] => val rf = TypedRandomForestRegressor[X2[Vector, Double]] val ds = TypedDataset.create(Seq(x2)) val model = rf.fit(ds).run() val pDs = model.transform(ds).as[X3[Vector, Double, Double]]() pDs.select(pDs.col('a), pDs.col('b)).collect().run() == Seq(x2.a -> x2.b) } def prop3[A: TypedEncoder: Arbitrary] = forAll { x3: X3[Vector, Double, A] => val rf = TypedRandomForestRegressor[X2[Vector, Double]] val ds = TypedDataset.create(Seq(x3)) val model = rf.fit(ds).run() val pDs = model.transform(ds).as[X4[Vector, Double, A, Double]]() pDs.select(pDs.col('a), pDs.col('b), pDs.col('c)).collect().run() == Seq((x3.a, x3.b, x3.c)) } check(prop) check(prop2) check(prop3[String]) check(prop3[Double]) } test("param setting is retained") { val prop = forAll { featureSubsetStrategy: FeatureSubsetStrategy => val rf = TypedRandomForestRegressor[X2[Double, Vector]] .setNumTrees(10) .setMaxBins(100) .setFeatureSubsetStrategy(featureSubsetStrategy) .setMaxDepth(10) .setMaxMemoryInMB(100) .setMinInfoGain(0.1D) .setMinInstancesPerNode(2) .setSubsamplingRate(0.9D) val ds = TypedDataset.create(Seq(X2(0D, Vectors.dense(0D)))) val model = rf.fit(ds).run() model.transformer.getNumTrees == 10 && model.transformer.getMaxBins == 100 && model.transformer.getFeatureSubsetStrategy == featureSubsetStrategy.sparkValue && model.transformer.getMaxDepth == 10 && model.transformer.getMaxMemoryInMB == 100 && model.transformer.getMinInfoGain == 0.1D && model.transformer.getMinInstancesPerNode == 2 && model.transformer.getSubsamplingRate == 0.9D } check(prop) } test("create() compiles only with correct inputs") { illTyped("TypedRandomForestRegressor.create[Double]()") illTyped("TypedRandomForestRegressor.create[X1[Double]]()") illTyped("TypedRandomForestRegressor.create[X2[Double, Double]]()") illTyped("TypedRandomForestRegressor.create[X3[Vector, Double, Int]]()") 
/** Build-wide sbt plugin that applies shared settings to every JVM project. */
object Common extends AutoPlugin {

  // Enabled automatically wherever the required plugin is present.
  override def trigger: PluginTrigger = allRequirements

  override def requires: Plugins = JvmPlugin

  // NOTE(review): "diff-ref=<sha>" presumably restricts scalafmt to files changed
  // since that git ref - confirm against the sbt-scalafmt documentation.
  override def projectSettings: Seq[Def.Setting[_]] = {
    val formatOnlyChangedFiles = scalafmtFilter := "diff-ref=78f708d"
    Seq(formatOnlyChangedFiles)
  }
}
`String`) */ implicit def optionRefined[F[_, _], T, R]( implicit i0: RefType[F], i1: TypedEncoder[T], i2: ClassTag[F[T, R]], ): RecordFieldEncoder[Option[F[T, R]]] = RecordFieldEncoder[Option[F[T, R]]](new TypedEncoder[Option[F[T, R]]] { def nullable = true // `Refined` is a Value class: https://github.com/fthomas/refined/blob/master/modules/core/shared/src/main/scala-3.0-/eu/timepit/refined/api/Refined.scala#L8 def jvmRepr = ObjectType(classOf[Option[F[T, R]]]) def catalystRepr: DataType = i1.catalystRepr val innerJvmRepr = ObjectType(i2.runtimeClass) def fromCatalyst(path: Expression): Expression = { val javaValue = i1.fromCatalyst(path) val value = NewInstance(i2.runtimeClass, Seq(javaValue), innerJvmRepr) WrapOption(value, innerJvmRepr) } @inline def toCatalyst(path: Expression): Expression = { val value = UnwrapOption(innerJvmRepr, path) val javaValue = Invoke(value, "value", i1.jvmRepr, Nil) i1.toCatalyst(javaValue) } override def toString = s"optionRefined[${i2.runtimeClass.getName}]" }) /** * @tparam T the refined type (e.g. 
/** Implicit `Injection` and `TypedEncoder` instances for refined types. */
package object refined extends RefinedFieldEncoders {

  /**
    * Injection between a refined value `F[T, R]` and its base type `T`.
    *
    * Going back from `T` re-applies the refinement and throws an
    * `IllegalArgumentException` when the predicate is not satisfied.
    */
  implicit def refinedInjection[F[_, _], T, R](
      implicit
      refType: RefType[F],
      validate: Validate[T, R]
    ): Injection[F[T, R], T] = {
    // Re-validate a base value; a failed refinement is raised as an exception.
    def refineOrThrow(value: T): F[T, R] =
      refType.refine[R](value) match {
        case Right(res) => res

        case Left(errMsg) =>
          throw new IllegalArgumentException(
            s"Value $value does not satisfy refinement predicate: $errMsg")
      }

    Injection(refType.unwrap, refineOrThrow)
  }

  /** Typed encoder for `F[T, R]`, derived from the encoder of `T` through [[refinedInjection]]. */
  implicit def refinedEncoder[F[_, _], T, R](
      implicit
      i0: RefType[F],
      i1: Validate[T, R],
      i2: TypedEncoder[T],
      i3: ClassTag[F[T, R]]
    ): TypedEncoder[F[T, R]] = {
    val injection: Injection[F[T, R], T] = refinedInjection

    TypedEncoder.usingInjection(i3, injection, i2)
  }
}
// A refined type used on its own (not as a case class field): its encoder must
// expose the `StringType` column type and decode raw strings into refined values.
test("Encode a bare refined type") {
  import eu.timepit.refined.auto._
  import eu.timepit.refined.types.string.NonEmptyString

  val encoder: TypedEncoder[NonEmptyString] = {
    // Only `refinedEncoder` is imported, and only inside this block.
    import frameless.refined.refinedEncoder
    TypedEncoder[NonEmptyString]
  }

  val ss = session
  import ss.implicits._

  encoder.catalystRepr shouldBe StringType

  val nes: NonEmptyString = "Non Empty String"

  // The DataFrame holds the plain string (`nes.value`); `createUnsafe` applies
  // the refined encoder on top of it.
  val unsafeDs = TypedDataset.createUnsafe(sc.parallelize(Seq(nes.value)).toDF())(encoder)

  val expected = Seq(nes)

  unsafeDs.collect().run() shouldBe expected
}

// Case class `A` has a mandatory refined field `s`.
test("Encode case class with a refined field") {
  import RefinedTypesTests._

  // Check jvmRepr: the derived encoder represents the case class itself.
  import org.apache.spark.sql.types.ObjectType

  encoderA.jvmRepr shouldBe ObjectType(classOf[A])

  // Check catalystRepr: both columns are expected to be non-nullable.
  val expectedAStructType = StructType(Seq(
    StructField("a", IntegerType, false),
    StructField("s", StringType, false)))

  encoderA.catalystRepr shouldBe expectedAStructType

  // Check unsafe: build the dataset from an untyped Row holding the raw string.
  val unsafeDs: TypedDataset[A] = {
    val rdd = sc.parallelize(Seq(Row(as.a, as.s.toString)))
    val df = session.createDataFrame(rdd, expectedAStructType)

    TypedDataset.createUnsafe(df)(encoderA)
  }

  val expected = Seq(as)

  unsafeDs.collect().run() shouldBe expected

  // Check safe: build the dataset directly from typed values.
  val safeDs = TypedDataset.create(expected)

  safeDs.collect().run() shouldBe expected
}

// Case class `B` has an optional refined field `s`.
test("Encode case class with a refined optional field") {
  import RefinedTypesTests._

  // Check jvmRepr: the derived encoder represents the case class itself.
  encoderB.jvmRepr shouldBe ObjectType(classOf[B])

  // Check catalystRepr: the optional field `s` is expected to be nullable.
  val expectedBStructType = StructType(Seq(
    StructField("a", IntegerType, false),
    StructField("s", StringType, true)))

  encoderB.catalystRepr shouldBe expectedBStructType

  // Check unsafe: one row with a string and one row with a null in column `s`.
  val unsafeDs: TypedDataset[B] = {
    val rdd = sc.parallelize(Seq(
      Row(bs.a, bs.s.mkString),
      Row(2, null.asInstanceOf[String]),
    ))

    val df = session.createDataFrame(rdd, expectedBStructType)

    TypedDataset.createUnsafe(df)(encoderB)
  }

  // The null column is expected to decode to `None`.
  val expected = Seq(bs, B(2, None))

  unsafeDs.collect().run() shouldBe expected

  // Check safe: build the dataset directly from typed values.
  val safeDs = TypedDataset.create(expected)
safeDs.collect().run() shouldBe expected
}
}

// Fixtures used by `RefinedFieldEncoderTests`: case classes with a refined
// field (mandatory in `A`, optional in `B`), sample values, and the encoders
// derived for them.
object RefinedTypesTests {
  import eu.timepit.refined.auto._
  import eu.timepit.refined.types.string.NonEmptyString

  case class A(a: Int, s: NonEmptyString)
  case class B(a: Int, s: Option[NonEmptyString])

  val nes: NonEmptyString = "Non Empty String"

  val as = A(-42, nes)
  val bs = B(-42, Option(nes))

  import frameless.refined._ // implicit instances for refined

  // Declared after the import above, which brings the refined instances into scope.
  implicit val encoderA: TypedEncoder[A] = TypedEncoder.usingDerivation

  implicit val encoderB: TypedEncoder[B] = TypedEncoder.usingDerivation
}

================================================
FILE: scripts/docs-build.sh
================================================
#!/bin/bash

# Runs the sbt documentation tasks, builds the book with gitbook, and moves the
# generated book into the current directory.

# -e: stop at the first failing command, -u: fail on unset variables,
# -x: print each command before running it.
set -eux

sbt copyReadme mdoc

gitbook="node_modules/gitbook-cli/bin/gitbook.js"

# Install gitbook locally when its CLI entry point is not present yet.
if ! test -e $gitbook; then
  npm install gitbook
  npm install gitbook-cli
fi

$gitbook build mdocs/target/mdoc docs/book

# Move the generated book into the current directory.
mv docs/book/* .

exit 0

================================================
FILE: scripts/docs-publish.sh
================================================
#!/bin/bash

# Merges the current commit into the gh-pages branch, rebuilds the
# documentation there and commits the result.

set -eux

# Check that the working directory is a git repository and the repository has no outstanding changes.
git diff-index --quiet HEAD

# Abbreviated hash of the commit being published.
commit=$(git show -s --format=%h)

git checkout gh-pages

git merge "$commit"

bash scripts/docs-build.sh

git add .
git commit -am "Rebuild documentation ($commit)"

# Print manual follow-up steps. Nothing is pushed by this script.
# The preview hint uses the Python 3 module; `SimpleHTTPServer` exists only in Python 2.
echo "Verify that you didn't break anything:"
echo " $ python3 -m http.server 8000"
echo " $ xdg-open http://localhost:8000/"
echo ""
echo "Then push to the gh-pages branch:"
# `git push <name>` treats <name> as a remote, so the remote is named explicitly
# (assumes the conventional `origin` remote).
echo " $ git push origin gh-pages"

================================================
FILE: scripts/travis-publish.sh
================================================
#!/bin/bash

# Taken + modified from typelevel/cats
# https://github.com/typelevel/cats/blob/a8a7587f558541cbabc5c40053181928b4baf78c/scripts/travis-publish.sh

# Selects and runs one CI job according to $PHASE:
#   A - build the documentation (`doc tut`)
#   B - run the tests with coverage, build the report, upload it to codecov
#   C - clean, then run $publish_cmd
# Reads $TRAVIS_SCALA_VERSION to pick the Scala version passed to sbt.
# Any other value of $PHASE leaves run_cmd unset, so nothing is run.

export publish_cmd="publishLocal"

# if [[ $TRAVIS_PULL_REQUEST == "false" && $TRAVIS_BRANCH == "master" && $(cat version.sbt) =~ "-SNAPSHOT" ]]; then
#   export publish_cmd="common/publish cats/publish dataset/publish dataframe/publish"
# fi

sbt_cmd="sbt ++$TRAVIS_SCALA_VERSION -Dfile.encoding=UTF8 -J-XX:ReservedCodeCacheSize=256M"

case "$PHASE" in
  A)
    docs_cmd="$sbt_cmd doc tut"
    run_cmd="$docs_cmd"
    ;;
  B)
    # NOTE(review): this downloads a script from codecov.io and executes it.
    coverage="$sbt_cmd coverage test && sbt coverageReport && bash <(curl -s https://codecov.io/bash)"
    run_cmd="$coverage"
    ;;
  C)
    run_cmd="$sbt_cmd clean $publish_cmd"
    ;;
esac

# Quoted so the command string reaches `eval` intact, without an extra round of
# word splitting and glob expansion beforehand.
eval "$run_cmd"