Repository: adelbertc/frameless
Branch: master
Commit: 6826375be4c0
Files: 206
Total size: 773.8 KB

Directory structure:
gitextract_u5s1eutc/

├── .github/
│   ├── release-drafter.yml
│   └── workflows/
│       ├── ci.yml
│       ├── clean.yml
│       └── release-drafter.yml
├── .gitignore
├── .scalafmt.conf
├── LICENSE
├── README.md
├── build.sbt
├── cats/
│   └── src/
│       ├── main/
│       │   └── scala/
│       │       └── frameless/
│       │           └── cats/
│       │               ├── FramelessSyntax.scala
│       │               ├── SparkDelayInstances.scala
│       │               ├── SparkTask.scala
│       │               ├── implicits.scala
│       │               └── package.scala
│       └── test/
│           ├── resources/
│           │   ├── log4j.properties
│           │   └── log4j2.properties
│           └── scala/
│               └── frameless/
│                   └── cats/
│                       ├── FramelessSyntaxTests.scala
│                       └── test.scala
├── core/
│   └── src/
│       └── main/
│           └── scala/
│               └── frameless/
│                   ├── CatalystAverageable.scala
│                   ├── CatalystBitShift.scala
│                   ├── CatalystBitwise.scala
│                   ├── CatalystCast.scala
│                   ├── CatalystCollection.scala
│                   ├── CatalystDivisible.scala
│                   ├── CatalystIsin.scala
│                   ├── CatalystNaN.scala
│                   ├── CatalystNotNullable.scala
│                   ├── CatalystNumeric.scala
│                   ├── CatalystNumericWithJavaBigDecimal.scala
│                   ├── CatalystOrdered.scala
│                   ├── CatalystPivotable.scala
│                   ├── CatalystRound.scala
│                   ├── CatalystSummable.scala
│                   ├── CatalystVariance.scala
│                   ├── Injection.scala
│                   ├── SQLDate.scala
│                   └── SQLTimestamp.scala
├── dataset/
│   └── src/
│       ├── main/
│       │   ├── scala/
│       │   │   ├── frameless/
│       │   │   │   ├── FramelessSyntax.scala
│       │   │   │   ├── InjectionEnum.scala
│       │   │   │   ├── IsValueClass.scala
│       │   │   │   ├── Job.scala
│       │   │   │   ├── RecordEncoder.scala
│       │   │   │   ├── SparkDelay.scala
│       │   │   │   ├── TypedColumn.scala
│       │   │   │   ├── TypedColumnMacroImpl.scala
│       │   │   │   ├── TypedDataset.scala
│       │   │   │   ├── TypedDatasetForwarded.scala
│       │   │   │   ├── TypedEncoder.scala
│       │   │   │   ├── TypedExpressionEncoder.scala
│       │   │   │   ├── With.scala
│       │   │   │   ├── functions/
│       │   │   │   │   ├── AggregateFunctions.scala
│       │   │   │   │   ├── Lit.scala
│       │   │   │   │   ├── NonAggregateFunctions.scala
│       │   │   │   │   ├── Udf.scala
│       │   │   │   │   ├── UnaryFunctions.scala
│       │   │   │   │   └── package.scala
│       │   │   │   ├── ops/
│       │   │   │   │   ├── AggregateTypes.scala
│       │   │   │   │   ├── As.scala
│       │   │   │   │   ├── ColumnTypes.scala
│       │   │   │   │   ├── GroupByOps.scala
│       │   │   │   │   ├── RelationalGroupsOps.scala
│       │   │   │   │   ├── Repeat.scala
│       │   │   │   │   └── SmartProject.scala
│       │   │   │   └── syntax/
│       │   │   │       └── package.scala
│       │   │   └── org/
│       │   │       └── apache/
│       │   │           └── spark/
│       │   │               └── sql/
│       │   │                   ├── FramelessInternals.scala
│       │   │                   └── reflection/
│       │   │                       └── package.scala
│       │   ├── spark-3/
│       │   │   └── frameless/
│       │   │       └── MapGroups.scala
│       │   └── spark-3.4+/
│       │       └── frameless/
│       │           └── MapGroups.scala
│       └── test/
│           ├── resources/
│           │   ├── log4j.properties
│           │   └── log4j2.properties
│           ├── scala/
│           │   ├── frameless/
│           │   │   ├── AsTests.scala
│           │   │   ├── BitwiseTests.scala
│           │   │   ├── CastTests.scala
│           │   │   ├── ColTests.scala
│           │   │   ├── CollectTests.scala
│           │   │   ├── ColumnTests.scala
│           │   │   ├── ColumnViaLambdaTests.scala
│           │   │   ├── CreateTests.scala
│           │   │   ├── DropTest.scala
│           │   │   ├── DropTupledTest.scala
│           │   │   ├── EncoderTests.scala
│           │   │   ├── ExplodeTests.scala
│           │   │   ├── FilterTests.scala
│           │   │   ├── FlattenTests.scala
│           │   │   ├── GroupByTests.scala
│           │   │   ├── InjectionTests.scala
│           │   │   ├── IsValueClassTests.scala
│           │   │   ├── JobTests.scala
│           │   │   ├── JoinTests.scala
│           │   │   ├── LitTests.scala
│           │   │   ├── NumericTests.scala
│           │   │   ├── OrderByTests.scala
│           │   │   ├── RecordEncoderTests.scala
│           │   │   ├── SchemaTests.scala
│           │   │   ├── SelectTests.scala
│           │   │   ├── SelfJoinTests.scala
│           │   │   ├── TypedDatasetSuite.scala
│           │   │   ├── UdtEncodedClass.scala
│           │   │   ├── WithColumnTest.scala
│           │   │   ├── WithColumnTupledTest.scala
│           │   │   ├── XN.scala
│           │   │   ├── forward/
│           │   │   │   ├── CheckpointTests.scala
│           │   │   │   ├── ColumnsTests.scala
│           │   │   │   ├── CountTests.scala
│           │   │   │   ├── DistinctTests.scala
│           │   │   │   ├── ExceptTests.scala
│           │   │   │   ├── FirstTests.scala
│           │   │   │   ├── ForeachTests.scala
│           │   │   │   ├── HeadTests.scala
│           │   │   │   ├── InputFilesTests.scala
│           │   │   │   ├── IntersectTests.scala
│           │   │   │   ├── IsLocalTests.scala
│           │   │   │   ├── IsStreamingTests.scala
│           │   │   │   ├── LimitTests.scala
│           │   │   │   ├── QueryExecutionTests.scala
│           │   │   │   ├── RandomSplitTests.scala
│           │   │   │   ├── SQLContextTests.scala
│           │   │   │   ├── SparkSessionTests.scala
│           │   │   │   ├── StorageLevelTests.scala
│           │   │   │   ├── TakeTests.scala
│           │   │   │   ├── ToJSONTests.scala
│           │   │   │   ├── ToLocalIteratorTests.scala
│           │   │   │   ├── UnionTests.scala
│           │   │   │   ├── WriteStreamTests.scala
│           │   │   │   └── WriteTests.scala
│           │   │   ├── functions/
│           │   │   │   ├── AggregateFunctionsTests.scala
│           │   │   │   ├── DateTimeStringBehaviourUtils.scala
│           │   │   │   ├── DoubleBehaviourUtils.scala
│           │   │   │   ├── NonAggregateFunctionsTests.scala
│           │   │   │   ├── UdfTests.scala
│           │   │   │   └── UnaryFunctionsTest.scala
│           │   │   ├── ops/
│           │   │   │   ├── ColumnTypesTest.scala
│           │   │   │   ├── CubeTests.scala
│           │   │   │   ├── PivotTest.scala
│           │   │   │   ├── RepeatTest.scala
│           │   │   │   ├── RollupTests.scala
│           │   │   │   ├── SmartProjectTest.scala
│           │   │   │   └── deserialized/
│           │   │   │       ├── FilterTests.scala
│           │   │   │       ├── FlatMapTests.scala
│           │   │   │       ├── MapPartitionsTests.scala
│           │   │   │       ├── MapTests.scala
│           │   │   │       └── ReduceTests.scala
│           │   │   ├── package.scala
│           │   │   ├── sql/
│           │   │   │   ├── package.scala
│           │   │   │   └── rules/
│           │   │   │       └── SQLRulesSuite.scala
│           │   │   └── syntax/
│           │   │       └── FramelessSyntaxTests.scala
│           │   └── org/
│           │       └── apache/
│           │           └── hadoop/
│           │               └── fs/
│           │                   └── local/
│           │                       └── StreamingFS.scala
│           ├── spark-3.2/
│           │   └── frameless/
│           │       └── sql/
│           │           └── rules/
│           │               └── FramelessLitPushDownTests.scala
│           └── spark-3.3+/
│               └── frameless/
│                   └── sql/
│                       └── rules/
│                           └── FramelessLitPushDownTests.scala
├── docs/
│   ├── Cats.md
│   ├── FeatureOverview.md
│   ├── Injection.md
│   ├── Job.md
│   ├── TypedDataFrame.md
│   ├── TypedDatasetVsSparkDataset.md
│   ├── TypedEncoder.md
│   ├── TypedML.md
│   ├── WorkingWithCsvParquetJson.md
│   ├── directory.conf
│   ├── iris.data
│   └── iris.parquet
├── github.sbt
├── ml/
│   └── src/
│       ├── main/
│       │   └── scala/
│       │       ├── frameless/
│       │       │   └── ml/
│       │       │       ├── TypedEstimator.scala
│       │       │       ├── TypedTransformer.scala
│       │       │       ├── classification/
│       │       │       │   └── TypedRandomForestClassifier.scala
│       │       │       ├── clustering/
│       │       │       │   ├── TypedBisectingKMeans.scala
│       │       │       │   └── TypedKMeans.scala
│       │       │       ├── feature/
│       │       │       │   ├── TypedIndexToString.scala
│       │       │       │   ├── TypedStringIndexer.scala
│       │       │       │   └── TypedVectorAssembler.scala
│       │       │       ├── internals/
│       │       │       │   ├── LinearInputsChecker.scala
│       │       │       │   ├── SelectorByValue.scala
│       │       │       │   ├── TreesInputsChecker.scala
│       │       │       │   ├── UnaryInputsChecker.scala
│       │       │       │   └── VectorInputsChecker.scala
│       │       │       ├── package.scala
│       │       │       ├── params/
│       │       │       │   ├── kmeans/
│       │       │       │   │   └── KMeansInitMode.scala
│       │       │       │   ├── linears/
│       │       │       │   │   ├── LossStrategy.scala
│       │       │       │   │   └── Solver.scala
│       │       │       │   └── trees/
│       │       │       │       └── FeatureSubsetStrategy.scala
│       │       │       └── regression/
│       │       │           ├── TypedLinearRegression.scala
│       │       │           └── TypedRandomForestRegressor.scala
│       │       └── org/
│       │           └── apache/
│       │               └── spark/
│       │                   └── ml/
│       │                       └── FramelessInternals.scala
│       └── test/
│           └── scala/
│               └── frameless/
│                   └── ml/
│                       ├── FramelessMlSuite.scala
│                       ├── Generators.scala
│                       ├── TypedEncoderInstancesTests.scala
│                       ├── classification/
│                       │   ├── ClassificationIntegrationTests.scala
│                       │   └── TypedRandomForestClassifierTests.scala
│                       ├── clustering/
│                       │   ├── BisectingKMeansTests.scala
│                       │   ├── ClusteringIntegrationTests.scala
│                       │   └── KMeansTests.scala
│                       ├── feature/
│                       │   ├── TypedIndexToStringTests.scala
│                       │   ├── TypedStringIndexerTests.scala
│                       │   └── TypedVectorAssemblerTests.scala
│                       └── regression/
│                           ├── RegressionIntegrationTests.scala
│                           ├── TypedLinearRegressionTests.scala
│                           └── TypedRandomForestRegressorTests.scala
├── project/
│   ├── Common.scala
│   ├── build.properties
│   └── plugins.sbt
├── refined/
│   └── src/
│       ├── main/
│       │   └── scala/
│       │       └── frameless/
│       │           └── refined/
│       │               ├── RefinedFieldEncoders.scala
│       │               └── package.scala
│       └── test/
│           └── scala/
│               └── frameless/
│                   └── RefinedFieldEncoderTests.scala
└── scripts/
    ├── docs-build.sh
    ├── docs-publish.sh
    └── travis-publish.sh

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/release-drafter.yml
================================================
name-template: 'v$NEXT_MINOR_VERSION'
tag-template: 'v$NEXT_MINOR_VERSION'
categories:
  - title: 'Added'
    labels:
      - 'feature'
  - title: 'Changed'
    labels:
      - 'enhancement'
      - 'dependency-update'
  - title: 'Fixed'
    labels:
      - 'fix'
      - 'bug'
include-labels:
  - 'feature'
  - 'enhancement'
  - 'dependency-update'
  - 'fix'
  - 'bug'
exclude-labels:
  - 'skip-changelog'
  - 'documentation'
  - 'build/process improvement'
change-template: '- $TITLE [#$NUMBER](https://github.com/typelevel/frameless/pull/$NUMBER) (@$AUTHOR)'
template: |
  $CHANGES


================================================
FILE: .github/workflows/ci.yml
================================================
# This file was automatically generated by sbt-github-actions using the
# githubWorkflowGenerate task. You should add and commit this file to
# your git repository. It goes without saying that you shouldn't edit
# this file by hand! Instead, if you wish to make changes, you should
# change your sbt build configuration to revise the workflow description
# to meet your needs, then regenerate this file.

name: Continuous Integration

on:
  pull_request:
    branches: ['**', '!update/**', '!pr/**']
  push:
    branches: ['**', '!update/**', '!pr/**']
    tags: [v*]

env:
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  SBT_OPTS: '-Xms1g -Xmx4g'
  SPARK_LOCAL_IP: localhost


concurrency:
  group: ${{ github.workflow }} @ ${{ github.ref }}
  cancel-in-progress: true

jobs:
  build:
    name: Test
    strategy:
      matrix:
        os: [ubuntu-22.04]
        scala: [2.13, 2.12]
        java: [temurin@8]
        project: [root-spark33, root-spark34, root-spark35]
        exclude:
          - scala: 2.13
            project: root-spark33
          - scala: 2.13
            project: root-spark34
    runs-on: ${{ matrix.os }}
    timeout-minutes: 60
    steps:
      - name: Checkout current branch (full)
        uses: actions/checkout@v6
        with:
          fetch-depth: 0

      - name: Setup sbt
        uses: sbt/setup-sbt@v1

      - name: Setup Java (temurin@8)
        id: setup-java-temurin-8
        if: matrix.java == 'temurin@8'
        uses: actions/setup-java@v5
        with:
          distribution: temurin
          java-version: 8
          cache: sbt

      - name: sbt update
        if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false'
        run: sbt +update

      - name: Check that workflows are up to date
        run: sbt githubWorkflowCheck

      - name: Check formatting
        if: matrix.java == 'temurin@8' && matrix.os == 'ubuntu-22.04'
        run: sbt '++ ${{ matrix.scala }}' 'project ${{ matrix.project }}' scalafmtCheckAll 'project /' scalafmtSbtCheck

      - name: Test & Compute Coverage
        run: sbt '++ ${{ matrix.scala }}' 'project ${{ matrix.project }}' coverage test test/coverageReport

      - name: Check binary compatibility
        if: matrix.java == 'temurin@8' && matrix.os == 'ubuntu-22.04'
        run: sbt '++ ${{ matrix.scala }}' 'project ${{ matrix.project }}' mimaReportBinaryIssues

      - name: Generate API documentation
        if: matrix.java == 'temurin@8' && matrix.os == 'ubuntu-22.04'
        run: sbt '++ ${{ matrix.scala }}' 'project ${{ matrix.project }}' doc

      - uses: codecov/codecov-action@v3
        with:
          flags: ${{ matrix.scala }}-${{ matrix.project }}

  publish:
    name: Publish Artifacts
    needs: [build]
    if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/master')
    strategy:
      matrix:
        os: [ubuntu-22.04]
        java: [temurin@8]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout current branch (full)
        uses: actions/checkout@v6
        with:
          fetch-depth: 0

      - name: Setup sbt
        uses: sbt/setup-sbt@v1

      - name: Setup Java (temurin@8)
        id: setup-java-temurin-8
        if: matrix.java == 'temurin@8'
        uses: actions/setup-java@v5
        with:
          distribution: temurin
          java-version: 8
          cache: sbt

      - name: sbt update
        if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false'
        run: sbt +update

      - name: Import signing key
        if: env.PGP_SECRET != '' && env.PGP_PASSPHRASE == ''
        env:
          PGP_SECRET: ${{ secrets.PGP_SECRET }}
          PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }}
        run: echo $PGP_SECRET | base64 -d -i - | gpg --import

      - name: Import signing key and strip passphrase
        if: env.PGP_SECRET != '' && env.PGP_PASSPHRASE != ''
        env:
          PGP_SECRET: ${{ secrets.PGP_SECRET }}
          PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }}
        run: |
          echo "$PGP_SECRET" | base64 -d -i - > /tmp/signing-key.gpg
          echo "$PGP_PASSPHRASE" | gpg --pinentry-mode loopback --passphrase-fd 0 --import /tmp/signing-key.gpg
          (echo "$PGP_PASSPHRASE"; echo; echo) | gpg --command-fd 0 --pinentry-mode loopback --change-passphrase $(gpg --list-secret-keys --with-colons 2> /dev/null | grep '^sec:' | cut --delimiter ':' --fields 5 | tail -n 1)

      - name: Publish
        env:
          SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }}
          SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }}
          SONATYPE_CREDENTIAL_HOST: ${{ secrets.SONATYPE_CREDENTIAL_HOST }}
        run: sbt tlCiRelease

  dependency-submission:
    name: Submit Dependencies
    if: github.event.repository.fork == false && github.event_name != 'pull_request'
    strategy:
      matrix:
        os: [ubuntu-22.04]
        java: [temurin@8]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout current branch (full)
        uses: actions/checkout@v6
        with:
          fetch-depth: 0

      - name: Setup sbt
        uses: sbt/setup-sbt@v1

      - name: Setup Java (temurin@8)
        id: setup-java-temurin-8
        if: matrix.java == 'temurin@8'
        uses: actions/setup-java@v5
        with:
          distribution: temurin
          java-version: 8
          cache: sbt

      - name: sbt update
        if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false'
        run: sbt +update

      - name: Submit Dependencies
        uses: scalacenter/sbt-dependency-submission@v2
        with:
          modules-ignore: root-spark33_2.13 root-spark33_2.12 docs_2.13 docs_2.12 root-spark34_2.13 root-spark34_2.12 root-spark35_2.13 root-spark35_2.12
          configs-ignore: test scala-tool scala-doc-tool test-internal

  site:
    name: Generate Site
    strategy:
      matrix:
        os: [ubuntu-22.04]
        java: [temurin@11]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout current branch (full)
        uses: actions/checkout@v6
        with:
          fetch-depth: 0

      - name: Setup sbt
        uses: sbt/setup-sbt@v1

      - name: Setup Java (temurin@8)
        id: setup-java-temurin-8
        if: matrix.java == 'temurin@8'
        uses: actions/setup-java@v5
        with:
          distribution: temurin
          java-version: 8
          cache: sbt

      - name: sbt update
        if: matrix.java == 'temurin@8' && steps.setup-java-temurin-8.outputs.cache-hit == 'false'
        run: sbt +update

      - name: Setup Java (temurin@11)
        id: setup-java-temurin-11
        if: matrix.java == 'temurin@11'
        uses: actions/setup-java@v5
        with:
          distribution: temurin
          java-version: 11
          cache: sbt

      - name: sbt update
        if: matrix.java == 'temurin@11' && steps.setup-java-temurin-11.outputs.cache-hit == 'false'
        run: sbt +update

      - name: Generate site
        run: sbt docs/tlSite

      - name: Publish site
        if: github.event_name != 'pull_request' && github.ref == 'refs/heads/master'
        uses: peaceiris/actions-gh-pages@v4.0.0
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: mdocs/target/docs/site
          keep_files: true


================================================
FILE: .github/workflows/clean.yml
================================================
# This file was automatically generated by sbt-github-actions using the
# githubWorkflowGenerate task. You should add and commit this file to
# your git repository. It goes without saying that you shouldn't edit
# this file by hand! Instead, if you wish to make changes, you should
# change your sbt build configuration to revise the workflow description
# to meet your needs, then regenerate this file.

name: Clean

on: push

jobs:
  delete-artifacts:
    name: Delete Artifacts
    runs-on: ubuntu-latest
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: Delete artifacts
        run: |
          # Customize those three lines with your repository and credentials:
          REPO=${GITHUB_API_URL}/repos/${{ github.repository }}

          # A shortcut to call GitHub API.
          ghapi() { curl --silent --location --user _:$GITHUB_TOKEN "$@"; }

          # A temporary file which receives HTTP response headers.
          TMPFILE=/tmp/tmp.$$

          # An associative array, key: artifact name, value: number of artifacts of that name.
          declare -A ARTCOUNT

          # Process all artifacts on this repository, loop on returned "pages".
          URL=$REPO/actions/artifacts
          while [[ -n "$URL" ]]; do

            # Get current page, get response headers in a temporary file.
            JSON=$(ghapi --dump-header $TMPFILE "$URL")

            # Get URL of next page. Will be empty if we are at the last page.
            URL=$(grep '^Link:' "$TMPFILE" | tr ',' '\n' | grep 'rel="next"' | head -1 | sed -e 's/.*<//' -e 's/>.*//')
            rm -f $TMPFILE

            # Number of artifacts on this page:
            COUNT=$(( $(jq <<<$JSON -r '.artifacts | length') ))

            # Loop on all artifacts on this page.
            for ((i=0; $i < $COUNT; i++)); do

              # Get name of artifact and count instances of this name.
              name=$(jq <<<$JSON -r ".artifacts[$i].name?")
              ARTCOUNT[$name]=$(( $(( ${ARTCOUNT[$name]} )) + 1))

              id=$(jq <<<$JSON -r ".artifacts[$i].id?")
              size=$(( $(jq <<<$JSON -r ".artifacts[$i].size_in_bytes?") ))
              printf "Deleting '%s' #%d, %'d bytes\n" $name ${ARTCOUNT[$name]} $size
              ghapi -X DELETE $REPO/actions/artifacts/$id
            done
          done


================================================
FILE: .github/workflows/release-drafter.yml
================================================
name: Release Drafter

on:
  push:
    branches:
      - master
  pull_request:
    types: [opened, reopened, synchronize]

jobs:
  update_release_draft:
    runs-on: ubuntu-latest
    steps:
      - uses: release-drafter/release-drafter@v5.15.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


================================================
FILE: .gitignore
================================================
*.class
*.log

# sbt specific
.bsp/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/

# Scala-IDE specific
.scala_dependencies
.cache
.classpath
.project
.worksheet/
bin/
.settings/
.ensime
.ensime_cache/

# IntelliJ specific
.idea

# OS X
.DS_Store
node_modules

# VSCode
.history
.metals
.vscode
.bloop
metals.sbt


================================================
FILE: .scalafmt.conf
================================================
version = 3.8.6
runner.dialect = scala213

newlines.beforeMultilineDef = keep
newlines.topLevelStatements = [before]
newlines.beforeCurlyLambdaParams = multilineWithCaseOnly
newlines.afterCurlyLambdaParams = squash
newlines.implicitParamListModifierForce = [after]
newlines.avoidForSimpleOverflow = [tooLong]
newlines.avoidInResultType = true
newlines.sometimesBeforeColonInMethodReturnType = false
newlines.beforeTypeBounds = keep

verticalMultiline.atDefnSite = true
verticalMultiline.arityThreshold = 10

spaces.inImportCurlyBraces = true

includeCurlyBraceInSelectChains = false
includeNoParensInSelectChains = false
optIn.breakChainOnFirstMethodDot = false

docstrings.style = Asterisk
docstrings.wrap = no

literals.long=Upper
literals.float=Upper
literals.double=Upper


================================================
FILE: LICENSE
================================================
Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright {yyyy} {name of copyright owner}

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# Frameless

[![Workflow Badge](https://github.com/typelevel/frameless/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/typelevel/frameless/actions/workflows/ci.yml)
[![Codecov Badge](https://codecov.io/gh/typelevel/frameless/branch/master/graph/badge.svg)](https://codecov.io/gh/typelevel/frameless)
[![Discord Badge](https://img.shields.io/badge/chat-on%20discord-46BC99)](https://discord.gg/ZDZsxWcBJt)
[![Maven Badge](https://img.shields.io/maven-central/v/org.typelevel/frameless-core_2.12?color=blue)](https://search.maven.org/search?q=g:org.typelevel%20and%20frameless)
[![Snapshots Badge](https://img.shields.io/nexus/s/https/s01.oss.sonatype.org/org.typelevel/frameless-core_2.12)](https://s01.oss.sonatype.org/content/repositories/snapshots/org/typelevel/frameless-core_2.12/)

Frameless is a Scala library for working with [Spark](http://spark.apache.org/) using more expressive types.
It consists of the following modules:

* `frameless-dataset` for a more strongly typed `Dataset`/`DataFrame` API
* `frameless-ml` for a more strongly typed Spark ML API based on `frameless-dataset`
* `frameless-cats` for using Spark's `RDD` API with [cats](https://github.com/typelevel/cats)

Note that while Frameless is still getting off the ground, it is very possible that breaking changes will be
made for at least the next few versions.

The Frameless project and contributors support the
[Typelevel](http://typelevel.org/) [Code of Conduct](http://typelevel.org/code-of-conduct.html) and want all its
associated channels (e.g. GitHub, Discord) to be a safe and friendly environment for contributing and learning.

## Versions and dependencies

The compatible versions of [Spark](http://spark.apache.org/) and
[cats](https://github.com/typelevel/cats) are as follows:

| Frameless | Spark                       | Cats     | Cats-Effect | Scala       |
|-----------|-----------------------------|----------|-------------|-------------|
| 0.16.0    | 3.5.0 / 3.4.0 / 3.3.0       | 2.x      | 3.x         | 2.12 / 2.13 |
| 0.15.0    | 3.4.0 / 3.3.0 / 3.2.2       | 2.x      | 3.x         | 2.12 / 2.13 |
| 0.14.1    | 3.4.0 / 3.3.0 / 3.2.2       | 2.x      | 3.x         | 2.12 / 2.13 |
| 0.14.0    | 3.3.0 / 3.2.2 / 3.1.3       | 2.x      | 3.x         | 2.12 / 2.13 |
| 0.13.0    | 3.3.0 / 3.2.2 / 3.1.3       | 2.x      | 3.x         | 2.12 / 2.13 |
| 0.12.0    | 3.2.1 / 3.1.3 / 3.0.3       | 2.x      | 3.x         | 2.12 / 2.13 |
| 0.11.1    | 3.2.0 / 3.1.2 / 3.0.1       | 2.x      | 2.x         | 2.12 / 2.13 |
| 0.11.0*   | 3.2.0 / 3.1.2 / 3.0.1       | 2.x      | 2.x         | 2.12 / 2.13 |
| 0.10.1    | 3.1.0                       | 2.x      | 2.x         | 2.12        |
| 0.9.0     | 3.0.0                       | 1.x      | 1.x         | 2.12        |
| 0.8.0     | 2.4.0                       | 1.x      | 1.x         | 2.11 / 2.12 |
| 0.7.0     | 2.3.1                       | 1.x      | 1.x         | 2.11        |
| 0.6.1     | 2.3.0                       | 1.x      | 0.8         | 2.11        |
| 0.5.2     | 2.2.1                       | 1.x      | 0.8         | 2.11        |
| 0.4.1     | 2.2.0                       | 1.x      | 0.8         | 2.11        |
| 0.4.0     | 2.2.0                       | 1.0.0-IF | 0.4         | 2.11        |

_\* 0.11.0 has broken Spark 3.1.2 and 3.0.1 artifacts published._

Starting 0.11 we introduced Spark cross published artifacts:

* By default, frameless artifacts depend on the most recent Spark version
* Suffix `-spark{major}{minor}` is added to artifacts that are released for the previous Spark version(s)

Artifact names examples:

* `frameless-dataset` (the latest Spark dependency)
* `frameless-dataset-spark33` (Spark 3.3.x dependency)
* `frameless-dataset-spark32` (Spark 3.2.x dependency)

Versions 0.5.x and 0.6.x have identical features. The first is compatible with Spark 2.2.1 and the second with 2.3.0.

The **only** dependency of the `frameless-dataset` module is on [shapeless](https://github.com/milessabin/shapeless) 2.3.2.
Therefore, depending on `frameless-dataset`, has a minimal overhead on your Spark's application jar.
Only the `frameless-cats` module depends on cats and cats-effect, so if you prefer to work just with `Datasets` and not with `RDD`s,
you may choose not to depend on `frameless-cats`.

Frameless intentionally **does not** have a compile dependency on Spark.
This essentially allows you to use any version of Frameless with any version of Spark.
The aforementioned table simply provides the versions of Spark we officially compile
and test Frameless with, but other versions may probably work as well.

### Breaking changes in 0.9

* Spark 3 introduces a new ExpressionEncoder approach, the schema for single value DataFrame's is now ["value"](https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala#L270) not "_1".

## Why?

Frameless introduces a new Spark API, called `TypedDataset`.
The benefits of using `TypedDataset` compared to the standard Spark `Dataset` API are as follows:

* Typesafe columns referencing (e.g., no more runtime errors when accessing non-existing columns)
* Customizable, typesafe encoders (e.g., if a type does not have an encoder, it should not compile)
* Enhanced type signature for built-in functions (e.g., if you apply an arithmetic operation on a non-numeric column, you
get a compilation error)
* Typesafe casting and projections

Click [here](http://typelevel.org/frameless/TypedDatasetVsSparkDataset.html) for a
detailed comparison of `TypedDataset` with Spark's `Dataset` API.

## Documentation

* [TypedDataset: Feature Overview](http://typelevel.org/frameless/FeatureOverview.html)
* [Typed Spark ML](http://typelevel.org/frameless/TypedML.html)
* [Comparing TypedDatasets with Spark's Datasets](http://typelevel.org/frameless/TypedDatasetVsSparkDataset.html)
* [Typed Encoders in Frameless](http://typelevel.org/frameless/TypedEncoder.html)
* [Injection: Creating Custom Encoders](http://typelevel.org/frameless/Injection.html)
* [Job\[A\]](http://typelevel.org/frameless/Job.html)
* [Using Cats with RDDs](http://typelevel.org/frameless/Cats.html)
* [Proof of Concept: TypedDataFrame](http://typelevel.org/frameless/TypedDataFrame.html)

## Quick Start

Since the 0.9.x release, Frameless is compiled only against Scala 2.12.x.

To use Frameless in your project add the following in your `build.sbt` file as needed:

```scala
val framelessVersion = "<latest version>"

resolvers ++= Seq(
  // for snapshot artifacts only
  "s01-oss-sonatype" at "https://s01.oss.sonatype.org/content/repositories/snapshots"
)

libraryDependencies ++= List(
  "org.typelevel" %% "frameless-dataset" % framelessVersion,
  "org.typelevel" %% "frameless-ml"      % framelessVersion,
  "org.typelevel" %% "frameless-cats"    % framelessVersion
)
```

An easy way to bootstrap a Frameless sbt project:

* if you have [Giter8][g8] installed then simply:

```bash
g8 imarios/frameless.g8
```

- with sbt >= 0.13.13:

```bash
sbt new imarios/frameless.g8
```

Typing `sbt console` inside your project will bring up a shell with Frameless
and all its dependencies loaded (including Spark).

## Need help?

Feel free to messages us on our [discord](https://discord.gg/ZDZsxWcBJt)
channel for any issues/questions.

## Development

We require at least _one_ sign-off (thumbs-up, +1, or similar) to merge pull requests. The current maintainers
(people who can merge pull requests) are:

* [adelbertc](https://github.com/adelbertc)
* [imarios](https://github.com/imarios)
* [kanterov](https://github.com/kanterov)
* [non](https://github.com/non)
* [OlivierBlanvillain](https://github.com/OlivierBlanvillain/)

### Testing

Frameless contains several property tests.  To avoid `OutOfMemoryError`s, we
tune the default generator sizes.  The following environment variables may
be set to adjust the size of generated collections in the `TypedDataSet` suite:

| Property                    | Default |
|-----------------------------|--------:|
| FRAMELESS_GEN_MIN_SIZE      |       0 |
| FRAMELESS_GEN_SIZE_RANGE    |      20 |

## License

Code is provided under the Apache 2.0 license available at <http://opensource.org/licenses/Apache-2.0>,
as well as in the LICENSE file. This is the same license used as Spark.

[g8]: http://www.foundweekends.org/giter8/


================================================
FILE: build.sbt
================================================
val sparkVersion = "3.5.8"
val spark34Version = "3.4.4"
val spark33Version = "3.3.4"
val catsCoreVersion = "2.13.0"
val catsEffectVersion = "3.7.0"
val catsMtlVersion = "1.6.0"
val scalatest = "3.2.20"
val scalatestplus = "3.1.0.0-RC2"
val shapeless = "2.3.13"
val scalacheck = "1.19.0"
val scalacheckEffect = "2.1.0"
val refinedVersion = "0.11.3"
val nakedFSVersion = "0.1.0"

val Scala212 = "2.12.20"
val Scala213 = "2.13.18"

ThisBuild / tlBaseVersion := "0.16"

ThisBuild / crossScalaVersions := Seq(Scala213, Scala212)
ThisBuild / scalaVersion := Scala212
ThisBuild / coverageScalacPluginVersion := "2.3.0"

lazy val root = project
  .in(file("."))
  .enablePlugins(NoPublishPlugin)
  .settings(crossScalaVersions := Nil)
  .aggregate(
    `root-spark35`,
    `root-spark34`,
    `root-spark33`,
    docs
  )

lazy val `root-spark35` = project
  .in(file(".spark35"))
  .enablePlugins(NoPublishPlugin)
  .aggregate(core, cats, dataset, refined, ml)

lazy val `root-spark34` = project
  .in(file(".spark34"))
  .enablePlugins(NoPublishPlugin)
  .aggregate(
    core,
    `cats-spark34`,
    `dataset-spark34`,
    `refined-spark34`,
    `ml-spark34`
  )

lazy val `root-spark33` = project
  .in(file(".spark33"))
  .enablePlugins(NoPublishPlugin)
  .aggregate(
    core,
    `cats-spark33`,
    `dataset-spark33`,
    `refined-spark33`,
    `ml-spark33`
  )

lazy val core =
  project.settings(name := "frameless-core").settings(framelessSettings)

lazy val cats = project
  .settings(name := "frameless-cats")
  .settings(catsSettings)
  .dependsOn(dataset % "test->test;compile->compile;provided->provided")

lazy val `cats-spark34` = project
  .settings(name := "frameless-cats-spark34")
  .settings(sourceDirectory := (cats / sourceDirectory).value)
  .settings(catsSettings)
  .settings(spark34Settings)
  .dependsOn(
    `dataset-spark34` % "test->test;compile->compile;provided->provided"
  )

lazy val `cats-spark33` = project
  .settings(name := "frameless-cats-spark33")
  .settings(sourceDirectory := (cats / sourceDirectory).value)
  .settings(catsSettings)
  .settings(spark33Settings)
  .dependsOn(
    `dataset-spark33` % "test->test;compile->compile;provided->provided"
  )

lazy val dataset = project
  .settings(name := "frameless-dataset")
  .settings(
    Compile / unmanagedSourceDirectories += baseDirectory.value / "src" / "main" / "spark-3.4+"
  )
  .settings(
    Test / unmanagedSourceDirectories += baseDirectory.value / "src" / "test" / "spark-3.3+"
  )
  .settings(datasetSettings)
  .settings(sparkDependencies(sparkVersion))
  .dependsOn(core % "test->test;compile->compile")

lazy val `dataset-spark34` = project
  .settings(name := "frameless-dataset-spark34")
  .settings(sourceDirectory := (dataset / sourceDirectory).value)
  .settings(
    Compile / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "main" / "spark-3.4+"
  )
  .settings(
    Test / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "test" / "spark-3.3+"
  )
  .settings(datasetSettings)
  .settings(sparkDependencies(spark34Version))
  .settings(spark34Settings)
  .dependsOn(core % "test->test;compile->compile")

lazy val `dataset-spark33` = project
  .settings(name := "frameless-dataset-spark33")
  .settings(sourceDirectory := (dataset / sourceDirectory).value)
  .settings(
    Compile / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "main" / "spark-3"
  )
  .settings(
    Test / unmanagedSourceDirectories += (dataset / baseDirectory).value / "src" / "test" / "spark-3.3+"
  )
  .settings(datasetSettings)
  .settings(sparkDependencies(spark33Version))
  .settings(spark33Settings)
  .dependsOn(core % "test->test;compile->compile")

lazy val refined = project
  .settings(name := "frameless-refined")
  .settings(refinedSettings)
  .dependsOn(dataset % "test->test;compile->compile;provided->provided")

lazy val `refined-spark34` = project
  .settings(name := "frameless-refined-spark34")
  .settings(sourceDirectory := (refined / sourceDirectory).value)
  .settings(refinedSettings)
  .settings(spark34Settings)
  .dependsOn(
    `dataset-spark34` % "test->test;compile->compile;provided->provided"
  )

lazy val `refined-spark33` = project
  .settings(name := "frameless-refined-spark33")
  .settings(sourceDirectory := (refined / sourceDirectory).value)
  .settings(refinedSettings)
  .settings(spark33Settings)
  .dependsOn(
    `dataset-spark33` % "test->test;compile->compile;provided->provided"
  )

lazy val ml = project
  .settings(name := "frameless-ml")
  .settings(mlSettings)
  .settings(sparkMlDependencies(sparkVersion))
  .dependsOn(
    core % "test->test;compile->compile",
    dataset % "test->test;compile->compile;provided->provided"
  )

lazy val `ml-spark34` = project
  .settings(name := "frameless-ml-spark34")
  .settings(sourceDirectory := (ml / sourceDirectory).value)
  .settings(mlSettings)
  .settings(sparkMlDependencies(spark34Version))
  .settings(spark34Settings)
  .dependsOn(
    core % "test->test;compile->compile",
    `dataset-spark34` % "test->test;compile->compile;provided->provided"
  )

lazy val `ml-spark33` = project
  .settings(name := "frameless-ml-spark33")
  .settings(sourceDirectory := (ml / sourceDirectory).value)
  .settings(mlSettings)
  .settings(sparkMlDependencies(spark33Version))
  .settings(spark33Settings)
  .dependsOn(
    core % "test->test;compile->compile",
    `dataset-spark33` % "test->test;compile->compile;provided->provided"
  )

lazy val docs = project
  .in(file("mdocs"))
  .settings(framelessSettings)
  .settings(scalacOptions --= Seq("-Xfatal-warnings", "-Ywarn-unused-import"))
  .enablePlugins(TypelevelSitePlugin)
  .settings(sparkDependencies(sparkVersion, Compile))
  .settings(sparkMlDependencies(sparkVersion, Compile))
  .settings(
    addCompilerPlugin(
      "org.typelevel" % "kind-projector" % "0.13.4" cross CrossVersion.full
    ),
    scalacOptions += "-Ydelambdafy:inline",
    libraryDependencies += "org.typelevel" %% "mouse" % "1.3.2"
  )
  .dependsOn(dataset, cats, ml)

def sparkDependencies(
    sparkVersion: String,
    scope: Configuration = Provided
  ) = Seq(
  libraryDependencies ++= Seq(
    "org.apache.spark" %% "spark-core" % sparkVersion % scope,
    "org.apache.spark" %% "spark-sql" % sparkVersion % scope
  )
)

def sparkMlDependencies(sparkVersion: String, scope: Configuration = Provided) =
  Seq(
    libraryDependencies += "org.apache.spark" %% "spark-mllib" % sparkVersion % scope
  )

lazy val catsSettings = framelessSettings ++ Seq(
  addCompilerPlugin(
    "org.typelevel" % "kind-projector" % "0.13.4" cross CrossVersion.full
  ),
  libraryDependencies ++= Seq(
    "org.typelevel" %% "cats-core" % catsCoreVersion,
    "org.typelevel" %% "cats-effect" % catsEffectVersion,
    "org.typelevel" %% "cats-mtl" % catsMtlVersion,
    "org.typelevel" %% "alleycats-core" % catsCoreVersion,
    "org.typelevel" %% "scalacheck-effect" % scalacheckEffect % Test
  )
)

lazy val datasetSettings =
  framelessSettings ++ framelessTypedDatasetREPL ++ Seq(
    mimaBinaryIssueFilters ++= {
      import com.typesafe.tools.mima.core._

      val imt = ProblemFilters.exclude[IncompatibleMethTypeProblem](_)
      val mc = ProblemFilters.exclude[MissingClassProblem](_)
      val dmm = ProblemFilters.exclude[DirectMissingMethodProblem](_)

      // TODO: Remove have version bump
      Seq(
        imt("frameless.TypedEncoder.mapEncoder"),
        imt("frameless.TypedEncoder.arrayEncoder"),
        imt("frameless.RecordEncoderFields.deriveRecordCons"),
        imt("frameless.RecordEncoderFields.deriveRecordLast"),
        mc("frameless.functions.FramelessLit"),
        mc(f"frameless.functions.FramelessLit$$"),
        dmm("frameless.functions.package.litAggr"),
        dmm("org.apache.spark.sql.FramelessInternals.column")
      )
    },
    coverageExcludedPackages := "org.apache.spark.sql.reflection",
    libraryDependencies += "com.globalmentor" % "hadoop-bare-naked-local-fs" % nakedFSVersion % Test exclude (
      "org.apache.hadoop",
      "hadoop-commons"
    )
  )

lazy val refinedSettings =
  framelessSettings ++ framelessTypedDatasetREPL ++ Seq(
    libraryDependencies += "eu.timepit" %% "refined" % refinedVersion
  )

lazy val mlSettings = framelessSettings ++ framelessTypedDatasetREPL

lazy val scalac212Options = Seq(
  "-Xlint:-missing-interpolator,-unused,_",
  "-target:jvm-1.8",
  "-deprecation",
  "-encoding",
  "UTF-8",
  "-feature",
  "-unchecked",
  "-Xfatal-warnings",
  "-Yno-adapted-args",
  "-Ywarn-dead-code",
  "-Ywarn-numeric-widen",
  "-Ywarn-unused-import",
  "-Ywarn-value-discard",
  "-language:existentials",
  "-language:implicitConversions",
  "-language:higherKinds",
  "-Xfuture",
  "-Ypartial-unification"
)

lazy val scalac213Options = {
  val exclusions = Set(
    "-Yno-adapted-args",
    "-Ywarn-unused-import",
    "-Xfuture",
    // type TraversableOnce in package scala is deprecated, symbol literal is deprecated; use Symbol("a") instead
    "-Xfatal-warnings",
    "-Ypartial-unification"
  )

  // https://github.com/scala/bug/issues/12072
  val options = Seq("-Xlint:-byname-implicit")
  scalac212Options.filter(s => !exclusions.contains(s)) ++ options
}

lazy val scalacOptionSettings = Def.setting {
  def baseScalacOptions(scalaVersion: String) =
    CrossVersion.partialVersion(scalaVersion) match {
      case Some((2, 13)) => scalac213Options
      case _             => scalac212Options
    }

  baseScalacOptions(scalaVersion.value)
}

lazy val framelessSettings = Seq(
  scalacOptions ++= scalacOptionSettings.value,
  Test / testOptions += Tests.Argument(TestFrameworks.ScalaTest, "-oDF"),
  libraryDependencies ++= Seq(
    "com.chuusai" %% "shapeless" % shapeless,
    "org.scalatest" %% "scalatest" % scalatest % Test,
    "org.scalatestplus" %% "scalatestplus-scalacheck" % scalatestplus % Test,
    "org.scalacheck" %% "scalacheck" % scalacheck % Test
  ),
  Test / javaOptions ++= {
    val baseOptions = Seq("-Xmx1G", "-ea")
    val java17Options =
      if (sys.props("java.specification.version").toDouble >= 17.0) {
        Seq(
          "--add-opens=java.base/java.lang=ALL-UNNAMED",
          "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED",
          "--add-opens=java.base/java.lang.reflect=ALL-UNNAMED",
          "--add-opens=java.base/java.io=ALL-UNNAMED",
          "--add-opens=java.base/java.net=ALL-UNNAMED",
          "--add-opens=java.base/java.nio=ALL-UNNAMED",
          "--add-opens=java.base/java.util=ALL-UNNAMED",
          "--add-opens=java.base/java.util.concurrent=ALL-UNNAMED",
          "--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED",
          "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED",
          "--add-opens=java.base/sun.nio.cs=ALL-UNNAMED",
          "--add-opens=java.base/sun.security.action=ALL-UNNAMED",
          "--add-opens=java.base/sun.util.calendar=ALL-UNNAMED"
        )
      } else Seq.empty
    baseOptions ++ java17Options
  },
  Test / fork := true,
  Test / parallelExecution := false,
  mimaPreviousArtifacts ~= {
    _.filterNot(_.revision == "0.11.0") // didn't release properly
  },
  /**
   * The old Scala XML is pulled from Scala 2.12.x.
   *
   * [error] (update) found version conflict(s) in library dependencies; some are suspected to be binary incompatible:
   * [error]
   * [error] 	* org.scala-lang.modules:scala-xml_2.12:2.3.0 (early-semver) is selected over 1.0.6
   * [error] 	    +- org.scoverage:scalac-scoverage-reporter_2.12:2.0.7 (depends on 2.4.0)
   * [error] 	    +- org.scala-lang:scala-compiler:2.12.16              (depends on 1.0.6)
   */
  libraryDependencySchemes += "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always
) ++ consoleSettings

lazy val spark34Settings = Seq[Setting[_]](
  tlVersionIntroduced := Map("2.12" -> "0.14.1", "2.13" -> "0.14.1"),
  mimaPreviousArtifacts := Set(
    organization.value %% moduleName.value
      .split("-")
      .dropRight(1)
      .mkString("-") % "0.14.1"
  )
)

lazy val spark33Settings = Seq[Setting[_]](
  tlVersionIntroduced := Map("2.12" -> "0.13.0", "2.13" -> "0.13.0"),
  mimaPreviousArtifacts := Set(
    organization.value %% moduleName.value
      .split("-")
      .dropRight(1)
      .mkString("-") % "0.14.0"
  )
)

lazy val consoleSettings = Seq(
  Compile / console / scalacOptions ~= {
    _.filterNot("-Ywarn-unused-import" == _)
  },
  Test / console / scalacOptions := (Compile / console / scalacOptions).value
)

lazy val framelessTypedDatasetREPL = Seq(
  initialize ~= { _ => // Color REPL
    val ansi = System.getProperty("sbt.log.noformat", "false") != "true"
    if (ansi) System.setProperty("scala.color", "true")
  },
  console / initialCommands :=
    """
      |import org.apache.spark.{SparkConf, SparkContext}
      |import org.apache.spark.sql.SparkSession
      |import frameless.functions.aggregate._
      |import frameless.syntax._
      |
      |val conf = new SparkConf().setMaster("local[*]").setAppName("frameless repl").set("spark.ui.enabled", "false")
      |implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate()
      |
      |import spark.implicits._
      |
      |spark.sparkContext.setLogLevel("WARN")
      |
      |import frameless.TypedDataset
    """.stripMargin,
  console / cleanupCommands :=
    """
      |spark.stop()
    """.stripMargin
)

ThisBuild / organization := "org.typelevel"
ThisBuild / licenses := List(
  "Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0")
)
ThisBuild / developers := List(
  "OlivierBlanvillain" -> "Olivier Blanvillain",
  "adelbertc" -> "Adelbert Chang",
  "imarios" -> "Marios Iliofotou",
  "kanterov" -> "Gleb Kanterov",
  "non" -> "Erik Osheim",
  "jeremyrsmith" -> "Jeremy Smith",
  "cchantep" -> "Cédric Chantepie",
  "pomadchin" -> "Grigory Pomadchin"
).map {
  case (username, fullName) =>
    tlGitHubDev(username, fullName)
}

ThisBuild / tlCiReleaseBranches := Seq("master")
ThisBuild / tlSitePublishBranch := Some("master")

val roots = List("root-spark33", "root-spark34", "root-spark35")

ThisBuild / githubWorkflowBuildMatrixAdditions += "project" -> roots

ThisBuild / githubWorkflowBuildMatrixExclusions ++= roots.init.map { project =>
  MatrixExclude(Map("scala" -> "2.13", "project" -> project))
}

ThisBuild / githubWorkflowEnv += "SBT_OPTS" -> "-Xms1g -Xmx4g"


================================================
FILE: cats/src/main/scala/frameless/cats/FramelessSyntax.scala
================================================
package frameless
package cats

import _root_.cats.effect.Sync
import _root_.cats.syntax.all._
import _root_.cats.mtl.Ask
import org.apache.spark.sql.SparkSession

trait FramelessSyntax extends frameless.FramelessSyntax {
  implicit class SparkJobOps[F[_], A](fa: F[A])(implicit S: Sync[F], A: Ask[F, SparkSession]) {
    import S._, A._

    def withLocalProperty(key: String, value: String): F[A] =
      for {
        session <- ask
        _       <- delay(session.sparkContext.setLocalProperty(key, value))
        a       <- fa
      } yield a

    def withGroupId(groupId: String): F[A] = withLocalProperty("spark.jobGroup.id", groupId)

    def withDescription(description: String): F[A] = withLocalProperty("spark.job.description", description)
  }
}


================================================
FILE: cats/src/main/scala/frameless/cats/SparkDelayInstances.scala
================================================
package frameless
package cats

import _root_.cats.effect.Sync
import org.apache.spark.sql.SparkSession

trait SparkDelayInstances {
  implicit def framelessCatsSparkDelayForSync[F[_]](implicit S: Sync[F]): SparkDelay[F] = new SparkDelay[F] {
    def delay[A](a: => A)(implicit spark: SparkSession): F[A] = S.delay(a)
  }
}


================================================
FILE: cats/src/main/scala/frameless/cats/SparkTask.scala
================================================
package frameless
package cats

import _root_.cats.Id
import _root_.cats.data.Kleisli
import org.apache.spark.SparkContext

object SparkTask {
  def apply[A](f: SparkContext => A): SparkTask[A] =
    Kleisli[Id, SparkContext, A](f)

  def pure[A](a: => A): SparkTask[A] =
    Kleisli[Id, SparkContext, A](_ => a)
}


================================================
FILE: cats/src/main/scala/frameless/cats/implicits.scala
================================================
package frameless
package cats

import _root_.cats._
import _root_.cats.kernel.{CommutativeMonoid, CommutativeSemigroup}
import _root_.cats.syntax.all._
import alleycats.Empty

import scala.reflect.ClassTag
import org.apache.spark.rdd.RDD

object implicits extends FramelessSyntax with SparkDelayInstances {
  implicit class rddOps[A: ClassTag](lhs: RDD[A]) {
    def csum(implicit m: CommutativeMonoid[A]): A =
      lhs.fold(m.empty)(_ |+| _)
    def csumOption(implicit m: CommutativeSemigroup[A]): Option[A] =
      lhs.aggregate[Option[A]](None)(
        (acc, a) => Some(acc.fold(a)(_ |+| a)),
        (l, r) => l.fold(r)(x => r.map(_ |+| x) orElse Some(x))
      )

    def cmin(implicit o: Order[A], e: Empty[A]): A = {
      if (lhs.isEmpty()) e.empty
      else lhs.reduce(_ min _)
    }
    def cminOption(implicit o: Order[A]): Option[A] =
      csumOption(new CommutativeSemigroup[A] {
        def combine(l: A, r: A) = l min r
      })

    def cmax(implicit o: Order[A], e: Empty[A]): A = {
      if (lhs.isEmpty()) e.empty
      else lhs.reduce(_ max _)
    }
    def cmaxOption(implicit o: Order[A]): Option[A] =
      csumOption(new CommutativeSemigroup[A] {
        def combine(l: A, r: A) = l max r
      })
  }

  implicit class pairRddOps[K: ClassTag, V: ClassTag](lhs: RDD[(K, V)]) {
    def csumByKey(implicit m: CommutativeSemigroup[V]): RDD[(K, V)] = lhs.reduceByKey(_ |+| _)
    def cminByKey(implicit o: Order[V]): RDD[(K, V)] = lhs.reduceByKey(_ min _)
    def cmaxByKey(implicit o: Order[V]): RDD[(K, V)] = lhs.reduceByKey(_ max _)
  }
}

object union {
  implicit def unionSemigroup[A]: Semigroup[RDD[A]] =
    new Semigroup[RDD[A]] {
      def combine(lhs: RDD[A], rhs: RDD[A]): RDD[A] = lhs union rhs
    }
}

object inner {
  implicit def pairwiseInnerSemigroup[K: ClassTag, V: ClassTag: Semigroup]: Semigroup[RDD[(K, V)]] =
    new Semigroup[RDD[(K, V)]] {
      def combine(lhs: RDD[(K, V)], rhs: RDD[(K, V)]): RDD[(K, V)] =
        lhs.join(rhs).mapValues { case (x, y) => x |+| y }
    }
}

object outer {
  implicit def pairwiseOuterSemigroup[K: ClassTag, V: ClassTag](implicit m: Monoid[V]): Semigroup[RDD[(K, V)]] =
    new Semigroup[RDD[(K, V)]] {
      def combine(lhs: RDD[(K, V)], rhs: RDD[(K, V)]): RDD[(K, V)] =
        lhs.fullOuterJoin(rhs).mapValues {
          case (Some(x), Some(y)) => x |+| y
          case (None, Some(y)) => y
          case (Some(x), None) => x
          case (None, None) => m.empty
        }
    }
}


================================================
FILE: cats/src/main/scala/frameless/cats/package.scala
================================================
package frameless

import _root_.cats.Id
import _root_.cats.data.Kleisli
import org.apache.spark.SparkContext

package object cats {
  type SparkTask[A] = Kleisli[Id, SparkContext, A]
}


================================================
FILE: cats/src/test/resources/log4j.properties
================================================
log4j.logger.akka.event.slf4j.Slf4jLogger=ERROR
log4j.logger.akka.event.slf4j=ERROR
log4j.logger.akka.remote.EndpointWriter=ERROR
log4j.logger.akka.remote.RemoteActorRefProvider$RemotingTerminator=ERROR
log4j.logger.com.anjuke.dm=ERROR
log4j.logger.io.netty.bootstrap.ServerBootstrap=ERROR
log4j.logger.io.netty.buffer.ByteBufUtil=ERROR
log4j.logger.io.netty.buffer.PooledByteBufAllocator=ERROR
log4j.logger.io.netty.channel.AbstractChannel=ERROR
log4j.logger.io.netty.channel.ChannelInitializer=ERROR
log4j.logger.io.netty.channel.ChannelOutboundBuffer=ERROR
log4j.logger.io.netty.channel.DefaultChannelPipeline=ERROR
log4j.logger.io.netty.channel.MultithreadEventLoopGroup=ERROR
log4j.logger.io.netty.channel.nio.AbstractNioChannel=ERROR
log4j.logger.io.netty.channel.nio.NioEventLoop=ERROR
log4j.logger.io.netty.channel.socket.nio.NioServerSocketChannel=ERROR
log4j.logger.io.netty.util.concurrent.DefaultPromise.rejectedExecution=ERROR
log4j.logger.io.netty.util.concurrent.DefaultPromise=ERROR
log4j.logger.io.netty.util.concurrent.GlobalEventExecutor=ERROR
log4j.logger.io.netty.util.concurrent.SingleThreadEventExecutor=ERROR
log4j.logger.io.netty.util.internal.logging.InternalLoggerFactory=ERROR
log4j.logger.io.netty.util.internal.PlatformDependent0=ERROR
log4j.logger.io.netty.util.internal.PlatformDependent=ERROR
log4j.logger.io.netty.util.internal.SystemPropertyUtil=ERROR
log4j.logger.io.netty.util.internal.ThreadLocalRandom=ERROR
log4j.logger.io.netty.util.NetUtil=ERROR
log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=ERROR
log4j.logger.org.apache.hadoop.conf.Configuration=ERROR
log4j.logger.org.apache.hadoop.fs.FileSystem=ERROR
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=ERROR
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
log4j.logger.org.apache.hadoop.mapred.JobConf=ERROR
log4j.logger.org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedPartitioner=ERROR
log4j.logger.org.apache.hadoop.metrics2.impl.MetricsSystemImpl=ERROR
log4j.logger.org.apache.hadoop.metrics2.lib.Interns=ERROR
log4j.logger.org.apache.hadoop.metrics2.lib.MetricsSourceBuilder=ERROR
log4j.logger.org.apache.hadoop.metrics2.lib.MutableMetricsFactory=ERROR
log4j.logger.org.apache.hadoop.security.authentication.util.KerberosName=ERROR
log4j.logger.org.apache.hadoop.security.Groups=ERROR
log4j.logger.org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback=ERROR
log4j.logger.org.apache.hadoop.security.SecurityUtil=ERROR
log4j.logger.org.apache.hadoop.security.ShellBasedUnixGroupsMapping=ERROR
log4j.logger.org.apache.hadoop.security.UserGroupInformation=ERROR
log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
log4j.logger.org.apache.hadoop.util.ShutdownHookManager=ERROR
log4j.logger.org.apache.spark.broadcast.TorrentBroadcast=ERROR
log4j.logger.org.apache.spark.ContextCleaner=ERROR
log4j.logger.org.apache.spark.executor.Executor=ERROR
log4j.logger.org.apache.spark.HeartbeatReceiver=ERROR
log4j.logger.org.apache.spark.HttpFileServer=ERROR
log4j.logger.org.apache.spark.HttpServer=ERROR
log4j.logger.org.apache.spark.MapOutputTrackerMaster=ERROR
log4j.logger.org.apache.spark.MapOutputTrackerMasterEndpoint=ERROR
log4j.logger.org.apache.spark.metrics.MetricsSystem=ERROR
log4j.logger.org.apache.spark.network.client.TransportClientFactory=ERROR
log4j.logger.org.apache.spark.network.netty.NettyBlockTransferService=ERROR
log4j.logger.org.apache.spark.network.protocol.MessageDecoder=ERROR
log4j.logger.org.apache.spark.network.protocol.MessageEncoder=ERROR
log4j.logger.org.apache.spark.network.server.OneForOneStreamManager=ERROR
log4j.logger.org.apache.spark.network.server.TransportServer=ERROR
log4j.logger.org.apache.spark.network.TransportContext=ERROR
log4j.logger.org.apache.spark.network.util.JavaUtils=ERROR
log4j.logger.org.apache.spark.rdd.CoGroupedRDD=ERROR
log4j.logger.org.apache.spark.rdd.SubtractedRDD=ERROR
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR
log4j.logger.org.apache.spark.rpc.akka.AkkaRpcEnv$$anonfun$actorRef$lzycompute$1$1$$anon$1=ERROR
log4j.logger.org.apache.spark.scheduler.DAGScheduler=ERROR
log4j.logger.org.apache.spark.scheduler.OutputCommitCoordinator$OutputCommitCoordinatorEndpoint=ERROR
log4j.logger.org.apache.spark.scheduler.TaskSchedulerImpl=ERROR
log4j.logger.org.apache.spark.scheduler.TaskSetManager=ERROR
log4j.logger.org.apache.spark.SecurityManager=ERROR
log4j.logger.org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter=ERROR
log4j.logger.org.apache.spark.SparkContext=ERROR
log4j.logger.org.apache.spark.SparkEnv=ERROR
log4j.logger.org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.package$ExpressionCanonicalizer=ERROR
log4j.logger.org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer=ERROR
log4j.logger.org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys=ERROR
log4j.logger.org.apache.spark.sql.execution.aggregate.SortBasedAggregate=ERROR
log4j.logger.org.apache.spark.sql.execution.aggregate.TungstenAggregate=ERROR
log4j.logger.org.apache.spark.sql.execution.Exchange=ERROR
log4j.logger.org.apache.spark.sql.execution.joins.ShuffledHashOuterJoin=ERROR
log4j.logger.org.apache.spark.sql.SQLContext$$anon$1=ERROR
log4j.logger.org.apache.spark.sql.SQLContext$$anon$2=ERROR
log4j.logger.org.apache.spark.SSLOptions=ERROR
log4j.logger.org.apache.spark.storage.BlockManager=ERROR
log4j.logger.org.apache.spark.storage.BlockManagerInfo=ERROR
log4j.logger.org.apache.spark.storage.BlockManagerMaster=ERROR
log4j.logger.org.apache.spark.storage.BlockManagerMasterEndpoint=ERROR
log4j.logger.org.apache.spark.storage.BlockManagerSlaveEndpoint=ERROR
log4j.logger.org.apache.spark.storage.DiskBlockManager=ERROR
log4j.logger.org.apache.spark.storage.MemoryStore=ERROR
log4j.logger.org.apache.spark.storage.ShuffleBlockFetcherIterator=ERROR
log4j.logger.org.apache.spark.ui.SparkUI=ERROR
log4j.logger.org.apache.spark.unsafe.map.BytesToBytesMap=ERROR
log4j.logger.org.apache.spark.unsafe.memory.TaskMemoryManager=ERROR
log4j.logger.org.apache.spark.util.AkkaUtils=ERROR
log4j.logger.org.apache.spark.util.ClosureCleaner=ERROR
log4j.logger.org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter=ERROR
log4j.logger.org.apache.spark.util.Utils=ERROR
log4j.logger.org.apache.spark=ERROR
log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.eclipse.jetty=ERROR
log4j.logger.org.spark-project.jetty.http.AbstractGenerator=ERROR
log4j.logger.org.spark-project.jetty.http.HttpGenerator=ERROR
log4j.logger.org.spark-project.jetty.http.MimeTypes=ERROR
log4j.logger.org.spark-project.jetty.io.AbstractBuffer=ERROR
log4j.logger.org.spark-project.jetty.io.nio=ERROR
log4j.logger.org.spark-project.jetty.server.AbstractConnector=ERROR
log4j.logger.org.spark-project.jetty.server.bio.SocketConnector=ERROR
log4j.logger.org.spark-project.jetty.server.handler.AbstractHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.ContextHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.ContextHandlerCollection=ERROR
log4j.logger.org.spark-project.jetty.server.handler.DefaultHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.ErrorHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.GzipHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.ResourceHandler=ERROR
log4j.logger.org.spark-project.jetty.server.Server=ERROR
log4j.logger.org.spark-project.jetty.server=ERROR
log4j.logger.org.spark-project.jetty.servlet.DefaultServlet=ERROR
log4j.logger.org.spark-project.jetty.servlet.Holder=ERROR
log4j.logger.org.spark-project.jetty.servlet.ServletHandler=ERROR
log4j.logger.org.spark-project.jetty.servlet.ServletHolder=ERROR
log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.spark-project.jetty.util.component.AggregateLifeCycle=ERROR
log4j.logger.org.spark-project.jetty.util.component.Container=ERROR
log4j.logger.org.spark-project.jetty.util.IO=ERROR
log4j.logger.org.spark-project.jetty.util.log=ERROR
log4j.logger.org.spark-project.jetty.util.resource.FileResource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.JarFileResource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.JarResource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.Resource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.URLResource=ERROR
log4j.logger.org.spark-project.jetty.util.StringUtil=ERROR
log4j.logger.org.spark-project.jetty.util.thread.QueuedThreadPool=ERROR
log4j.logger.org.spark-project.jetty.util.thread.Timeout=ERROR
log4j.logger.org.spark-project.jetty=ERROR
log4j.logger.Remoting=ERROR


================================================
FILE: cats/src/test/resources/log4j2.properties
================================================
# Set to debug or trace if log4j initialization is failing
status = warn

# Name of the configuration
name = ConsoleAppender

# Console appender configuration
appender.console.type = Console
appender.console.name = consoleLogger
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{YYYY-MM-dd HH:mm:ss} [%t] %-5p %c:%L - %m%n
appender.console.target = SYSTEM_OUT

# Root logger level
rootLogger.level = error

# Root logger referring to console appender
rootLogger.appenderRef.stdout.ref = consoleLogger

logger.spark.name = org.apache.spark
logger.spark.level = warn

logger.hadoop.name = org.apache.hadoop
logger.hadoop.level = warn


================================================
FILE: cats/src/test/scala/frameless/cats/FramelessSyntaxTests.scala
================================================
package frameless
package cats

import _root_.cats.data.ReaderT
import _root_.cats.effect.IO
import _root_.cats.effect.unsafe.implicits.global
import org.apache.spark.sql.SparkSession
import org.scalatest.matchers.should.Matchers
import org.scalacheck.{Test => PTest}
import org.scalacheck.Prop, Prop._
import org.scalacheck.effect.PropF, PropF._

class FramelessSyntaxTests extends TypedDatasetSuite with Matchers {
  override val sparkDelay = null

  def prop[A, B](data: Vector[X2[A, B]])(
    implicit ev: TypedEncoder[X2[A, B]]
  ): Prop = {
    import implicits._

    val dataset = TypedDataset.create(data).dataset
    val dataframe = dataset.toDF()

    val typedDataset = dataset.typed
    val typedDatasetFromDataFrame = dataframe.unsafeTyped[X2[A, B]]

    typedDataset.collect[IO]().unsafeRunSync().toVector ?= typedDatasetFromDataFrame.collect[IO]().unsafeRunSync().toVector
  }

  test("dataset typed - toTyped") {
    check(forAll(prop[Int, String] _))
  }

  test("properties can be read back") {
    import implicits._
    import _root_.cats.syntax.all._

    forAllF { (k: String, v: String) =>
      val scopedKey = "frameless.tests." + k
      1
        .pure[ReaderT[IO, SparkSession, *]]
        .withLocalProperty(scopedKey, v)
        .withGroupId(v)
        .withDescription(v)
        .run(session)
        .map { _ =>
          sc.getLocalProperty(scopedKey) shouldBe v
          sc.getLocalProperty("spark.jobGroup.id") shouldBe v
          sc.getLocalProperty("spark.job.description") shouldBe v
        }.void
    }.check().unsafeRunSync().status shouldBe PTest.Passed
  }
}


================================================
FILE: cats/src/test/scala/frameless/cats/test.scala
================================================
package frameless
package cats

import _root_.cats.Foldable
import _root_.cats.syntax.all._

import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext => SC}

import org.scalatest.compatible.Assertion
import org.scalactic.anyvals.PosInt
import org.scalacheck.Arbitrary
import org.scalatestplus.scalacheck.ScalaCheckPropertyChecks
import Arbitrary._

import scala.collection.immutable.SortedMap
import scala.reflect.ClassTag
import org.scalatest.matchers.should.Matchers
import org.scalatest.propspec.AnyPropSpec

trait SparkTests {
  val appID: String = new java.util.Date().toString + math.floor(math.random() * 10E4).toLong.toString

  val conf: SparkConf = new SparkConf()
    .setMaster("local[*]")
    .setAppName("test")
    .set("spark.ui.enabled", "false")
    .set("spark.app.id", appID)

  implicit def session: SparkSession = SparkSession.builder().config(conf).getOrCreate()
  implicit def sc: SparkContext = session.sparkContext

  implicit class seqToRdd[A: ClassTag](seq: Seq[A])(implicit sc: SC) {
    def toRdd: RDD[A] = sc.makeRDD(seq)
  }
}

object Tests {
  def innerPairwise(mx: Map[String, Int], my: Map[String, Int], check: (Any, Any) => Assertion)(implicit sc: SC): Assertion = {
    import frameless.cats.implicits._
    import frameless.cats.inner._
    val xs = sc.parallelize(mx.toSeq)
    val ys = sc.parallelize(my.toSeq)

    val mz0 = (xs |+| ys).collectAsMap()
    val mz1 = (xs join ys).mapValues { case (x, y) => x |+| y }.collectAsMap()
    val mz2 = (for { (k, x) <- mx; y <- my.get(k) } yield (k, x + y)).toMap
    check(mz0, mz1)
    check(mz1, mz2)

    val zs = sc.parallelize(mx.values.toSeq)
    check(xs.csumByKey.collectAsMap(), mx)
    check(zs.csum, zs.collect().sum)

    if (mx.nonEmpty) {
      check(xs.cminByKey.collectAsMap(), mx)
      check(xs.cmaxByKey.collectAsMap(), mx)
      check(zs.cmin, zs.collect().min)
      check(zs.cmax, zs.collect().max)
    } else check(1, 1)
  }
}

class Test extends AnyPropSpec with Matchers with ScalaCheckPropertyChecks with SparkTests {
  implicit override val generatorDrivenConfig =
    PropertyCheckConfiguration(minSize = PosInt(10))

  property("spark is working") {
    sc.parallelize(Seq(1, 2, 3)).collect() shouldBe Array(1,2,3)
  }

  property("inner pairwise monoid") {
    // Make sure we have non-empty map
    forAll { (xh: (String, Int), mx: Map[String, Int], yh: (String, Int), my: Map[String, Int]) =>
      Tests.innerPairwise(mx + xh, my + yh, _ shouldBe _)
    }
  }

  property("rdd simple numeric commutative semigroup") {
    import frameless.cats.implicits._

    forAll { seq: List[Int] =>
      val expectedSum = if (seq.isEmpty) None else Some(seq.sum)
      val expectedMin = if (seq.isEmpty) None else Some(seq.min)
      val expectedMax = if (seq.isEmpty) None else Some(seq.max)

      val rdd = seq.toRdd

      rdd.cmin shouldBe expectedMin.getOrElse(0)
      rdd.cminOption shouldBe expectedMin

      rdd.cmax shouldBe expectedMax.getOrElse(0)
      rdd.cmaxOption shouldBe expectedMax

      rdd.csum shouldBe expectedSum.getOrElse(0)
      rdd.csumOption shouldBe expectedSum
    }
  }

  property("rdd of SortedMap[Int,Int] commutative monoid") {
    import frameless.cats.implicits._
    forAll { seq: List[SortedMap[Int, Int]] =>
      val rdd = seq.toRdd
      rdd.csum shouldBe Foldable[List].fold(seq)
    }
  }

  property("rdd tuple commutative semigroup example") {
    import frameless.cats.implicits._
    forAll { seq: List[(Int, Int)] =>
      val expectedSum = if (seq.isEmpty) None else Some(Foldable[List].fold(seq))
      val rdd = seq.toRdd

      rdd.csum shouldBe expectedSum.getOrElse(0 -> 0)
      rdd.csumOption shouldBe expectedSum
    }
  }

  property("pair rdd numeric commutative semigroup example") {
    import frameless.cats.implicits._
    val seq = Seq( ("a",2), ("b",3), ("d",6), ("b",2), ("d",1) )
    val rdd = seq.toRdd
    rdd.cminByKey.collect().toSeq should contain theSameElementsAs Seq( ("a",2), ("b",2), ("d",1) )
    rdd.cmaxByKey.collect().toSeq should contain theSameElementsAs Seq( ("a",2), ("b",3), ("d",6) )
    rdd.csumByKey.collect().toSeq should contain theSameElementsAs Seq( ("a",2), ("b",5), ("d",7) )
  }
}


================================================
FILE: core/src/main/scala/frameless/CatalystAverageable.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/**
  * When averaging Spark doesn't change these types:
  * - BigDecimal -> BigDecimal
  * - Double     -> Double
  * But it changes these types :
  * - Int        -> Double
  * - Short      -> Double
  * - Long       -> Double
  */
@implicitNotFound("Cannot compute average of type ${In}.")
trait CatalystAverageable[In, Out]

object CatalystAverageable {
  private[this] val theInstance = new CatalystAverageable[Any, Any] {}
  private[this] def of[In, Out]: CatalystAverageable[In, Out] = theInstance.asInstanceOf[CatalystAverageable[In, Out]]

  implicit val framelessAverageableBigDecimal: CatalystAverageable[BigDecimal, BigDecimal] = of[BigDecimal, BigDecimal]
  implicit val framelessAverageableDouble:     CatalystAverageable[Double, Double]         = of[Double, Double]
  implicit val framelessAverageableLong:       CatalystAverageable[Long, Double]           = of[Long, Double]
  implicit val framelessAverageableInt:        CatalystAverageable[Int, Double]            = of[Int, Double]
  implicit val framelessAverageableShort:      CatalystAverageable[Short, Double]          = of[Short, Double]
}


================================================
FILE: core/src/main/scala/frameless/CatalystBitShift.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/** Spark does not return always Int on shift
  */

@implicitNotFound("Cannot do bit shift operations on columns of type ${In}.")
trait CatalystBitShift[In, Out]

object CatalystBitShift {
  private[this] val theInstance = new CatalystBitShift[Any, Any] {}
  private[this] def of[In, Out]: CatalystBitShift[In, Out] = theInstance.asInstanceOf[CatalystBitShift[In, Out]]

  implicit val framelessBitShiftBigDecimal: CatalystBitShift[BigDecimal, Int]     = of[BigDecimal, Int]
  implicit val framelessBitShiftDouble    : CatalystBitShift[Byte, Int]           = of[Byte, Int]
  implicit val framelessBitShiftInt       : CatalystBitShift[Short, Int]          = of[Short, Int]
  implicit val framelessBitShiftLong      : CatalystBitShift[Int, Int]            = of[Int, Int]
  implicit val framelessBitShiftShort     : CatalystBitShift[Long, Long]          = of[Long, Long]
}


================================================
FILE: core/src/main/scala/frameless/CatalystBitwise.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/**
 * Types that can be bitwise ORed, ANDed, or XORed by Catalyst.
 * Note that Catalyst requires that when performing bitwise operations between columns
 * the two types must be the same so in some cases casting is necessary.
 */
@implicitNotFound("Cannot do bitwise operations on columns of type ${A}.")
trait CatalystBitwise[A] extends CatalystNumeric[A]

object CatalystBitwise {
  private[this] val theInstance = new CatalystBitwise[Any] {}

  private[this] def of[A]: CatalystBitwise[A] =
    theInstance.asInstanceOf[CatalystBitwise[A]]

  implicit val framelessbyteBitwise: CatalystBitwise[Byte] = of[Byte]
  implicit val framelessshortBitwise: CatalystBitwise[Short] = of[Short]
  implicit val framelessintBitwise: CatalystBitwise[Int] = of[Int]
  implicit val framelesslongBitwise: CatalystBitwise[Long] = of[Long]
}


================================================
FILE: core/src/main/scala/frameless/CatalystCast.scala
================================================
package frameless

trait CatalystCast[A, B]

object CatalystCast {
  private[this] val theInstance = new CatalystCast[Any, Any] {}
  private[this] def of[A, B]: CatalystCast[A, B] = theInstance.asInstanceOf[CatalystCast[A, B]]

  implicit def framelessCastToString[T]: CatalystCast[T, String] = of[T, String]

  implicit def framelessNumericToLong   [A: CatalystNumeric]: CatalystCast[A, Long]       = of[A, Long]
  implicit def framelessNumericToInt    [A: CatalystNumeric]: CatalystCast[A, Int]        = of[A, Int]
  implicit def framelessNumericToShort  [A: CatalystNumeric]: CatalystCast[A, Short]      = of[A, Short]
  implicit def framelessNumericToByte   [A: CatalystNumeric]: CatalystCast[A, Byte]       = of[A, Byte]
  implicit def framelessNumericToDecimal[A: CatalystNumeric]: CatalystCast[A, BigDecimal] = of[A, BigDecimal]
  implicit def framelessNumericToDouble [A: CatalystNumeric]: CatalystCast[A, Double]     = of[A, Double]

  implicit def framelessBooleanToNumeric[A: CatalystNumeric]: CatalystCast[Boolean, A] = of[Boolean, A]

  // doesn't make any sense to include:
  // - sqlDateToBoolean: always None
  // - sqlTimestampToBoolean: compares us to 0
  implicit val framelessStringToBoolean    : CatalystCast[String, Option[Boolean]] = of[String, Option[Boolean]]
  implicit val framelessLongToBoolean      : CatalystCast[Long, Boolean]           = of[Long, Boolean]
  implicit val framelessIntToBoolean       : CatalystCast[Int, Boolean]            = of[Int, Boolean]
  implicit val framelessShortToBoolean     : CatalystCast[Short, Boolean]          = of[Short, Boolean]
  implicit val framelessByteToBoolean      : CatalystCast[Byte, Boolean]           = of[Byte, Boolean]
  implicit val framelessBigDecimalToBoolean: CatalystCast[BigDecimal, Boolean]     = of[BigDecimal, Boolean]
  implicit val framelessDoubleToBoolean    : CatalystCast[Double, Boolean]         = of[Double, Boolean]

  // TODO

  // needs verification, does it make sense to include? probably better as a separate function
  // implicit object stringToInt extends CatalystCast[String, Option[Int]]
  // implicit object stringToShort extends CatalystCast[String, Option[Short]]
  // implicit object stringToByte extends CatalystCast[String, Option[Byte]]
  // implicit object stringToDecimal extends CatalystCast[String, Option[BigDecimal]]
  // implicit object stringToLong extends CatalystCast[String, Option[Long]]
  // implicit object stringToSqlDate extends CatalystCast[String, Option[SQLDate]]


  // needs verification:
  //implicit object sqlTimestampToSqlDate extends CatalystCast[SQLTimestamp, SQLDate]

  // needs verification:
  // implicit object sqlTimestampToDecimal extends CatalystCast[SQLTimestamp, BigDecimal]
  // implicit object sqlTimestampToLong extends CatalystCast[SQLTimestamp, Long]

  // needs verification:
  // implicit object stringToSqlTimestamp extends CatalystCast[String, SQLTimestamp]
  // implicit object longToSqlTimestamp extends CatalystCast[Long, SQLTimestamp]
  // implicit object intToSqlTimestamp extends CatalystCast[Int, SQLTimestamp]
  // implicit object doubleToSqlTimestamp extends CatalystCast[Double, SQLTimestamp]
  // implicit object floatToSqlTimestamp extends CatalystCast[Float, SQLTimestamp]
  // implicit object bigDecimalToSqlTimestamp extends CatalystCast[BigDecimal, SQLTimestamp]
  // implicit object sqlDateToSqlTimestamp extends CatalystCast[SQLDate, SQLTimestamp]

  // doesn't make sense to include:
  // - booleanToSqlTimestamp: 1L or 0L
  // - shortToSqlTimestamp: ???
  // - byteToSqlTimestamp: ???

  // doesn't make sense to include:
  // - sqlDateToLong: always None
  // - sqlDateToInt: always None
  // - sqlDateToInt: always None
  // - sqlDateToInt: always None
  // - sqlDateToInt: always None

  // doesn't make sense to include:
  // - sqlTimestampToInt: useful? can be done through `-> Long -> Int`
  // - sqlTimestampToShort: useful? can be done through `-> Long -> Int`
  // - sqlTimestampToShort: useful? can be done through `-> Long -> Int`

}


================================================
FILE: core/src/main/scala/frameless/CatalystCollection.scala
================================================
package frameless

import scala.annotation.implicitNotFound

@implicitNotFound("Cannot do collection operations on columns of type ${C}.")
trait CatalystCollection[C[_]]

object CatalystCollection {
  private[this] val theInstance = new CatalystCollection[Any] {}
  private[this] def of[A[_]]: CatalystCollection[A] = theInstance.asInstanceOf[CatalystCollection[A]]

  implicit val arrayObject : CatalystCollection[Array]  = of[Array]
  implicit val seqObject   : CatalystCollection[Seq]    = of[Seq]
  implicit val listObject  : CatalystCollection[List]   = of[List]
  implicit val vectorObject: CatalystCollection[Vector] = of[Vector]
}


================================================
FILE: core/src/main/scala/frameless/CatalystDivisible.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/** Spark divides everything as Double, expect BigDecimals are divided into
  * another BigDecimal, benefiting from some added precision.
  */
@implicitNotFound("Cannot compute division on type ${In}.")
trait CatalystDivisible[In, Out]

object CatalystDivisible {
  private[this] val theInstance = new CatalystDivisible[Any, Any] {}
  private[this] def of[In, Out]: CatalystDivisible[In, Out] = theInstance.asInstanceOf[CatalystDivisible[In, Out]]

  implicit val framelessDivisibleBigDecimal: CatalystDivisible[BigDecimal, BigDecimal] = of[BigDecimal, BigDecimal]
  implicit val framelessDivisibleDouble    : CatalystDivisible[Double, Double]         = of[Double, Double]
  implicit val framelessDivisibleInt       : CatalystDivisible[Int, Double]            = of[Int, Double]
  implicit val framelessDivisibleLong      : CatalystDivisible[Long, Double]           = of[Long, Double]
  implicit val framelessDivisibleByte      : CatalystDivisible[Byte, Double]           = of[Byte, Double]
  implicit val framelessDivisibleShort     : CatalystDivisible[Short, Double]          = of[Short, Double]
}


================================================
FILE: core/src/main/scala/frameless/CatalystIsin.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/** Types for which we can check if is in */
@implicitNotFound("Cannot do isin operation on columns of type ${A}.")
trait CatalystIsin[A]

object CatalystIsin {
  implicit object framelessBigDecimal extends CatalystIsin[BigDecimal]
  implicit object framelessByte       extends CatalystIsin[Byte]
  implicit object framelessDouble     extends CatalystIsin[Double]
  implicit object framelessFloat      extends CatalystIsin[Float]
  implicit object framelessInt        extends CatalystIsin[Int]
  implicit object framelessLong       extends CatalystIsin[Long]
  implicit object framelessShort      extends CatalystIsin[Short]
  implicit object framelesssString    extends CatalystIsin[String]
}


================================================
FILE: core/src/main/scala/frameless/CatalystNaN.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/** Spark does NaN check only for these types */
@implicitNotFound("Columns of type ${A} cannot be NaN.")
trait CatalystNaN[A]

object CatalystNaN {
  private[this] val theInstance = new CatalystNaN[Any] {}
  private[this] def of[A]: CatalystNaN[A] = theInstance.asInstanceOf[CatalystNaN[A]]

  implicit val framelessFloatNaN     : CatalystNaN[Float]      = of[Float]
  implicit val framelessDoubleNaN    : CatalystNaN[Double]     = of[Double]
}


================================================
FILE: core/src/main/scala/frameless/CatalystNotNullable.scala
================================================
package frameless

import scala.annotation.implicitNotFound

@implicitNotFound("Cannot find evidence that type ${A} is nullable. Currently, only Option[A] is nullable.")
trait CatalystNullable[A]

object CatalystNullable {
  implicit def optionIsNullable[A]: CatalystNullable[Option[A]] = new CatalystNullable[Option[A]] {}
}

@implicitNotFound("Cannot find evidence that type ${A} is not nullable.")
trait NotCatalystNullable[A]

object NotCatalystNullable {
  implicit def everythingIsNotNullable[A]: NotCatalystNullable[A] = new NotCatalystNullable[A] {}
  implicit def nullableIsNotNotNullable[A: CatalystNullable]: NotCatalystNullable[A] = new NotCatalystNullable[A] {}
}


================================================
FILE: core/src/main/scala/frameless/CatalystNumeric.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/** Types that can be added, subtracted and multiplied by Catalyst. */
@implicitNotFound("Cannot do numeric operations on columns of type ${A}.")
trait CatalystNumeric[A]

object CatalystNumeric {
  private[this] val theInstance = new CatalystNumeric[Any] {}
  private[this] def of[A]: CatalystNumeric[A] = theInstance.asInstanceOf[CatalystNumeric[A]]

  implicit val framelessbigDecimalNumeric: CatalystNumeric[BigDecimal] = of[BigDecimal]
  implicit val framelessbyteNumeric      : CatalystNumeric[Byte]       = of[Byte]
  implicit val framelessdoubleNumeric    : CatalystNumeric[Double]     = of[Double]
  implicit val framelessintNumeric       : CatalystNumeric[Int]        = of[Int]
  implicit val framelesslongNumeric      : CatalystNumeric[Long]       = of[Long]
  implicit val framelessshortNumeric     : CatalystNumeric[Short]      = of[Short]
}


================================================
FILE: core/src/main/scala/frameless/CatalystNumericWithJavaBigDecimal.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/** Spark does not return always the same type as the input was for example abs
  */
@implicitNotFound("Cannot compute on type ${In}.")
trait CatalystNumericWithJavaBigDecimal[In, Out]

object CatalystNumericWithJavaBigDecimal {
  private[this] val theInstance = new CatalystNumericWithJavaBigDecimal[Any, Any] {}
  private[this] def of[In, Out]: CatalystNumericWithJavaBigDecimal[In, Out] = theInstance.asInstanceOf[CatalystNumericWithJavaBigDecimal[In, Out]]

  implicit val framelessAbsoluteBigDecimal: CatalystNumericWithJavaBigDecimal[BigDecimal, java.math.BigDecimal]  = of[BigDecimal, java.math.BigDecimal]
  implicit val framelessAbsoluteDouble    : CatalystNumericWithJavaBigDecimal[Double, Double]                    = of[Double, Double]
  implicit val framelessAbsoluteInt       : CatalystNumericWithJavaBigDecimal[Int, Int]                          = of[Int, Int]
  implicit val framelessAbsoluteLong      : CatalystNumericWithJavaBigDecimal[Long, Long]                        = of[Long, Long]
  implicit val framelessAbsoluteShort     : CatalystNumericWithJavaBigDecimal[Short, Short]                      = of[Short, Short]
  implicit val framelessAbsoluteByte      : CatalystNumericWithJavaBigDecimal[Byte, Byte]                        = of[Byte, Byte]

}

================================================
FILE: core/src/main/scala/frameless/CatalystOrdered.scala
================================================
package frameless

import scala.annotation.implicitNotFound
import shapeless.{Generic, HList, Lazy}
import shapeless.ops.hlist.LiftAll
import java.time.{Duration, Instant, Period}

/** Types that can be ordered/compared by Catalyst. */
@implicitNotFound("Cannot compare columns of type ${A}.")
trait CatalystOrdered[A]

object CatalystOrdered {
  private[this] val theInstance = new CatalystOrdered[Any] {}
  private[this] def of[A]: CatalystOrdered[A] = theInstance.asInstanceOf[CatalystOrdered[A]]

  implicit val framelessIntOrdered         : CatalystOrdered[Int]          = of[Int]
  implicit val framelessBooleanOrdered     : CatalystOrdered[Boolean]      = of[Boolean]
  implicit val framelessByteOrdered        : CatalystOrdered[Byte]         = of[Byte]
  implicit val framelessShortOrdered       : CatalystOrdered[Short]        = of[Short]
  implicit val framelessLongOrdered        : CatalystOrdered[Long]         = of[Long]
  implicit val framelessFloatOrdered       : CatalystOrdered[Float]        = of[Float]
  implicit val framelessDoubleOrdered      : CatalystOrdered[Double]       = of[Double]
  implicit val framelessBigDecimalOrdered  : CatalystOrdered[BigDecimal]   = of[BigDecimal]
  implicit val framelessSQLDateOrdered     : CatalystOrdered[SQLDate]      = of[SQLDate]
  implicit val framelessSQLTimestampOrdered: CatalystOrdered[SQLTimestamp] = of[SQLTimestamp]
  implicit val framelessStringOrdered      : CatalystOrdered[String]       = of[String]
  implicit val framelessInstantOrdered     : CatalystOrdered[Instant]      = of[Instant]
  implicit val framelessDurationOrdered    : CatalystOrdered[Duration]     = of[Duration]
  implicit val framelessPeriodOrdered      : CatalystOrdered[Period]       = of[Period]

  implicit def injectionOrdered[A, B]
    (implicit
      i0: Injection[A, B],
      i1: CatalystOrdered[B]
    ): CatalystOrdered[A] = of[A]

  implicit def deriveGeneric[G, H <: HList]
    (implicit
      i0: Generic.Aux[G, H],
      i1: Lazy[LiftAll[CatalystOrdered, H]]
    ): CatalystOrdered[G] = of[G]
}


================================================
FILE: core/src/main/scala/frameless/CatalystPivotable.scala
================================================
package frameless

import scala.annotation.implicitNotFound

@implicitNotFound("Cannot pivot on type ${A}. Currently supported types to pivot are {Int, Long, Boolean, and String}.")
trait CatalystPivotable[A]

object CatalystPivotable {
  private[this] val theInstance = new CatalystPivotable[Any] {}
  private[this] def of[A]: CatalystPivotable[A] = theInstance.asInstanceOf[CatalystPivotable[A]]

  implicit val framelessIntPivotable    : CatalystPivotable[Int]     = of[Int]
  implicit val framelessLongPivotable   : CatalystPivotable[Long]    = of[Long]
  implicit val framelessBooleanPivotable: CatalystPivotable[Boolean] = of[Boolean]
  implicit val framelessStringPivotable : CatalystPivotable[String]  = of[String]
}


================================================
FILE: core/src/main/scala/frameless/CatalystRound.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/** Spark does not return always long on round
  */
@implicitNotFound("Cannot compute round on type ${In}.")
trait CatalystRound[In, Out]

object CatalystRound {
  private[this] val theInstance = new CatalystRound[Any, Any] {}
  private[this] def of[In, Out]: CatalystRound[In, Out] = theInstance.asInstanceOf[CatalystRound[In, Out]]

  implicit val framelessBigDecimal: CatalystRound[BigDecimal, java.math.BigDecimal] = of[BigDecimal, java.math.BigDecimal]
  implicit val framelessDouble    : CatalystRound[Double, Long]                     = of[Double, Long]
  implicit val framelessInt       : CatalystRound[Int, Long]                        = of[Int, Long]
  implicit val framelessLong      : CatalystRound[Long, Long]                       = of[Long, Long]
  implicit val framelessShort     : CatalystRound[Short, Long]                      = of[Short, Long]
}

================================================
FILE: core/src/main/scala/frameless/CatalystSummable.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/**
  * When summing Spark doesn't change these types:
  * - Long       -> Long
  * - BigDecimal -> BigDecimal
  * - Double     -> Double
  *
  * For other types there are conversions:
  * - Int        -> Long
  * - Short      -> Long
  */
@implicitNotFound("Cannot compute sum of type ${In}.")
trait CatalystSummable[In, Out] {
  def zero: In
}

object CatalystSummable {
  def apply[In, Out](zero: In): CatalystSummable[In, Out] = {
    val _zero = zero
    new CatalystSummable[In, Out] { val zero: In = _zero }
  }

  implicit val framelessSummableLong      : CatalystSummable[Long, Long]             = CatalystSummable(zero = 0L)
  implicit val framelessSummableBigDecimal: CatalystSummable[BigDecimal, BigDecimal] = CatalystSummable(zero = BigDecimal(0))
  implicit val framelessSummableDouble    : CatalystSummable[Double, Double]         = CatalystSummable(zero = 0.0)
  implicit val framelessSummableInt       : CatalystSummable[Int, Long]              = CatalystSummable(zero = 0)
  implicit val framelessSummableShort     : CatalystSummable[Short, Long]            = CatalystSummable(zero = 0)
}


================================================
FILE: core/src/main/scala/frameless/CatalystVariance.scala
================================================
package frameless

import scala.annotation.implicitNotFound

/**
  * Spark's variance and stddev functions always return Double
  */
@implicitNotFound("Cannot compute variance on type ${A}.")
trait CatalystVariance[A]

object CatalystVariance {
  private[this] val theInstance = new CatalystVariance[Any] {}
  private[this] def of[A]: CatalystVariance[A] = theInstance.asInstanceOf[CatalystVariance[A]]

  implicit val framelessIntVariance       : CatalystVariance[Int]        = of[Int]
  implicit val framelessLongVariance      : CatalystVariance[Long]       = of[Long]
  implicit val framelessShortVariance     : CatalystVariance[Short]      = of[Short]
  implicit val framelessBigDecimalVariance: CatalystVariance[BigDecimal] = of[BigDecimal]
  implicit val framelessDoubleVariance    : CatalystVariance[Double]     = of[Double]
}


================================================
FILE: core/src/main/scala/frameless/Injection.scala
================================================
package frameless

/**
 * An Injection[A, B] is a reversible function from A to B.
 *
 * Must obey `forAll { a: A => invert(apply(a)) == a }`.
 */
trait Injection[A, B] extends Serializable {
  def apply(a: A): B
  def invert(b: B): A
}

object Injection {

  def apply[A, B](f: A => B, g: B => A): Injection[A, B] = new Injection[A, B] {
    def apply(a: A): B = f(a)
    def invert(b: B): A = g(b)
  }
}


================================================
FILE: core/src/main/scala/frameless/SQLDate.scala
================================================
package frameless

/**
 * Type for the internal Spark representation of SQL date. If the `spark.sql.functions` where typed,
 * [date_add][1] would for instance be defined as `def date_add(d: SQLDate, i: Int); SQLDate`.
 *
 * [1]: https://spark.apache.org/docs/2.0.2/api/java/org/apache/spark/sql/functions.html#add_months(org.apache.spark.sql.Column,%20int)
 */
case class SQLDate(days: Int)


================================================
FILE: core/src/main/scala/frameless/SQLTimestamp.scala
================================================
package frameless

/**
 * Type for the Spark internal representation of a timestamp. If the `spark.sql.functions` where typed,
 * [current_timestamp][1] would for instance be defined as `def current_timestamp(): SQLTimestamp`.
 *
 * [1]: https://spark.apache.org/docs/1.6.2/api/java/org/apache/spark/sql/functions.html#current_timestamp()
 */
case class SQLTimestamp(us: Long)


================================================
FILE: dataset/src/main/scala/frameless/FramelessSyntax.scala
================================================
package frameless

import org.apache.spark.sql.{Column, DataFrame, Dataset}

trait FramelessSyntax {
  implicit class ColumnSyntax(self: Column) {
    def typedColumn[T, U: TypedEncoder]: TypedColumn[T, U] = new TypedColumn[T, U](self)
    def typedAggregate[T, U: TypedEncoder]: TypedAggregate[T, U] = new TypedAggregate[T, U](self)
  }

  implicit class DatasetSyntax[T: TypedEncoder](self: Dataset[T]) {
    def typed: TypedDataset[T] = TypedDataset.create[T](self)
  }

  implicit class DataframeSyntax(self: DataFrame){
    def unsafeTyped[T: TypedEncoder]: TypedDataset[T] = TypedDataset.createUnsafe(self)
  }
}


================================================
FILE: dataset/src/main/scala/frameless/InjectionEnum.scala
================================================
package frameless

import shapeless._

trait InjectionEnum {
  implicit val cnilInjectionEnum: Injection[CNil, String] =
    Injection(
      // $COVERAGE-OFF$No value of type CNil so impossible to test
      _ => throw new Exception("Impossible"),
      // $COVERAGE-ON$
      name =>
        throw new IllegalArgumentException(
          s"Cannot construct a value of type CNil: $name did not match data constructor names"
        )
    )

  implicit def coproductInjectionEnum[H, T <: Coproduct](
    implicit
    typeable: Typeable[H] ,
    gen: Generic.Aux[H, HNil],
    tInjectionEnum: Injection[T, String]
    ): Injection[H :+: T, String] = {
    val dataConstructorName = typeable.describe.takeWhile(_ != '.')

    Injection(
      {
        case Inl(_) => dataConstructorName
        case Inr(t) => tInjectionEnum.apply(t)
      },
      { name =>
        if (name == dataConstructorName)
          Inl(gen.from(HNil))
        else
          Inr(tInjectionEnum.invert(name))
      }
    )
  }

  implicit def genericInjectionEnum[A, R](
    implicit
    gen: Generic.Aux[A, R],
    rInjectionEnum: Injection[R, String]
    ): Injection[A, String] =
    Injection(
      value => rInjectionEnum(gen.to(value)),
      name => gen.from(rInjectionEnum.invert(name))
    )
}


================================================
FILE: dataset/src/main/scala/frameless/IsValueClass.scala
================================================
package frameless

import shapeless._
import shapeless.labelled.FieldType

/** Evidence that `T` is a Value class */
@annotation.implicitNotFound(msg = "${T} is not a Value class")
final class IsValueClass[T] private() {}

object IsValueClass {
  /** Provides an evidence `A` is a Value class */
  implicit def apply[A <: AnyVal, G <: ::[_, HNil], H <: ::[_ <: FieldType[_ <: Symbol, _], HNil]](
    implicit
      i0: LabelledGeneric.Aux[A, G],
    i1: DropUnitValues.Aux[G, H]): IsValueClass[A] = new IsValueClass[A]

}


================================================
FILE: dataset/src/main/scala/frameless/Job.scala
================================================
package frameless

import org.apache.spark.sql.SparkSession

sealed abstract class Job[A](implicit spark: SparkSession) { self =>
  /** Runs a new Spark job. */
  def run(): A

  def withGroupId(groupId: String): Job[A] = {
    withLocalProperty("spark.jobGroup.id", groupId)
  }

  def withDescription(groupId: String): Job[A] = {
    withLocalProperty("spark.job.description", groupId)
  }

  def withLocalProperty(key: String, value: String): Job[A] = {
    new Job[A] {
      def run(): A = {
        spark.sparkContext.setLocalProperty(key, value)
        self.run()
      }
    }
  }

  def map[B](fn: A => B): Job[B] = new Job[B]()(spark) {
    def run(): B = fn(Job.this.run())
  }

  def flatMap[B](fn: A => Job[B]): Job[B] = new Job[B]()(spark) {
    def run(): B = fn(Job.this.run()).run()
  }
}


object Job {
  def apply[A](a: => A)(implicit spark: SparkSession): Job[A] = new Job[A] {
    def run(): A = a
  }

  implicit val framelessSparkDelayForJob: SparkDelay[Job] = new SparkDelay[Job] {
    def delay[A](a: => A)(implicit spark: SparkSession): Job[A] = Job(a)
  }
}


================================================
FILE: dataset/src/main/scala/frameless/RecordEncoder.scala
================================================
package frameless

import org.apache.spark.sql.FramelessInternals

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.objects.{
  Invoke, NewInstance, UnwrapOption, WrapOption
}
import org.apache.spark.sql.types._

import shapeless._
import shapeless.labelled.FieldType
import shapeless.ops.hlist.IsHCons
import shapeless.ops.record.Keys

import scala.reflect.ClassTag

case class RecordEncoderField(
  ordinal: Int,
  name: String,
  encoder: TypedEncoder[_]
)

trait RecordEncoderFields[T <: HList] extends Serializable {
  def value: List[RecordEncoderField]

  override def toString: String =
    s"""RecordEncoderFields${value.mkString("[", ", ", "]")}"""
}

object RecordEncoderFields {

  implicit def deriveRecordLast[K <: Symbol, H]
    (implicit
      key: Witness.Aux[K],
      head: RecordFieldEncoder[H]
    ): RecordEncoderFields[FieldType[K, H] :: HNil] = new RecordEncoderFields[FieldType[K, H] :: HNil] {
      def value: List[RecordEncoderField] = fieldEncoder[K, H] :: Nil
    }

  implicit def deriveRecordCons[K <: Symbol, H, T <: HList]
    (implicit
      key: Witness.Aux[K],
      head: RecordFieldEncoder[H],
      tail: RecordEncoderFields[T]
    ): RecordEncoderFields[FieldType[K, H] :: T] = new RecordEncoderFields[FieldType[K, H] :: T] {
      def value: List[RecordEncoderField] =
        fieldEncoder[K, H] :: tail.value.map(x => x.copy(ordinal = x.ordinal + 1))
  }

  private def fieldEncoder[K <: Symbol, H](implicit key: Witness.Aux[K], e: RecordFieldEncoder[H]): RecordEncoderField = RecordEncoderField(0, key.value.name, e.encoder)
}

/**
  * Assists the generation of constructor call parameters from a labelled generic representation.
  * As Unit typed fields were removed earlier, we need to put back unit literals in the  appropriate positions.
  *
  * @tparam T labelled generic representation of type fields
  */
trait NewInstanceExprs[T <: HList] extends Serializable {
  def from(exprs: List[Expression]): Seq[Expression]
}

object NewInstanceExprs {

  implicit def deriveHNil: NewInstanceExprs[HNil] = new NewInstanceExprs[HNil] {
    def from(exprs: List[Expression]): Seq[Expression] = Nil
  }

  implicit def deriveUnit[K <: Symbol, T <: HList]
    (implicit
      tail: NewInstanceExprs[T]
    ): NewInstanceExprs[FieldType[K, Unit] :: T] = new NewInstanceExprs[FieldType[K, Unit] :: T] {
      def from(exprs: List[Expression]): Seq[Expression] =
        Literal.fromObject(()) +: tail.from(exprs)
    }

  implicit def deriveNonUnit[K <: Symbol, V, T <: HList]
    (implicit
      notUnit: V =:!= Unit,
      tail: NewInstanceExprs[T]
    ): NewInstanceExprs[FieldType[K, V] :: T] = new NewInstanceExprs[FieldType[K, V] :: T] {
      def from(exprs: List[Expression]): Seq[Expression] = exprs.head +: tail.from(exprs.tail)
    }
}

/**
  * Drops fields with Unit type from labelled generic representation of types.
  *
  * @tparam L labelled generic representation of type fields
  */
trait DropUnitValues[L <: HList] extends DepFn1[L] with Serializable { type Out <: HList }

object DropUnitValues {
  def apply[L <: HList](implicit dropUnitValues: DropUnitValues[L]): Aux[L, dropUnitValues.Out] = dropUnitValues

  type Aux[L <: HList, Out0 <: HList] = DropUnitValues[L] { type Out = Out0 }

  implicit def deriveHNil[H]: Aux[HNil, HNil] = new DropUnitValues[HNil] {
    type Out = HNil
    def apply(l: HNil): Out = HNil
  }

  implicit def deriveUnit[K <: Symbol, T <: HList, OutT <: HList]
    (implicit
      dropUnitValues : DropUnitValues.Aux[T, OutT]
    ): Aux[FieldType[K, Unit] :: T, OutT] = new DropUnitValues[FieldType[K, Unit] :: T] {
      type Out = OutT
      def apply(l : FieldType[K, Unit] :: T): Out = dropUnitValues(l.tail)
    }

  implicit def deriveNonUnit[K <: Symbol, V, T <: HList, OutH, OutT <: HList]
    (implicit
      nonUnit: V =:!= Unit,
      dropUnitValues : DropUnitValues.Aux[T, OutT]
    ): Aux[FieldType[K, V] :: T, FieldType[K, V] :: OutT] = new DropUnitValues[FieldType[K, V] :: T] {
      type Out = FieldType[K, V] :: OutT
      def apply(l : FieldType[K, V] :: T): Out = l.head :: dropUnitValues(l.tail)
    }
}

class RecordEncoder[F, G <: HList, H <: HList]
  (implicit
    i0: LabelledGeneric.Aux[F, G],
    i1: DropUnitValues.Aux[G, H],
    i2: IsHCons[H],
    fields: Lazy[RecordEncoderFields[H]],
    newInstanceExprs: Lazy[NewInstanceExprs[G]],
    classTag: ClassTag[F]
  ) extends TypedEncoder[F] {
    def nullable: Boolean = false

    def jvmRepr: DataType = FramelessInternals.objectTypeFor[F]

    def catalystRepr: DataType = {
      val structFields = fields.value.value.map { field =>
        StructField(
          name = field.name,
          dataType = field.encoder.catalystRepr,
          nullable = field.encoder.nullable,
          metadata = Metadata.empty
        )
      }

      StructType(structFields)
    }

    def toCatalyst(path: Expression): Expression = {
      val nameExprs = fields.value.value.map { field =>
        Literal(field.name)
      }

      val valueExprs = fields.value.value.map { field =>
        val fieldPath = Invoke(path, field.name, field.encoder.jvmRepr, Nil)
        field.encoder.toCatalyst(fieldPath)
      }

      // the way exprs are encoded in CreateNamedStruct
      val exprs = nameExprs.zip(valueExprs).flatMap {
        case (nameExpr, valueExpr) => nameExpr :: valueExpr :: Nil
      }

      val createExpr = CreateNamedStruct(exprs)
      val nullExpr = Literal.create(null, createExpr.dataType)

      If(IsNull(path), nullExpr, createExpr)
    }

    def fromCatalyst(path: Expression): Expression = {
      val exprs = fields.value.value.map { field =>
        field.encoder.fromCatalyst(
          GetStructField(path, field.ordinal, Some(field.name)))
      }

      val newArgs = newInstanceExprs.value.from(exprs)
      val newExpr = NewInstance(
        classTag.runtimeClass, newArgs, jvmRepr, propagateNull = true)

      val nullExpr = Literal.create(null, jvmRepr)

      If(IsNull(path), nullExpr, newExpr)
    }
}

final class RecordFieldEncoder[T](
  val encoder: TypedEncoder[T],
  private[frameless] val jvmRepr: DataType,
  private[frameless] val fromCatalyst: Expression => Expression,
  private[frameless] val toCatalyst: Expression => Expression
) extends Serializable

object RecordFieldEncoder extends RecordFieldEncoderLowPriority {

  /**
   * @tparam F the value class
   * @tparam G the single field of the value class
   * @tparam H the single field of the value class (with guarantee it's not a `Unit` value)
   * @tparam K the key type for the fields
   * @tparam V the inner value type
   */
  implicit def optionValueClass[F : IsValueClass, G <: ::[_, HNil], H <: ::[_ <: FieldType[_ <: Symbol, _], HNil], K <: Symbol, V, KS <: ::[_ <: Symbol, HNil]]
    (implicit
      i0: LabelledGeneric.Aux[F, G],
      i1: DropUnitValues.Aux[G, H],
      i2: IsHCons.Aux[H, _ <: FieldType[K, V], HNil],
      i3: Keys.Aux[H, KS],
      i4: IsHCons.Aux[KS, K, HNil],
      i5: TypedEncoder[V],
      i6: ClassTag[F]
    ): RecordFieldEncoder[Option[F]] = {
      val fieldName = i4.head(i3()).name
      val innerJvmRepr = ObjectType(i6.runtimeClass)

      val catalyst: Expression => Expression = { path =>
        val value = UnwrapOption(innerJvmRepr, path)
        val javaValue = Invoke(value, fieldName, i5.jvmRepr, Nil)

        i5.toCatalyst(javaValue)
      }

      val fromCatalyst: Expression => Expression = { path =>
        val javaValue = i5.fromCatalyst(path)
        val value = NewInstance(i6.runtimeClass, Seq(javaValue), innerJvmRepr)

        WrapOption(value, innerJvmRepr)
      }

      val jvmr = ObjectType(classOf[Option[F]])

      new RecordFieldEncoder[Option[F]](
        encoder = new TypedEncoder[Option[F]] {
          val nullable = true

          val jvmRepr = jvmr

          @inline def catalystRepr: DataType = i5.catalystRepr

          def fromCatalyst(path: Expression): Expression = {
            val javaValue = i5.fromCatalyst(path)
            val value = NewInstance(
              i6.runtimeClass, Seq(javaValue), innerJvmRepr)

            WrapOption(value, innerJvmRepr)
          }

          def toCatalyst(path: Expression): Expression = catalyst(path)

          override def toString: String = s"RecordFieldEncoder.optionValueClass[${i6.runtimeClass.getName}]('${fieldName}', $i5)"
        },
        jvmRepr = jvmr,
        fromCatalyst = fromCatalyst,
        toCatalyst = catalyst
      )
  }

  /**
   * @tparam F the value class
   * @tparam G the single field of the value class
   * @tparam H the single field of the value class (with guarantee it's not a `Unit` value)
   * @tparam V the inner value type
   */
  implicit def valueClass[F : IsValueClass, G <: ::[_, HNil], H <: ::[_ <: FieldType[_ <: Symbol, _], HNil], K <: Symbol, V, KS <: ::[_ <: Symbol, HNil]]
    (implicit
      i0: LabelledGeneric.Aux[F, G],
      i1: DropUnitValues.Aux[G, H],
      i2: IsHCons.Aux[H, _ <: FieldType[K, V], HNil],
      i3: Keys.Aux[H, KS],
      i4: IsHCons.Aux[KS, K, HNil],
      i5: TypedEncoder[V],
      i6: ClassTag[F]
    ): RecordFieldEncoder[F] = {
      val cls = i6.runtimeClass
      val jvmr = i5.jvmRepr
      val fieldName = i4.head(i3()).name

      new RecordFieldEncoder[F](
        encoder = new TypedEncoder[F] {
          def nullable = i5.nullable

          def jvmRepr = jvmr

          def catalystRepr: DataType = i5.catalystRepr

          def fromCatalyst(path: Expression): Expression =
            i5.fromCatalyst(path)

          @inline def toCatalyst(path: Expression): Expression =
            i5.toCatalyst(path)

          override def toString: String = s"RecordFieldEncoder.valueClass[${cls.getName}]('${fieldName}', ${i5})"
        },
        jvmRepr = FramelessInternals.objectTypeFor[F],
        fromCatalyst = { expr: Expression =>
          NewInstance(
            i6.runtimeClass,
            i5.fromCatalyst(expr) :: Nil,
            ObjectType(i6.runtimeClass))
        },
        toCatalyst = { expr: Expression =>
          i5.toCatalyst(Invoke(expr, fieldName, jvmr))
        }
      )
  }
}

private[frameless] sealed trait RecordFieldEncoderLowPriority {
  implicit def apply[T](implicit e: TypedEncoder[T]): RecordFieldEncoder[T] = new RecordFieldEncoder[T](e, e.jvmRepr, e.fromCatalyst, e.toCatalyst)
}


================================================
FILE: dataset/src/main/scala/frameless/SparkDelay.scala
================================================
package frameless

import org.apache.spark.sql.SparkSession

trait SparkDelay[F[_]] {
  def delay[A](a: => A)(implicit spark: SparkSession): F[A]
}


================================================
FILE: dataset/src/main/scala/frameless/TypedColumn.scala
================================================
package frameless

import frameless.functions.{litAggr, lit => flit}
import frameless.syntax._

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.DecimalType
import org.apache.spark.sql.{Column, FramelessInternals}

import shapeless._
import shapeless.ops.record.Selector

import scala.annotation.implicitNotFound
import scala.reflect.ClassTag

import scala.language.experimental.macros

sealed trait UntypedExpression[T] {
  def expr: Expression
  def uencoder: TypedEncoder[_]
  override def toString: String = expr.toString()
}

/** Expression used in `select`-like constructions.
  */
sealed class TypedColumn[T, U](expr: Expression)(
  implicit val uenc: TypedEncoder[U]
) extends AbstractTypedColumn[T, U](expr) {

  type ThisType[A, B] = TypedColumn[A, B]

  def this(column: Column)(implicit uencoder: TypedEncoder[U]) =
    this(FramelessInternals.expr(column))

  override def typed[W, U1: TypedEncoder](c: Column): TypedColumn[W, U1] = c.typedColumn

  override def lit[U1: TypedEncoder](c: U1): TypedColumn[T, U1] = flit(c)
}

/** Expression used in `agg`-like constructions.
  */
sealed class TypedAggregate[T, U](expr: Expression)(
  implicit val uenc: TypedEncoder[U]
) extends AbstractTypedColumn[T, U](expr) {

  type ThisType[A, B] = TypedAggregate[A, B]

  def this(column: Column)(implicit uencoder: TypedEncoder[U]) = {
    this(FramelessInternals.expr(column))
  }

  override def typed[W, U1: TypedEncoder](c: Column): TypedAggregate[W, U1] = c.typedAggregate

  override def lit[U1: TypedEncoder](c: U1): TypedAggregate[T, U1] = litAggr(c)
}

/** Generic representation of a typed column. A typed column can either be a [[TypedAggregate]] or
  * a [[frameless.TypedColumn]].
  *
  * Documentation marked "apache/spark" is thanks to apache/spark Contributors
  * at https://github.com/apache/spark, licensed under Apache v2.0 available at
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * @tparam T phantom type representing the dataset on which this columns is
  *           selected. When `T = A with B` the selection is on either A or B.
  * @tparam U type of column
  */
abstract class AbstractTypedColumn[T, U]
  (val expr: Expression)
  (implicit val uencoder: TypedEncoder[U])
    extends UntypedExpression[T] { self =>

  type ThisType[A, B] <: AbstractTypedColumn[A, B]

  /** A helper class to make to simplify working with Optional fields.
    *
    * {{{
    *    val x: TypedColumn[Option[Int]] = _
    *    x.opt.map(_*2) // This only compiles if the type of x is Option[X] (in this example X is of type Int)
    * }}}
    *
    * @note Known issue: map() will NOT work when the applied function is a udf().
    *       It will compile and then throw a runtime error.
    **/
  trait Mapper[X] {
    def map[G, OutputType[_,_]](u: ThisType[T, X] => OutputType[T,G])
      (implicit
        ev: OutputType[T,G] <:< AbstractTypedColumn[T, G]
      ): OutputType[T, Option[G]] = {
      u(self.asInstanceOf[ThisType[T, X]]).asInstanceOf[OutputType[T, Option[G]]]
    }
  }

  /** Makes it easier to work with Optional columns. It returns an instance of `Mapper[X]`
    * where `X` is type of the unwrapped Optional. E.g., in the case of `Option[Long]`,
    * `X` is of type Long.
    *
    * {{{
    *    val x: TypedColumn[Option[Int]] = _
    *    x.opt.map(_*2)
    * }}}
    * */
  def opt[X](implicit x: U <:< Option[X]): Mapper[X] = new Mapper[X] {}

  /** Fall back to an untyped Column */
  def untyped: Column = new Column(expr)

  private def equalsTo[TT, W](other: ThisType[TT, U])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] = typed {
    if (uencoder.nullable) EqualNullSafe(self.expr, other.expr)
    else EqualTo(self.expr, other.expr)
  }

  /** Creates a typed column of either TypedColumn or TypedAggregate from an expression. */
  protected def typed[W, U1: TypedEncoder](e: Expression): ThisType[W, U1] =
    typed(new Column(e))

  /** Creates a typed column of either TypedColumn or TypedAggregate. */
  def typed[W, U1: TypedEncoder](c: Column): ThisType[W, U1]

  /** Creates a typed column of either TypedColumn or TypedAggregate. */
  def lit[U1: TypedEncoder](c: U1): ThisType[T, U1]

  /** Equality test.
    * {{{
    *   df.filter( df.col('a) === 1 )
    * }}}
    *
    * apache/spark
    */
  def ===(u: U): ThisType[T, Boolean] =
    equalsTo(lit(u))

  /** Equality test.
    * {{{
    *   df.filter( df.col('a) === df.col('b) )
    * }}}
    *
    * apache/spark
    */
  def ===[TT, W](other: ThisType[TT, U])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    equalsTo(other)

  /** Inequality test.
    * 
    * {{{
    * df.filter(df.col('a) =!= df.col('b))
    * }}}
    *
    * apache/spark
    */
  def =!=[TT, W](other: ThisType[TT, U])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(Not(equalsTo(other).expr))

  /** Inequality test.
    * 
    * {{{
    * df.filter(df.col('a) =!= "a")
    * }}}
    *
    * apache/spark
    */
  def =!=(u: U): ThisType[T, Boolean] = typed(Not(equalsTo(lit(u)).expr))

  /** True if the current expression is an Option and it's None.
    *
    * apache/spark
    */
  def isNone(implicit i0: U <:< Option[_]): ThisType[T, Boolean] =
    typed(IsNull(expr))

  /** True if the current expression is an Option and it's not None.
    *
    * apache/spark
    */
  def isNotNone(implicit i0: U <:< Option[_]): ThisType[T, Boolean] =
    typed(IsNotNull(expr))

  /** True if the current expression is a fractional number and is not NaN.
    *
    * apache/spark
    */
  def isNaN(implicit n: CatalystNaN[U]): ThisType[T, Boolean] =
    typed(self.untyped.isNaN)

  /**
    * True if the value for this optional column `exists` as expected
    * (see `Option.exists`).
    * 
    * {{{
    * df.col('opt).isSome(_ === someOtherCol)
    * }}}
    */
  def isSome[V](exists: ThisType[T, V] => ThisType[T, Boolean])(implicit i0: U <:< Option[V]): ThisType[T, Boolean] = someOr[V](exists, false)

  /**
    * True if the value for this optional column `exists` as expected,
    * or is `None`. (see `Option.forall`).
    * 
    * {{{
    * df.col('opt).isSomeOrNone(_ === someOtherCol)
    * }}}
    */
  def isSomeOrNone[V](exists: ThisType[T, V] => ThisType[T, Boolean])(implicit i0: U <:< Option[V]): ThisType[T, Boolean] = someOr[V](exists, true)

  private def someOr[V](exists: ThisType[T, V] => ThisType[T, Boolean], default: Boolean)(implicit i0: U <:< Option[V]): ThisType[T, Boolean] = {
    val defaultExpr = if (default) Literal.TrueLiteral else Literal.FalseLiteral

    typed(Coalesce(Seq(opt(i0).map(exists).expr, defaultExpr)))
  }

  /** Convert an Optional column by providing a default value.
    * 
    * {{{
    * df(df('opt).getOrElse(df('defaultValue)))
    * }}}
    */
  def getOrElse[TT, W, Out](default: ThisType[TT, Out])(implicit i0: U =:= Option[Out], i1: With.Aux[T, TT, W]): ThisType[W, Out] =
    typed(Coalesce(Seq(expr, default.expr)))(default.uencoder)

  /** Convert an Optional column by providing a default value.
    * 
    * {{{
    *   df( df('opt).getOrElse(defaultConstant) )
    * }}}
    */
  def getOrElse[Out: TypedEncoder](default: Out)(implicit i0: U =:= Option[Out]): ThisType[T, Out] =
    getOrElse(lit[Out](default))

  /** Sum of this expression and another expression.
    * 
    * {{{
    *   // The following selects the sum of a person's height and weight.
    *   people.select( people.col('height) plus people.col('weight) )
    * }}}
    *
    * apache/spark
    */
  def plus[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    typed(self.untyped.plus(other.untyped))

  /** Sum of this expression and another expression.
    * {{{
    *   // The following selects the sum of a person's height and weight.
    *   people.select( people.col('height) + people.col('weight) )
    * }}}
    *
    * apache/spark
    */
  def +[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    plus(other)

  /** Sum of this expression (column) with a constant.
    * {{{
    *   // The following selects the sum of a person's height and weight.
    *   people.select( people('height) + 2 )
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def +(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] =
    typed(self.untyped.plus(u))

  /**
    * Inversion of boolean expression, i.e. NOT.
    * {{{
    *   // Select rows that are not active (isActive === false)
    *   df.filter( !df('isActive) )
    * }}}
    *
    * apache/spark
    */
  def unary_!(implicit i0: U <:< Boolean): ThisType[T, Boolean] =
    typed(!untyped)

  /** Unary minus, i.e. negate the expression.
    * {{{
    *   // Select the amount column and negates all values.
    *   df.select( -df('amount) )
    * }}}
    *
    * apache/spark
    */
  def unary_-(implicit n: CatalystNumeric[U]): ThisType[T, U] =
    typed(-self.untyped)

  /** Subtraction. Subtract the other expression from this expression.
    * {{{
    *   // The following selects the difference between people's height and their weight.
    *   people.select( people.col('height) minus people.col('weight) )
    * }}}
    *
    * apache/spark
    */
  def minus[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    typed(self.untyped.minus(other.untyped))

  /** Subtraction. Subtract the other expression from this expression.
    * {{{
    *   // The following selects the difference between people's height and their weight.
    *   people.select( people.col('height) - people.col('weight) )
    * }}}
    *
    * apache/spark
    */
  def -[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    minus(other)

  /** Subtraction. Subtract the other expression from this expression.
    * {{{
    *   // The following selects the difference between people's height and their weight.
    *   people.select( people('height) - 1 )
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def -(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] =
    typed(self.untyped.minus(u))

  /** Multiplication of this expression and another expression.
    * {{{
    *   // The following multiplies a person's height by their weight.
    *   people.select( people.col('height) multiply people.col('weight) )
    * }}}
    *
    * apache/spark
    */
  def multiply[TT, W]
    (other: ThisType[TT, U])
    (implicit
      n: CatalystNumeric[U],
      w: With.Aux[T, TT, W],
      t: ClassTag[U]
    ): ThisType[W, U] = typed {
      if (t.runtimeClass == BigDecimal(0).getClass) {
        // That's apparently the only way to get sound multiplication.
        // See https://issues.apache.org/jira/browse/SPARK-22036
        val dt = DecimalType(20, 14)
        self.untyped.cast(dt).multiply(other.untyped.cast(dt))
      } else {
        self.untyped.multiply(other.untyped)
      }
    }

  /** Multiplication of this expression and another expression.
    * {{{
    *   // The following multiplies a person's height by their weight.
    *   people.select( people.col('height) * people.col('weight) )
    * }}}
    *
    * apache/spark
    */
  def *[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W], t: ClassTag[U]): ThisType[W, U] =
    multiply(other)

  /** Multiplication of this expression a constant.
    * {{{
    *   // The following multiplies a person's height by their weight.
    *   people.select( people.col('height) * people.col('weight) )
    * }}}
    *
    * apache/spark
    */
  def *(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] =
    typed(self.untyped.multiply(u))

  /** Modulo (a.k.a. remainder) expression.
    *
    * apache/spark
    */
  def mod[Out: TypedEncoder, TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, Out] =
    typed(self.untyped.mod(other.untyped))

  /** Modulo (a.k.a. remainder) expression.
    *
    * apache/spark
    */
  def %[TT, W](other: ThisType[TT, U])(implicit n: CatalystNumeric[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    mod(other)

  /** Modulo (a.k.a. remainder) expression.
    *
    * apache/spark
    */
  def %(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, U] =
    typed(self.untyped.mod(u))

  /** Division this expression by another expression.
    * {{{
    *   // The following divides a person's height by their weight.
    *   people.select( people('height) / people('weight) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def divide[Out: TypedEncoder, TT, W](other: ThisType[TT, U])(implicit n: CatalystDivisible[U, Out], w: With.Aux[T, TT, W]): ThisType[W, Out] =
    typed(self.untyped.divide(other.untyped))

  /** Division this expression by another expression.
    * {{{
    *   // The following divides a person's height by their weight.
    *   people.select( people('height) / people('weight) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def /[Out, TT, W](other: ThisType[TT, U])(implicit n: CatalystDivisible[U, Out], e: TypedEncoder[Out], w: With.Aux[T, TT, W]): ThisType[W, Out] =
    divide(other)

  /** Division this expression by another expression.
    * {{{
    *   // The following divides a person's height by their weight.
    *   people.select( people('height) / 2 )
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def /(u: U)(implicit n: CatalystNumeric[U]): ThisType[T, Double] =
    typed(self.untyped.divide(u))

  /** Returns a descending ordering used in sorting
    *
    * apache/spark
    */
  def desc(implicit catalystOrdered: CatalystOrdered[U]): SortedTypedColumn[T, U] =
    new SortedTypedColumn[T, U](untyped.desc)

  /** Returns an ascending ordering used in sorting
    *
    * apache/spark
    */
  def asc(implicit catalystOrdered: CatalystOrdered[U]): SortedTypedColumn[T, U] =
    new SortedTypedColumn[T, U](untyped.asc)

  /** Bitwise AND this expression and another expression.
    * {{{
    *   df.select(df.col('colA) bitwiseAND (df.col('colB)))
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def bitwiseAND(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
    typed(self.untyped.bitwiseAND(u))

  /** Bitwise AND this expression and another expression.
    * {{{
    *   df.select(df.col('colA) bitwiseAND (df.col('colB)))
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def bitwiseAND[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    typed(self.untyped.bitwiseAND(other.untyped))

  /** Bitwise AND this expression and another expression (of same type).
    * {{{
    *   df.select(df.col('colA).cast[Int] & -1)
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def &(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
    bitwiseAND(u)

  /** Bitwise AND this expression and another expression.
    * {{{
    *   df.select(df.col('colA) & (df.col('colB)))
    * }}}
    *
    * @param other a constant of the same type
    * apache/spark
    */
  def &[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    bitwiseAND(other)

  /** Bitwise OR this expression and another expression.
    * {{{
    *   df.select(df.col('colA) bitwiseOR (df.col('colB)))
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def bitwiseOR(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
    typed(self.untyped.bitwiseOR(u))

  /** Bitwise OR this expression and another expression.
    * {{{
    *   df.select(df.col('colA) bitwiseOR (df.col('colB)))
    * }}}
    *
    * @param other a constant of the same type
    * apache/spark
    */
  def bitwiseOR[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    typed(self.untyped.bitwiseOR(other.untyped))

  /** Bitwise OR this expression and another expression (of same type).
    * {{{
    *   df.select(df.col('colA).cast[Long] | 1L)
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def |(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
    bitwiseOR(u)

  /** Bitwise OR this expression and another expression.
    * {{{
    *   df.select(df.col('colA) | (df.col('colB)))
    * }}}
    *
    * @param other a constant of the same type
    * apache/spark
    */
  def |[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    bitwiseOR(other)

  /** Bitwise XOR this expression and another expression.
    * {{{
    *   df.select(df.col('colA) bitwiseXOR (df.col('colB)))
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def bitwiseXOR(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
    typed(self.untyped.bitwiseXOR(u))

  /** Bitwise XOR this expression and another expression.
    * {{{
    *   df.select(df.col('colA) bitwiseXOR (df.col('colB)))
    * }}}
    *
    * @param other a constant of the same type
    * apache/spark
    */
  def bitwiseXOR[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    typed(self.untyped.bitwiseXOR(other.untyped))

  /** Bitwise XOR this expression and another expression (of same type).
    * {{{
    *   df.select(df.col('colA).cast[Long] ^ 1L)
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def ^(u: U)(implicit n: CatalystBitwise[U]): ThisType[T, U] =
    bitwiseXOR(u)

  /** Bitwise XOR this expression and another expression.
    * {{{
    *   df.select(df.col('colA) ^ (df.col('colB)))
    * }}}
    *
    * @param other a constant of the same type
    * apache/spark
    */
  def ^[TT, W](other: ThisType[TT, U])(implicit n: CatalystBitwise[U], w: With.Aux[T, TT, W]): ThisType[W, U] =
    bitwiseXOR(other)

  /** Casts the column to a different type.
    * {{{
    *   df.select(df('a).cast[Int])
    * }}}
    */
  def cast[A: TypedEncoder](implicit c: CatalystCast[U, A]): ThisType[T, A] =
    typed(self.untyped.cast(TypedEncoder[A].catalystRepr))

  /**
    * An expression that returns a substring
    * {{{
    *   df.select(df('a).substr(0, 5))
    * }}}
    *
    * @param startPos starting position
    * @param len length of the substring
    */
  def substr(startPos: Int, len: Int)(implicit ev: U =:= String): ThisType[T, String] =
    typed(self.untyped.substr(startPos, len))

  /**
    * An expression that returns a substring
    * {{{
    *   df.select(df('a).substr(df('b), df('c)))
    * }}}
    *
    * @param startPos expression for the starting position
    * @param len expression for the length of the substring
    */
  def substr[TT1, TT2, W1, W2](startPos: ThisType[TT1, Int], len: ThisType[TT2, Int])
                   (implicit
                    ev: U =:= String,
                    w1: With.Aux[T, TT1, W1],
                    w2: With.Aux[W1, TT2, W2]): ThisType[W2, String] =
    typed(self.untyped.substr(startPos.untyped, len.untyped))

  /** SQL like expression. Returns a boolean column based on a SQL LIKE match.
    * {{{
    *   val ds = TypedDataset.create(X2("foo", "bar") :: Nil)
    *   // true
    *   ds.select(ds('a).like("foo"))
    *
    *   // Selected column has value "bar"
    *   ds.select(when(ds('a).like("f"), ds('a)).otherwise(ds('b))
    * }}}
    * apache/spark
    */
  def like(literal: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
    typed(self.untyped.like(literal))

  /** SQL RLIKE expression (LIKE with Regex). Returns a boolean column based on a regex match.
    * {{{
    *   val ds = TypedDataset.create(X1("foo") :: Nil)
    *   // true
    *   ds.select(ds('a).rlike("foo"))
    *
    *   // true
    *   ds.select(ds('a).rlike(".*))
    * }}}
    * apache/spark
    */
  def rlike(literal: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
    typed(self.untyped.rlike(literal))

  /** String contains another string literal.
    * {{{
    *   df.filter ( df.col('a).contains("foo") )
    * }}}
    *
    * @param other a string that is being tested against.
    * apache/spark
    */
  def contains(other: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
    typed(self.untyped.contains(other))

  /** String contains.
    * {{{
    *   df.filter ( df.col('a).contains(df.col('b) )
    * }}}
    *
    * @param other a column which values is used as a string that is being tested against.
    * apache/spark
    */
  def contains[TT, W](other: ThisType[TT, U])(implicit ev: U =:= String, w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped.contains(other.untyped))

  /** String starts with another string literal.
    * {{{
    *   df.filter ( df.col('a).startsWith("foo")
    * }}}
    *
    * @param other a prefix that is being tested against.
    * apache/spark
    */
  def startsWith(other: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
    typed(self.untyped.startsWith(other))

  /** String starts with.
    * {{{
    *   df.filter ( df.col('a).startsWith(df.col('b))
    * }}}
    *
    * @param other a column which values is used as a prefix that is being tested against.
    * apache/spark
    */
  def startsWith[TT, W](other: ThisType[TT, U])(implicit ev: U =:= String, w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped.startsWith(other.untyped))

  /** String ends with another string literal.
    * {{{
    *   df.filter ( df.col('a).endsWith("foo")
    * }}}
    *
    * @param other a suffix that is being tested against.
    * apache/spark
    */
  def endsWith(other: String)(implicit ev: U =:= String): ThisType[T, Boolean] =
    typed(self.untyped.endsWith(other))

  /** String ends with.
    * {{{
    *   df.filter ( df.col('a).endsWith(df.col('b))
    * }}}
    *
    * @param other a column which values is used as a suffix that is being tested against.
    * apache/spark
    */
  def endsWith[TT, W](other: ThisType[TT, U])(implicit ev: U =:= String, w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped.endsWith(other.untyped))

  /** Boolean AND.
    * {{{
    *   df.filter ( (df.col('a) === 1).and(df.col('b) > 5) )
    * }}}
    */
  def and[TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped.and(other.untyped))

  /** Boolean AND.
    * {{{
    *   df.filter ( df.col('a) === 1 && df.col('b) > 5)
    * }}}
    */
  def && [TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    and(other)

  /** Boolean OR.
    * {{{
    *   df.filter ( (df.col('a) === 1).or(df.col('b) > 5) )
    * }}}
    */
  def or[TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped.or(other.untyped))

  /** Boolean OR.
    * {{{
    *   df.filter ( df.col('a) === 1 || df.col('b) > 5)
    * }}}
    */
  def || [TT, W](other: ThisType[TT, Boolean])(implicit w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    or(other)

  /** Less than.
    * 
    * {{{
    * // The following selects people younger than the maxAge column.
    * df.select(df('age) < df('maxAge) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def <[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped < other.untyped)

  /** Less than or equal to.
    * 
    * {{{
    * // The following selects people younger or equal than the maxAge column.
    * df.select(df('age) <= df('maxAge)
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def <=[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped <= other.untyped)

  /** Greater than.
    * {{{
    *   // The following selects people older than the maxAge column.
    *   df.select( df('age) > df('maxAge) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def >[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped > other.untyped)

  /** Greater than or equal.
    * {{{
    *   // The following selects people older or equal than the maxAge column.
    *   df.select( df('age) >= df('maxAge) )
    * }}}
    *
    * @param other another column of the same type
    * apache/spark
    */
  def >=[TT, W](other: ThisType[TT, U])(implicit i0: CatalystOrdered[U], w: With.Aux[T, TT, W]): ThisType[W, Boolean] =
    typed(self.untyped >= other.untyped)

  /** Less than.
    * {{{
    *   // The following selects people younger than 21.
    *   df.select( df('age) < 21 )
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def <(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
    typed(self.untyped < lit(u)(self.uencoder).untyped)

  /** Less than or equal to.
    * {{{
    *   // The following selects people younger than 22.
    *   df.select( df('age) <= 2 )
    * }}}
    *
    * @param u a constant of the same type
    * apache/spark
    */
  def <=(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
    typed(self.untyped <= lit(u)(self.uencoder).untyped)

  /** Greater than.
    * {{{
    *   // The following selects people older than 21.
    *   df.select( df('age) > 21 )
    * }}}
    *
    * @param u another column of the same type
    * apache/spark
    */
  def >(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
    typed(self.untyped > lit(u)(self.uencoder).untyped)

  /** Greater than or equal.
    * {{{
    *   // The following selects people older than 20.
    *   df.select( df('age) >= 21 )
    * }}}
    *
    * @param u another column of the same type
    * apache/spark
    */
  def >=(u: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
    typed(self.untyped >= lit(u)(self.uencoder).untyped)

  /**
    * Returns true if the value of this column is contained in of the arguments.
    * {{{
    *   // The following selects people with age 15, 20, or 30.
    *   df.select( df('age).isin(15, 20, 30) )
    * }}}
    *
    * @param values are constants of the same type
    * apache/spark
    */
  def isin(values: U*)(implicit e: CatalystIsin[U]): ThisType[T, Boolean] =
    typed(self.untyped.isin(values:_*))

  /**
    * True if the current column is between the lower bound and upper bound, inclusive.
    *
    * @param lowerBound a constant of the same type
    * @param upperBound a constant of the same type
    * apache/spark
    */
  def between(lowerBound: U, upperBound: U)(implicit i0: CatalystOrdered[U]): ThisType[T, Boolean] =
    typed(self.untyped.between(lit(lowerBound)(self.uencoder).untyped, lit(upperBound)(self.uencoder).untyped))

  /**
    * True if the current column is between the lower bound and upper bound, inclusive.
    *
    * @param lowerBound another column of the same type
    * @param upperBound another column of the same type
    * apache/spark
    */
  def between[TT1, TT2, W1, W2](lowerBound: ThisType[TT1, U], upperBound: ThisType[TT2, U])
    (implicit
      i0: CatalystOrdered[U],
      w0: With.Aux[T, TT1, W1],
      w1: With.Aux[TT2, W1, W2]
    ): ThisType[W2, Boolean] =
      typed(self.untyped.between(lowerBound.untyped, upperBound.untyped))

  /**
    * Returns a nested column matching the field `symbol`.
    * 
    * @param symbol the field symbol
    * @tparam V the type of the nested field
    */
  def field[V](symbol: Witness.Lt[Symbol])(implicit
      i0: TypedColumn.Exists[U, symbol.T, V],
      i1: TypedEncoder[V]
    ): ThisType[T, V] = 
    typed(self.untyped.getField(symbol.value.name))

}


sealed class SortedTypedColumn[T, U](val expr: Expression)(
  implicit
  val uencoder: TypedEncoder[U]
) extends UntypedExpression[T] {

  def this(column: Column)(implicit e: TypedEncoder[U]) = {
    this(FramelessInternals.expr(column))
  }

  def untyped: Column = new Column(expr)
}

object SortedTypedColumn {
  implicit def defaultAscending[T, U : CatalystOrdered](typedColumn: TypedColumn[T, U]): SortedTypedColumn[T, U] =
    new SortedTypedColumn[T, U](typedColumn.untyped.asc)(typedColumn.uencoder)

    object defaultAscendingPoly extends Poly1 {
      implicit def caseTypedColumn[T, U : CatalystOrdered] = at[TypedColumn[T, U]](c => defaultAscending(c))
      implicit def caseTypeSortedColumn[T, U] = at[SortedTypedColumn[T, U]](identity)
    }
}

object TypedColumn {
  /** Evidence that type `T` has column `K` with type `V`. */
  @implicitNotFound(msg = "No column ${K} of type ${V} in ${T}")
  trait Exists[T, K, V]

  @implicitNotFound(msg = "No columns ${K} of type ${V} in ${T}")
  trait ExistsMany[T, K <: HList, V]

  object ExistsMany {
    implicit def deriveCons[T, KH, KT <: HList, V0, V1]
      (implicit
        head: Exists[T, KH, V0],
        tail: ExistsMany[V0, KT, V1]
      ): ExistsMany[T, KH :: KT, V1] =
        new ExistsMany[T, KH :: KT, V1] {}

    implicit def deriveHNil[T, K, V](implicit head: Exists[T, K, V]): ExistsMany[T, K :: HNil, V] =
      new ExistsMany[T, K :: HNil, V] {}
  }

  object Exists {
    def apply[T, V](column: Witness)(implicit e: Exists[T, column.T, V]): Exists[T, column.T, V] = e

    implicit def deriveRecord[T, H <: HList, K, V]
      (implicit
        i0: LabelledGeneric.Aux[T, H],
        i1: Selector.Aux[H, K, V]
      ): Exists[T, K, V] = new Exists[T, K, V] {}
  }

  /**
    * {{{
    * import frameless.TypedColumn
    * 
    * case class Foo(id: Int, bar: String)
    * 
    * val colbar: TypedColumn[Foo, String] = TypedColumn { foo: Foo => foo.bar }
    * val colid = TypedColumn[Foo, Int](_.id)
    * }}}
    */
  def apply[T, U](x: T => U): TypedColumn[T, U] =
    macro TypedColumnMacroImpl.applyImpl[T, U]

}


================================================
FILE: dataset/src/main/scala/frameless/TypedColumnMacroImpl.scala
================================================
package frameless

import scala.reflect.macros.whitebox

private[frameless] object TypedColumnMacroImpl {

  def applyImpl[T: c.WeakTypeTag, U: c.WeakTypeTag](c: whitebox.Context)(x: c.Tree): c.Expr[TypedColumn[T, U]] = {
    import c.universe._

    val t = c.weakTypeOf[T]
    val u = c.weakTypeOf[U]

    def buildExpression(path: List[String]): c.Expr[TypedColumn[T, U]] = {
      val columnName = path.mkString(".")

      c.Expr[TypedColumn[T, U]](q"new _root_.frameless.TypedColumn[$t, $u]((org.apache.spark.sql.functions.col($columnName)).expr)")
    }

    def abort(msg: String) = c.abort(c.enclosingPosition, msg)

    @annotation.tailrec
    def path(in: Select, out: List[TermName]): List[TermName] =
      in.qualifier match {
        case sub: Select =>
          path(sub, in.name.toTermName :: out)

        case id: Ident =>
          id.name.toTermName :: in.name.toTermName :: out

        case u =>
          abort(s"Unsupported selection: $u")
      }

    @annotation.tailrec
    def check(current: Type, in: List[TermName]): Boolean = in match {
      case next :: tail => {
        val sym = current.decl(next).asTerm

        if (!sym.isStable) {
          abort(s"Stable term expected: ${current}.${next}")
        }

        check(sym.info, tail)
      }

      case _ =>
        true
    }

    x match {
      case fn: Function => fn.body match {
        case select: Select if select.name.isTermName =>
          val expectedRoot: Option[String] = fn.vparams match {
            case List(rt) if rt.rhs == EmptyTree =>
              Option.empty[String]

            case List(rt) =>
              Some(rt.toString)

            case u =>
              abort(s"Select expression must have a single parameter: ${u mkString ", "}")
          }

          path(select, List.empty) match {
            case root :: tail if (
              expectedRoot.forall(_ == root) && check(t, tail)) => {
              val colPath = tail.mkString(".")

              c.Expr[TypedColumn[T, U]](q"new _root_.frameless.TypedColumn[$t, $u]((org.apache.spark.sql.functions.col($colPath)).expr)")
            }

            case _ =>
              abort(s"Invalid select expression: $select")
          }

        case t =>
          abort(s"Select expression expected: $t")
      }

      case _ =>
        abort(s"Function expected: $x")
    }
  }
}


================================================
FILE: dataset/src/main/scala/frameless/TypedDataset.scala
================================================
package frameless

import java.util
import frameless.functions.CatalystExplodableCollection
import frameless.ops._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Column, DataFrame, Dataset, FramelessInternals, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Literal}
import org.apache.spark.sql.catalyst.plans.logical.{Join, JoinHint}
import org.apache.spark.sql.catalyst.plans.Inner
import org.apache.spark.sql.types.StructType
import shapeless._
import shapeless.labelled.FieldType
import shapeless.ops.hlist.{Diff, IsHCons, Mapper, Prepend, ToTraversable, Tupler}
import shapeless.ops.record.{Keys, Modifier, Remover, Values}

import scala.language.experimental.macros

/** [[TypedDataset]] is a safer interface for working with `Dataset`.
  *
  * NOTE: Prefer `TypedDataset.create` over `new TypedDataset` unless you
  * know what you are doing.
  *
  * Documentation marked "apache/spark" is thanks to apache/spark Contributors
  * at https://github.com/apache/spark, licensed under Apache v2.0 available at
  * http://www.apache.org/licenses/LICENSE-2.0
  */
class TypedDataset[T] protected[frameless](val dataset: Dataset[T])(implicit val encoder: TypedEncoder[T])
    extends TypedDatasetForwarded[T] { self =>

  private implicit val spark: SparkSession = dataset.sparkSession

  /** Aggregates on the entire Dataset without groups.
    *
    * apache/spark
    */
  def agg[A](ca: TypedAggregate[T, A]): TypedDataset[A] = {
    implicit val ea = ca.uencoder
    val tuple1: TypedDataset[Tuple1[A]] = aggMany(ca)

    // now we need to unpack `Tuple1[A]` to `A`
    TypedEncoder[A].catalystRepr match {
      case StructType(_) =>
        // if column is struct, we use all its fields
        val df = tuple1
          .dataset
          .selectExpr("_1.*")
          .as[A](TypedExpressionEncoder[A])

        TypedDataset.create(df)
      case other =>
        // for primitive types `Tuple1[A]` has the same schema as `A`
        TypedDataset.create(tuple1.dataset.as[A](TypedExpressionEncoder[A]))
    }
  }

  /** Aggregates on the entire Dataset without groups.
    *
    * apache/spark
    */
  def agg[A, B](
    ca: TypedAggregate[T, A],
    cb: TypedAggregate[T, B]
  ): TypedDataset[(A, B)] = {
    implicit val (ea, eb) = (ca.uencoder, cb.uencoder)
    aggMany(ca, cb)
  }

  /** Aggregates on the entire Dataset without groups.
    *
    * apache/spark
    */
  def agg[A, B, C](
    ca: TypedAggregate[T, A],
    cb: TypedAggregate[T, B],
    cc: TypedAggregate[T, C]
  ): TypedDataset[(A, B, C)] = {
    implicit val (ea, eb, ec) = (ca.uencoder, cb.uencoder, cc.uencoder)
    aggMany(ca, cb, cc)
  }

  /** Aggregates on the entire Dataset without groups.
    *
    * apache/spark
    */
  def agg[A, B, C, D](
    ca: TypedAggregate[T, A],
    cb: TypedAggregate[T, B],
    cc: TypedAggregate[T, C],
    cd: TypedAggregate[T, D]
  ): TypedDataset[(A, B, C, D)] = {
    implicit val (ea, eb, ec, ed) = (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder)
    aggMany(ca, cb, cc, cd)
  }

  /** Aggregates on the entire Dataset without groups.
    *
    * apache/spark
    */
  object aggMany extends ProductArgs {
    def applyProduct[U <: HList, Out0 <: HList, Out](columns: U)
      (implicit
        i0: AggregateTypes.Aux[T, U, Out0],
        i1: ToTraversable.Aux[U, List, UntypedExpression[T]],
        i2: Tupler.Aux[Out0, Out],
        i3: TypedEncoder[Out]
      ): TypedDataset[Out] = {

      val underlyingColumns = columns.toList[UntypedExpression[T]]
      val cols: Seq[Column] = for {
        (c, i) <- columns.toList[UntypedExpression[T]].zipWithIndex
      } yield new Column(c.expr).as(s"_${i+1}")

      // Workaround to SPARK-20346. One alternative is to allow the result to be Vector(null) for empty DataFrames.
      // Another one would be to return an Option.
      val filterStr = (
        for {
          (c, i) <- underlyingColumns.zipWithIndex
          if !c.uencoder.nullable
        } yield s"_${i+1} is not null"
        ).mkString(" or ")

      val selected = dataset.toDF().agg(cols.head, cols.tail:_*).as[Out](TypedExpressionEncoder[Out])
      TypedDataset.create[Out](if (filterStr.isEmpty) selected else selected.filter(filterStr))
    }
  }

  /** Returns a new [[TypedDataset]] where each record has been mapped on to the specified type. */
  def as[U]()(implicit as: As[T, U]): TypedDataset[U] = {
    implicit val uencoder = as.encoder
    TypedDataset.create(dataset.as[U](TypedExpressionEncoder[U]))
  }

  /** Returns a checkpointed version of this [[TypedDataset]]. Checkpointing can be used to truncate the
    * logical plan of this Dataset, which is especially useful in iterative algorithms where the
    * plan may grow exponentially. It will be saved to files inside the checkpoint
    * directory set with `SparkContext#setCheckpointDir`.
    *
    * Differs from `Dataset#checkpoint` by wrapping its result into an effect-suspending `F[_]`.
    *
    * apache/spark
    */
  def checkpoint[F[_]](eager: Boolean)(implicit F: SparkDelay[F]): F[TypedDataset[T]] =
    F.delay(TypedDataset.create[T](dataset.checkpoint(eager)))

  /** Returns a new [[TypedDataset]] where each record has been mapped on to the specified type.
    * Unlike `as` the projection U may include a subset of the columns of T and the column names and types must agree.
    *
    * {{{
    *   case class Foo(i: Int, j: String)
    *   case class Bar(j: String)
    *
    *   val t: TypedDataset[Foo] = ...
    *   val b: TypedDataset[Bar] = t.project[Bar]
    *
    *   case class BarErr(e: String)
    *   // The following does not compile because `Foo` doesn't have a field with name `e`
    *   val e: TypedDataset[BarErr] = t.project[BarErr]
    * }}}
    */
  def project[U](implicit projector: SmartProject[T,U]): TypedDataset[U] = projector.apply(this)

  /** Returns a new [[TypedDataset]] that contains the elements of both this and the `other` [[TypedDataset]]
    * combined.
    *
    * Note that, this function is not a typical set union operation, in that it does not eliminate
    * duplicate items. As such, it is analogous to `UNION ALL` in SQL.
    *
    * Differs from `Dataset#union` by aligning fields if possible.
    * It will not compile if `Datasets` have not compatible schema.
    *
    * Example:
    * {{{
    *   case class Foo(x: Int, y: Long)
    *   case class Bar(y: Long, x: Int)
    *   case class Faz(x: Int, y: Int, z: Int)
    *
    *   foo: TypedDataset[Foo] = ...
    *   bar: TypedDataset[Bar] = ...
    *   faz: TypedDataset[Faz] = ...
    *
    *   foo union bar: TypedDataset[Foo]
    *   foo union faz: TypedDataset[Foo]
    *   // won't compile, you need to reverse order, you can't project from less fields to more
    *   faz union foo
    *
    * }}}
    *
    * apache/spark
    */
  def union[U: TypedEncoder](other: TypedDataset[U])(implicit projector: SmartProject[U, T]): TypedDataset[T] =
    TypedDataset.create(dataset.union(other.project[T].dataset))

  /** Returns a new [[TypedDataset]] that contains the elements of both this and the `other` [[TypedDataset]]
    * combined.
    *
    * Note that, this function is not a typical set union operation, in that it does not eliminate
    * duplicate items. As such, it is analogous to `UNION ALL` in SQL.
    *
    * apache/spark
    */
  def union(other: TypedDataset[T]): TypedDataset[T] = {
    TypedDataset.create(dataset.union(other.dataset))
  }

  /** Returns the number of elements in the [[TypedDataset]].
    *
    * Differs from `Dataset#count` by wrapping its result into an effect-suspending `F[_]`.
    */
  def count[F[_]]()(implicit F: SparkDelay[F]): F[Long] =
    F.delay(dataset.count())

  /** Returns `TypedColumn` of type `A` given its name (alias for `col`).
    *
    * {{{
    * tf('id)
    * }}}
    *
    * It is statically checked that column with such name exists and has type `A`.
    */
  def apply[A](column: Witness.Lt[Symbol])
    (implicit
      i0: TypedColumn.Exists[T, column.T, A],
      i1: TypedEncoder[A]
    ): TypedColumn[T, A] = col(column)

  /** Returns `TypedColumn` of type `A` given its name.
    *
    * {{{
    * tf.col('id)
    * }}}
    *
    * It is statically checked that column with such name exists and has type `A`.
    */
  def col[A](column: Witness.Lt[Symbol])
    (implicit
      i0: TypedColumn.Exists[T, column.T, A],
      i1: TypedEncoder[A]
    ): TypedColumn[T, A] =
      new TypedColumn[T, A](dataset(column.value.name).as[A](TypedExpressionEncoder[A]))

  /** Returns `TypedColumn` of type `A` given a lambda indicating the field.
   *
   * {{{
   *   td.col(_.id)
   * }}}
   *
   * It is statically checked that column with such name exists and has type `A`.
   */
  def col[A](x: Function1[T, A]): TypedColumn[T, A] =
    macro TypedColumnMacroImpl.applyImpl[T, A]

  /** Projects the entire `TypedDataset[T]` into a single column of type `TypedColumn[T,T]`.
    * {{{
    *   ts: TypedDataset[Foo] = ...
    *   ts.select(ts.asCol, ts.asCol): TypedDataset[(Foo,Foo)]
    * }}}
    */
  def asCol: TypedColumn[T, T] = {
    val projectedColumn: Column = encoder.catalystRepr match {
      case StructType(_) =>
        val allColumns: Array[Column] = dataset.columns.map(dataset.col)
        org.apache.spark.sql.functions.struct(allColumns.toSeq: _*)

      case _ =>
        dataset.col(dataset.columns.head)
    }
    
    new TypedColumn[T,T](projectedColumn)
  }

  /** References the entire `TypedDataset[T]` as a single column 
    * of type `TypedColumn[T,T]` so it can be used in a join operation.
    * 
    * {{{
    * def nameJoin(ds1: TypedDataset[Person], ds2: TypedDataset[Name]) =
    *   ds1.joinLeftSemi(ds2)(ds1.col('name) === ds2.asJoinColValue)
    * }}}
    */
  def asJoinColValue(implicit i0: IsValueClass[T]): TypedColumn[T, T] = {
    import _root_.frameless.syntax._

    dataset.col("value").typedColumn
  }

  object colMany extends SingletonProductArgs {
    def applyProduct[U <: HList, Out](columns: U)
      (implicit
        i0: TypedColumn.ExistsMany[T, U, Out],
        i1: TypedEncoder[Out],
        i2: ToTraversable.Aux[U, List, Symbol]
      ): TypedColumn[T, Out] = {
        val names = columns.toList[Symbol].map(_.name)
        val colExpr = FramelessInternals.resolveExpr(dataset, names)
        new TypedColumn[T, Out](colExpr)
      }
  }

  /** Right hand side disambiguation of `col` for join expressions.
    * To be used  when writting self-joins, noop in other circumstances.
    *
    * Note: In vanilla Spark, disambiguation in self-joins is acheaved using
    * String based aliases, which is obviously unsafe.
    */
  def colRight[A](column: Witness.Lt[Symbol])
    (implicit
      i0: TypedColumn.Exists[T, column.T, A],
      i1: TypedEncoder[A]
    ): TypedColumn[T, A] =
      new TypedColumn[T, A](FramelessInternals.DisambiguateRight(col(column).expr))

  /** Left hand side disambiguation of `col` for join expressions.
    * To be used  when writting self-joins, noop in other circumstances.
    *
    * Note: In vanilla Spark, disambiguation in self-joins is acheaved using
    * String based aliases, which is obviously unsafe.
    */
  def colLeft[A](column: Witness.Lt[Symbol])
    (implicit
      i0: TypedColumn.Exists[T, column.T, A],
      i1: TypedEncoder[A]
    ): TypedColumn[T, A] =
      new TypedColumn[T, A](FramelessInternals.DisambiguateLeft(col(column).expr))

  /** Returns a `Seq` that contains all the elements in this [[TypedDataset]].
    *
    * Running this operation requires moving all the data into the application's driver process, and
    * doing so on a very large [[TypedDataset]] can crash the driver process with OutOfMemoryError.
    *
    * Differs from `Dataset#collect` by wrapping its result into an effect-suspending `F[_]`.
    */
  def collect[F[_]]()(implicit F: SparkDelay[F]): F[Seq[T]] =
    F.delay(dataset.collect().toSeq)

  /** Optionally returns the first element in this [[TypedDataset]].
    *
    * Differs from `Dataset#first` by wrapping its result into an `Option` and an effect-suspending `F[_]`.
    */
  def firstOption[F[_]]()(implicit F: SparkDelay[F]): F[Option[T]] =
    F.delay {
      try {
        Option(dataset.first())
      } catch {
        case e: NoSuchElementException => None
      }
    }

  /** Returns the first `num` elements of this [[TypedDataset]] as a `Seq`.
    *
    * Running take requires moving data into the application's driver process, and doing so with
    * a very large `num` can crash the driver process with OutOfMemoryError.
    *
    * Differs from `Dataset#take` by wrapping its result into an effect-suspending `F[_]`.
    *
    * apache/spark
    */
  def take[F[_]](num: Int)(implicit F: SparkDelay[F]): F[Seq[T]] =
    F.delay(dataset.take(num).toSeq)

  /** Return an iterator that contains all rows in this [[TypedDataset]].
    *
    * The iterator will consume as much memory as the largest partition in this [[TypedDataset]].
    *
    * NOTE: this results in multiple Spark jobs, and if the input [[TypedDataset]] is the result
    * of a wide transformation (e.g. join with different partitioners), to avoid
    * recomputing the input [[TypedDataset]] should be cached first.
    *
    * Differs from `Dataset#toLocalIterator()` by wrapping its result into an effect-suspending `F[_]`.
    *
    * apache/spark
    */
  def toLocalIterator[F[_]]()(implicit F: SparkDelay[F]): F[util.Iterator[T]] =
    F.delay(dataset.toLocalIterator())

  /** Alias for firstOption().
    */
  def headOption[F[_]]()(implicit F: SparkDelay[F]): F[Option[T]] = firstOption()

  /** Alias for take().
    */
  def head[F[_]](num: Int)(implicit F: SparkDelay[F]): F[Seq[T]] = take(num)

  // $COVERAGE-OFF$
  /** Alias for firstOption().
    */
  @deprecated("Method may throw exception. Use headOption or firstOption instead.", "0.5.0")
  def head: T = dataset.head()

  /** Alias for firstOption().
    */
  @deprecated("Method may throw exception. Use headOption or firstOption instead.", "0.5.0")
  def first: T = dataset.head()
  // $COVERAGE-ONN$

  /** Displays the content of this [[TypedDataset]] in a tabular form. Strings more than 20 characters
    * will be truncated, and all cells will be aligned right. For example:
    * {{{
    *   year  month AVG('Adj Close) MAX('Adj Close)
    *   1980  12    0.503218        0.595103
    *   1981  01    0.523289        0.570307
    *   1982  02    0.436504        0.475256
    *   1983  03    0.410516        0.442194
    *   1984  04    0.450090        0.483521
    * }}}
    * @param numRows Number of rows to show
    * @param truncate Whether truncate long strings. If true, strings more than 20 characters will
    *   be truncated and all cells will be aligned right
    *
    * Differs from `Dataset#show` by wrapping its result into an effect-suspending `F[_]`.
    *
    * apache/spark
    */
  def show[F[_]](numRows: Int = 20, truncate: Boolean = true)(implicit F: SparkDelay[F]): F[Unit] =
    F.delay(dataset.show(numRows, truncate))

  /** Returns a new [[frameless.TypedDataset]] that only contains elements where `column` is `true`.
    *
    * Differs from `TypedDatasetForward#filter` by taking a `TypedColumn[T, Boolean]` instead of a
    * `T => Boolean`. Using a column expression instead of a regular function save one Spark → Scala
    * deserialization which leads to better performance.
    */
  def filter(column: TypedColumn[T, Boolean]): TypedDataset[T] = {
    val filtered = dataset.toDF()
      .filter(column.untyped)
      .as[T](TypedExpressionEncoder[T])

    TypedDataset.create[T](filtered)
  }

  /** Runs `func` on each element of this [[TypedDataset]].
    *
    * Differs from `Dataset#foreach` by wrapping its result into an effect-suspending `F[_]`.
    */
  def foreach[F[_]](func: T => Unit)(implicit F: SparkDelay[F]): F[Unit] =
    F.delay(dataset.foreach(func))

  /** Runs `func` on each partition of this [[TypedDataset]].
    *
    * Differs from `Dataset#foreachPartition` by wrapping its result into an effect-suspending `F[_]`.
    */
  def foreachPartition[F[_]](func: Iterator[T] => Unit)(implicit F: SparkDelay[F]): F[Unit] =
    F.delay(dataset.foreachPartition(func))

  /**
    * Create a multi-dimensional cube for the current [[TypedDataset]] using the specified column,
    * so we can run aggregation on it.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`.
    *
    * apache/spark
    */
  def cube[K1](
    c1: TypedColumn[T, K1]
  ): Cube1Ops[K1, T] = new Cube1Ops[K1, T](this, c1)

  /**
    * Create a multi-dimensional cube for the current [[TypedDataset]] using the specified columns,
    * so we can run aggregation on them.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`.
    *
    * apache/spark
    */
  def cube[K1, K2](
    c1: TypedColumn[T, K1],
    c2: TypedColumn[T, K2]
  ): Cube2Ops[K1, K2, T] = new Cube2Ops[K1, K2, T](this, c1, c2)

  /**
    * Create a multi-dimensional cube for the current [[TypedDataset]] using the specified columns,
    * so we can run aggregation on them.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * {{{
    *   case class MyClass(a: Int, b: Int, c: Int)
    *   val ds: TypedDataset[MyClass]

    *   val cubeDataset: TypedDataset[(Option[A], Option[B], Long)] =
    *     ds.cubeMany(ds('a), ds('b)).agg(count[MyClass]())
    *
    *   // original dataset:
    *     a       b     c
    *    10      20     1
    *    15      25     2
    *
    *   // after aggregation:
    *     _1      _2   _3
    *     15    null    1
    *     15      25    1
    *   null    null    2
    *   null      25    1
    *   null      20    1
    *     10    null    1
    *     10      20    1
    *
    * }}}
    *
    * Differs from `Dataset#cube` by wrapping values into `Option` instead of returning `null`.
    *
    * apache/spark
    */
  object cubeMany extends ProductArgs {
    def applyProduct[TK <: HList, K <: HList, KT](groupedBy: TK)
      (implicit
        i0: ColumnTypes.Aux[T, TK, K],
        i1: Tupler.Aux[K, KT],
        i2: ToTraversable.Aux[TK, List, UntypedExpression[T]]
      ): CubeManyOps[T, TK, K, KT] = new CubeManyOps[T, TK, K, KT](self, groupedBy)
  }

  /**
    * Groups the [[TypedDataset]] using the specified columns, so that we can run aggregation on them.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * apache/spark
    */
  def groupBy[K1](
    c1: TypedColumn[T, K1]
  ): GroupedBy1Ops[K1, T] = new GroupedBy1Ops[K1, T](this, c1)

  /**
    * Groups the [[TypedDataset]] using the specified columns, so that we can run aggregation on them.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * apache/spark
    */
  def groupBy[K1, K2](
    c1: TypedColumn[T, K1],
    c2: TypedColumn[T, K2]
  ): GroupedBy2Ops[K1, K2, T] = new GroupedBy2Ops[K1, K2, T](this, c1, c2)

  /**
    * Groups the [[TypedDataset]] using the specified columns, so that we can run aggregation on them.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * {{{
    *   case class MyClass(a: Int, b: Int, c: Int)
    *   val ds: TypedDataset[MyClass]
    *
    *   val cubeDataset: TypedDataset[(Option[A], Option[B], Long)] =
    *     ds.groupByMany(ds('a), ds('b)).agg(count[MyClass]())
    *
    *   // original dataset:
    *     a       b     c
    *    10      20     1
    *    15      25     2
    *
    *   // after aggregation:
    *     _1      _2   _3
    *     10      20    1
    *     15      25    1
    *
    * }}}
    *
    * apache/spark
    */
  object groupByMany extends ProductArgs {
    def applyProduct[TK <: HList, K <: HList, KT](groupedBy: TK)
      (implicit
        i0: ColumnTypes.Aux[T, TK, K],
        i1: Tupler.Aux[K, KT],
        i2: ToTraversable.Aux[TK, List, UntypedExpression[T]]
      ): GroupedByManyOps[T, TK, K, KT] = new GroupedByManyOps[T, TK, K, KT](self, groupedBy)
  }

  /**
    * Create a multi-dimensional rollup for the current [[TypedDataset]] using the specified column,
    * so we can run aggregation on it.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`.
    *
    * apache/spark
    */
  def rollup[K1](
    c1: TypedColumn[T, K1]
  ): Rollup1Ops[K1, T] = new Rollup1Ops[K1, T](this, c1)

  /**
    * Create a multi-dimensional rollup for the current [[TypedDataset]] using the specified columns,
    * so we can run aggregation on them.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`.
    *
    * apache/spark
    */
  def rollup[K1, K2](
    c1: TypedColumn[T, K1],
    c2: TypedColumn[T, K2]
  ): Rollup2Ops[K1, K2, T] = new Rollup2Ops[K1, K2, T](this, c1, c2)

  /**
    * Create a multi-dimensional rollup for the current [[TypedDataset]] using the specified columns,
    * so we can run aggregation on them.
    * See [[frameless.functions.AggregateFunctions]] for all the available aggregate functions.
    *
    * {{{
    *   case class MyClass(a: Int, b: Int, c: Int)
    *   val ds: TypedDataset[MyClass]
    *
    *   val cubeDataset: TypedDataset[(Option[A], Option[B], Long)] =
    *     ds.rollupMany(ds('a), ds('b)).agg(count[MyClass]())
    *
    *   // original dataset:
    *     a       b     c
    *    10      20     1
    *    15      25     2
    *
    *   // after aggregation:
    *     _1      _2   _3
    *     15    null    1
    *     15      25    1
    *   null    null    2
    *     10    null    1
    *     10      20    1
    *
    * }}}
    *
    * Differs from `Dataset#rollup` by wrapping values into `Option` instead of returning `null`.
    *
    * apache/spark
    */
  object rollupMany extends ProductArgs {
    def applyProduct[TK <: HList, K <: HList, KT](groupedBy: TK)
      (implicit
        i0: ColumnTypes.Aux[T, TK, K],
        i1: Tupler.Aux[K, KT],
        i2: ToTraversable.Aux[TK, List, UntypedExpression[T]]
      ): RollupManyOps[T, TK, K, KT] = new RollupManyOps[T, TK, K, KT](self, groupedBy)
  }

  /** Computes the cartesian project of `this` `Dataset` with the `other` `Dataset` */
  def joinCross[U](other: TypedDataset[U])
    (implicit e: TypedEncoder[(T, U)]): TypedDataset[(T, U)] =
      new TypedDataset(self.dataset.joinWith(other.dataset, new Column(Literal(true)), "cross"))

  /** Computes the full outer join of `this` `Dataset` with the `other` `Dataset`,
    * returning a `Tuple2` for each pair where condition evaluates to true.
    */
  def joinFull[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean])
    (implicit e: TypedEncoder[(Option[T], Option[U])]): TypedDataset[(Option[T], Option[U])] =
    new TypedDataset(self.dataset.joinWith(other.dataset, condition.untyped, "full")
      .as[(Option[T], Option[U])](TypedExpressionEncoder[(Option[T], Option[U])]))

  /** Computes the inner join of `this` `Dataset` with the `other` `Dataset`,
    * returning a `Tuple2` for each pair where condition evaluates to true.
    */
  def joinInner[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean])
    (implicit e: TypedEncoder[(T, U)]): TypedDataset[(T, U)] = {
      import FramelessInternals._

      val leftPlan = logicalPlan(dataset)
      val rightPlan = logicalPlan(other.dataset)
      val join = disambiguate(Join(leftPlan, rightPlan, Inner, Some(condition.expr), JoinHint.NONE))
      val joinedPlan = joinPlan(dataset, join, leftPlan, rightPlan)
      val joinedDs = mkDataset(dataset.sqlContext, joinedPlan, TypedExpressionEncoder[(T, U)])

      TypedDataset.create[(T, U)](joinedDs)
    }

  /** Computes the left outer join of `this` `Dataset` with the `other` `Dataset`,
    * returning a `Tuple2` for each pair where condition evaluates to true.
    */
  def joinLeft[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean])
    (implicit e: TypedEncoder[(T, Option[U])]): TypedDataset[(T, Option[U])] =
      new TypedDataset(self.dataset.joinWith(other.dataset, condition.untyped, "left_outer")
        .as[(T, Option[U])](TypedExpressionEncoder[(T, Option[U])]))

  /** Computes the left semi join of `this` `Dataset` with the `other` `Dataset`,
    * returning a `Tuple2` for each pair where condition evaluates to true.
    */
  def joinLeftSemi[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean]): TypedDataset[T] =
    new TypedDataset(self.dataset.join(other.dataset, condition.untyped, "leftsemi")
      .as[T](TypedExpressionEncoder(encoder)))

  /** Computes the left anti join of `this` `Dataset` with the `other` `Dataset`,
    * returning a `Tuple2` for each pair where condition evaluates to true.
    */
  def joinLeftAnti[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean]): TypedDataset[T] =
    new TypedDataset(self.dataset.join(other.dataset, condition.untyped, "leftanti")
      .as[T](TypedExpressionEncoder(encoder)))

  /** Computes the right outer join of `this` `Dataset` with the `other` `Dataset`,
    * returning a `Tuple2` for each pair where condition evaluates to true.
    */
  def joinRight[U](other: TypedDataset[U])(condition: TypedColumn[T with U, Boolean])
    (implicit e: TypedEncoder[(Option[T], U)]): TypedDataset[(Option[T], U)] =
    new TypedDataset(self.dataset.joinWith(other.dataset, condition.untyped, "right_outer")
      .as[(Option[T], U)](TypedExpressionEncoder[(Option[T], U)]))

  private def disambiguate(join: Join): Join = {
    val plan = FramelessInternals.ofRows(dataset.sparkSession, join).queryExecution.analyzed.asInstanceOf[Join]
    val disambiguated = plan.condition.map(_.transform {
      case FramelessInternals.DisambiguateLeft(tagged: AttributeReference) =>
        val leftDs = FramelessInternals.ofRows(spark, plan.left)
        FramelessInternals.resolveExpr(leftDs, Seq(tagged.name))

      case FramelessInternals.DisambiguateRight(tagged: AttributeReference) =>
        val rightDs = FramelessInternals.ofRows(spark, plan.right)
        FramelessInternals.resolveExpr(rightDs, Seq(tagged.name))

      case x => x
    })
    plan.copy(condition = disambiguated)
  }

  /** Takes a function from A => R and converts it to a UDF for TypedColumn[T, A] => TypedColumn[T, R].
    */
  def makeUDF[A: TypedEncoder, R: TypedEncoder](f: A => R):
  TypedColumn[T, A] => TypedColumn[T, R] = functions.udf(f)

  /** Takes a function from (A1, A2) => R and converts it to a UDF for
    * (TypedColumn[T, A1], TypedColumn[T, A2]) => TypedColumn[T, R].
    */
  def makeUDF[A1: TypedEncoder, A2: TypedEncoder, R: TypedEncoder](f: (A1, A2) => R):
  (TypedColumn[T, A1], TypedColumn[T, A2]) => TypedColumn[T, R] = functions.udf(f)

  /** Takes a function from (A1, A2, A3) => R and converts it to a UDF for
    * (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3]) => TypedColumn[T, R].
    */
  def makeUDF[A1: TypedEncoder, A2: TypedEncoder, A3: TypedEncoder, R: TypedEncoder](f: (A1, A2, A3) => R):
  (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3]) => TypedColumn[T, R] = functions.udf(f)

  /** Takes a function from (A1, A2, A3, A4) => R and converts it to a UDF for
    * (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4]) => TypedColumn[T, R].
    */
  def makeUDF[A1: TypedEncoder, A2: TypedEncoder, A3: TypedEncoder, A4: TypedEncoder, R: TypedEncoder](f: (A1, A2, A3, A4) => R):
  (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4]) => TypedColumn[T, R] = functions.udf(f)

  /** Takes a function from (A1, A2, A3, A4, A5) => R and converts it to a UDF for
    * (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4], TypedColumn[T, A5]) => TypedColumn[T, R].
    */
  def makeUDF[A1: TypedEncoder, A2: TypedEncoder, A3: TypedEncoder, A4: TypedEncoder, A5: TypedEncoder, R: TypedEncoder](f: (A1, A2, A3, A4, A5) => R):
  (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4], TypedColumn[T, A5]) => TypedColumn[T, R] = functions.udf(f)

  /** Type-safe projection from type T to Tuple1[A]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A](
    ca: TypedColumn[T, A]
  ): TypedDataset[A] = {
    implicit val ea = ca.uencoder

    val tuple1: TypedDataset[Tuple1[A]] = selectMany(ca)

    // now we need to unpack `Tuple1[A]` to `A`

    TypedEncoder[A].catalystRepr match {
      case StructType(_) =>
        // if column is struct, we use all its fields
        val df = tuple1
          .dataset
          .selectExpr("_1.*")
          .as[A](TypedExpressionEncoder[A])

        TypedDataset.create(df)
      case other =>
        // for primitive types `Tuple1[A]` has the same schema as `A`
        TypedDataset.create(tuple1.dataset.as[A](TypedExpressionEncoder[A]))
    }
  }

  /** Type-safe projection from type T to Tuple2[A,B]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B]
  ): TypedDataset[(A, B)] = {
    implicit val (ea, eb) = (ca.uencoder, cb.uencoder)
    selectMany(ca, cb)
  }

  /** Type-safe projection from type T to Tuple3[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C]
  ): TypedDataset[(A, B, C)] = {
    implicit val (ea, eb, ec) = (ca.uencoder, cb.uencoder, cc.uencoder)
    selectMany(ca, cb, cc)
  }

  /** Type-safe projection from type T to Tuple4[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C, D](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D]
  ): TypedDataset[(A, B, C, D)] = {
    implicit val (ea, eb, ec, ed) = (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder)
    selectMany(ca, cb, cc, cd)
  }

  /** Type-safe projection from type T to Tuple5[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C, D, E](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E]
  ): TypedDataset[(A, B, C, D, E)] = {
    implicit val (ea, eb, ec, ed, ee) =
      (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder, ce.uencoder)

    selectMany(ca, cb, cc, cd, ce)
  }

  /** Type-safe projection from type T to Tuple6[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C, D, E, F](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E],
    cf: TypedColumn[T, F]
  ): TypedDataset[(A, B, C, D, E, F)] = {
    implicit val (ea, eb, ec, ed, ee, ef) =
      (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder, ce.uencoder, cf.uencoder)

    selectMany(ca, cb, cc, cd, ce, cf)
  }

  /** Type-safe projection from type T to Tuple7[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C, D, E, F, G](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E],
    cf: TypedColumn[T, F],
    cg: TypedColumn[T, G]
  ): TypedDataset[(A, B, C, D, E, F, G)] = {
    implicit val (ea, eb, ec, ed, ee, ef, eg) =
      (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder, ce.uencoder, cf.uencoder, cg.uencoder)

    selectMany(ca, cb, cc, cd, ce, cf, cg)
  }

  /** Type-safe projection from type T to Tuple8[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C, D, E, F, G, H](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E],
    cf: TypedColumn[T, F],
    cg: TypedColumn[T, G],
    ch: TypedColumn[T, H]
  ): TypedDataset[(A, B, C, D, E, F, G, H)] = {
    implicit val (ea, eb, ec, ed, ee, ef, eg, eh) =
      (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder, ce.uencoder, cf.uencoder, cg.uencoder, ch.uencoder)

    selectMany(ca, cb, cc, cd, ce, cf, cg, ch)
  }

  /** Type-safe projection from type T to Tuple9[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C, D, E, F, G, H, I](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E],
    cf: TypedColumn[T, F],
    cg: TypedColumn[T, G],
    ch: TypedColumn[T, H],
    ci: TypedColumn[T, I]
  ): TypedDataset[(A, B, C, D, E, F, G, H, I)] = {
    implicit val (ea, eb, ec, ed, ee, ef, eg, eh, ei) =
       (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder, ce.uencoder, cf.uencoder, cg.uencoder, ch.uencoder, ci.uencoder)

    selectMany(ca, cb, cc, cd, ce, cf, cg, ch, ci)
  }

  /** Type-safe projection from type T to Tuple10[A,B,...]
    * {{{
    *   d.select( d('a), d('a)+d('b), ... )
    * }}}
    */
  def select[A, B, C, D, E, F, G, H, I, J](
    ca: TypedColumn[T, A],
    cb: TypedColumn[T, B],
    cc: TypedColumn[T, C],
    cd: TypedColumn[T, D],
    ce: TypedColumn[T, E],
    cf: TypedColumn[T, F],
    cg: TypedColumn[T, G],
    ch: TypedColumn[T, H],
    ci: TypedColumn[T, I],
    cj: TypedColumn[T, J]
  ): TypedDataset[(A, B, C, D, E, F, G, H, I, J)] = {
    implicit val (ea, eb, ec, ed, ee, ef, eg, eh, ei, ej) =
      (ca.uencoder, cb.uencoder, cc.uencoder, cd.uencoder, ce.uencoder, cf.uencoder, cg.uencoder, ch.uencoder, ci.uencoder, cj.uencoder)
    selectMany(ca, cb, cc, cd, ce, cf, cg, ch, ci, cj)
  }

  object selectMany extends ProductArgs {
    def applyProduct[U <: HList, Out0 <: HList, Out](columns: U)
      (implicit
        i0: ColumnTypes.Aux[T, U, Out0],
        i1: ToTraversable.Aux[U, List, UntypedExpression[T]],
        i2: Tupler.Aux[Out0, Out],
        i3: TypedEncoder[Out]
      ): TypedDataset[Out] = {
        val base = dataset.toDF()
          .select(columns.toList[UntypedExpression[T]].map(c => new Column(c.expr)):_*)
        val selected = base.as[Out](TypedExpressionEncoder[Out])

        TypedDataset.create[Out](selected)
      }
  }

  /** Sort each partition in the dataset using the columns selected. */
  def sortWithinPartitions[A: CatalystOrdered](ca: SortedTypedColumn[T, A]): TypedDataset[T] =
    sortWithinPartitionsMany(ca)

  /** Sort each partition in the dataset using the columns selected. */
  def sortWithinPartitions[A: CatalystOrdered, B: CatalystOrdered](
    ca: SortedTypedColumn[T, A],
    cb: SortedTypedColumn[T, B]
  ): TypedDataset[T] = sortWithinPartitionsMany(ca, cb)

  /** Sort each partition in the dataset using the columns selected. */
  def sortWithinPartitions[A: CatalystOrdered, B: CatalystOrdered, C: CatalystOrdered](
    ca: SortedTypedColumn[T, A],
    cb: SortedTypedColumn[T, B],
    cc: SortedTypedColumn[T, C]
  ): TypedDataset[T] = sortWithinPartitionsMany(ca, cb, cc)

  /** Sort each partition in the dataset by the given column expressions
    * Default sort order is ascending.
    * {{{
    *   d.sortWithinPartitionsMany(d('a), d('b).desc, d('c).asc)
    * }}}
    */
  object sortWithinPartitionsMany extends ProductArgs {
    def applyProduct[U <: HList, O <: HList](columns: U)
      (implicit
        i0: Mapper.Aux[SortedTypedColumn.defaultAscendingPoly.type, U, O],
        i1: ToTraversable.Aux[O, List, SortedTypedColumn[T, _]]
      ): TypedDataset[T] = {
      val sorted = dataset.toDF()
        .sortWithinPartitions(i0(columns).toList[SortedTypedColumn[T, _]].map(_.untyped):_*)
        .as[T](TypedExpressionEncoder[T])

      TypedDataset.create[T](sorted)
    }
  }

  /** Orders the TypedDataset using the column selected. */
  def orderBy[A: CatalystOrdered](ca: SortedTypedColumn[T, A]): TypedDataset[T] =
    orderByMany(ca)

  /** Orders the TypedDataset using the columns selected. */
  def orderBy[A: CatalystOrdered, B: CatalystOrdered](
    ca: SortedTypedColumn[T, A],
    cb: SortedTypedColumn[T, B]
  ): TypedDataset[T] = orderByMany(ca, cb)

 /** Orders the TypedDataset using the columns selected. */
 def orderBy[A: CatalystOrdered, B: CatalystOrdered, C: CatalystOrdered](
   ca: SortedTypedColumn[T, A],
   cb: SortedTypedColumn[T, B],
   cc: SortedTypedColumn[T, C]
 ): TypedDataset[T] = orderByMany(ca, cb, cc)

  /** Sort the dataset by any number of column expressions.
    * Default sort order is ascending.
    * {{{
    *   d.orderByMany(d('a), d('b).desc, d('c).asc)
    * }}}
    */
  object orderByMany extends ProductArgs {
    def applyProduct[U <: HList, O <: HList](columns: U)
      (implicit
        i0: Mapper.Aux[SortedTypedColumn.defaultAscendingPoly.type, U, O],
        i1: ToTraversable.Aux[O, List, SortedTypedColumn[T, _]]
      ): TypedDataset[T] = {
      val sorted = dataset.toDF()
        .orderBy(i0(columns).toList[SortedTypedColumn[T, _]].map(_.untyped):_*)
        .as[T](TypedExpressionEncoder[T])

      TypedDataset.create[T](sorted)
    }
  }

  /** Returns a new Dataset as a tuple with the specified
    * column dropped.
    * Does not allow for dropping from a single column TypedDataset
    *
    * {{{
    *   val d: TypedDataset[Foo(a: String, b: Int...)] = ???
    *   val result = TypedDataset[(Int, ...)] = d.drop('a)
    * }}}
    * @param column column to drop specified as a Symbol
    * @param i0 LabelledGeneric derived for T
    * @param i1 Remover derived for TRep and column
    * @param i2 values of T with column removed
    * @param i3 tupler of values
    * @param i4 evidence of encoder of the tupled values
    * @tparam Out Tupled return type
    * @tparam TRep shapeless' record representation of T
    * @tparam Removed record of T with column removed
    * @tparam ValuesFromRemoved values of T with column removed as an HList
    * @tparam V value type of column in T
    * @return
    */
  def dropTupled[Out, TRep <: HList, Removed <: HList, ValuesFromRemoved <: HList, V]
    (column: Witness.Lt[Symbol])
    (implicit
      i0: LabelledGeneric.Aux[T, TRep],
      i1: Remover.Aux[TRep, column.T, (V, Removed)],
      i2: Values.Aux[Removed, ValuesFromRemoved],
      i3: Tupler.Aux[ValuesFromRemoved, Out],
      i4: TypedEncoder[Out]
    ): TypedDataset[Out] = {
      val dropped = dataset
        .toDF()
        .drop(column.value.name)
        .as[Out](TypedExpressionEncoder[Out])

      TypedDataset.create[Out](dropped)
    }

  /**
    * Drops columns as necessary to return `U`
    *
    * @example
    * {{{
    *   case class X(i: Int, j: Int, k: Boolean)
    *   case class Y(i: Int, k: Boolean)
    *   val f: TypedDataset[X] = ???
    *   val fNew: TypedDataset[Y] = f.drop[Y]
    * }}}
    *
    * @tparam U the output type
    *
    * @see [[frameless.TypedDataset#project]]
    */
  def drop[U](implicit projector: SmartProject[T,U]): TypedDataset[U] = project[U]

  /** Prepends a new column to the Dataset.
    *
    * {{{
    *   case class X(i: Int, j: Int)
    *   val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil)
    *   val fNew: TypedDataset[(Int,Int,Boolean)] = f.withColumnTupled(f('j) === 10)
    * }}}
    */
  def withColumnTupled[A: TypedEncoder, H <: HList, FH <: HList, Out]
    (ca: TypedColumn[T, A])
    (implicit
      i0: Generic.Aux[T, H],
      i1: Prepend.Aux[H, A :: HNil, FH],
      i2: Tupler.Aux[FH, Out],
      i3: TypedEncoder[Out]
    ): TypedDataset[Out] = {
      // Giving a random name to the new column (the proper name will be given by the Tuple-based encoder)
      val selected = dataset.toDF().withColumn("I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMI", ca.untyped)
        .as[Out](TypedExpressionEncoder[Out])

      TypedDataset.create[Out](selected)
  }

  /** Returns a new [[frameless.TypedDataset]] with the specified column updated with a new value
    * {{{
    *   case class X(i: Int, j: Int)
    *   val f: TypedDataset[X] = TypedDataset.create(X(1,10) :: Nil)
    *   val fNew: TypedDataset[X] = f.withColumn('j, f('i)) // results in X(1, 1) :: Nil
    * }}}
    * @param column column given as a symbol to replace
    * @param replacement column to replace the value with
    * @param i0 Evidence that a column with the correct type and name exists
    */
  def withColumnReplaced[A](
    column: Witness.Lt[Symbol],
    replacement: TypedColumn[T, A]
  )(implicit
    i0: TypedColumn.Exists[T, column.T, A]
  ): TypedDataset[T] = {
    val updated = dataset.toDF().withColumn(column.value.name, replacement.untyped)
      .as[T](TypedExpressionEncoder[T])

    TypedDataset.create[T](updated)
  }

  /** Adds a column to a Dataset so long as the specified output type, `U`, has
    * an extra column from `T` that has type `A`.
    *
    * @example
    * {{{
    *   case class X(i: Int, j: Int)
    *   case class Y(i: Int, j: Int, k: Boolean)
    *   val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil)
    *   val fNew: TypedDataset[Y] = f.withColumn[Y](f('j) === 10)
    * }}}
    * @param ca The typed column to add
    * @param i0 TypeEncoder for output type U
    * @param i1 TypeEncoder for added column type A
    * @param i2 the LabelledGeneric derived for T
    * @param i3 the LabelledGeneric derived for U
    * @param i4 proof no fields have been removed
    * @param i5 diff from T to U
    * @param i6 keys from newFields
    * @param i7 the one and only new key
    * @param i8 the one and only new field enforcing the type of A exists
    * @param i9 the keys of U
    * @param iA allows for traversing the keys of U
    * @tparam U the output type
    * @tparam A The added column type
    * @tparam TRep shapeless' record representation of T
    * @tparam URep shapeless' record representation of U
    * @tparam UKeys the keys of U as an HList
    * @tparam NewFields the added fields to T to get U
    * @tparam NewKeys the keys of NewFields as an HList
    * @tparam NewKey the first, and only, key in NewKey
    *
    * @see [[frameless.TypedDataset.WithColumnApply#apply]]
    */
  def withColumn[U] = new WithColumnApply[U]

  class WithColumnApply[U] {
    def apply[A, TRep <: HList, URep <: HList, UKeys <: HList, NewFields <: HList, NewKeys <: HList, NewKey <: Symbol]
    (ca: TypedColumn[T, A])
    (implicit
      i0: TypedEncoder[U],
      i1: TypedEncoder[A],
      i2: LabelledGeneric.Aux[T, TRep],
      i3: LabelledGeneric.Aux[U, URep],
      i4: Diff.Aux[TRep, URep, HNil],
      i5: Diff.Aux[URep, TRep, NewFields],
      i6: Keys.Aux[NewFields, NewKeys],
      i7: IsHCons.Aux[NewKeys, NewKey, HNil],
      i8: IsHCons.Aux[NewFields, FieldType[NewKey, A], HNil],
      i9: Keys.Aux[URep, UKeys],
      iA: ToTraversable.Aux[UKeys, Seq, Symbol]
    ): TypedDataset[U] = {
      val newColumnName =
        i7.head(i6()).name

      val dfWithNewColumn = dataset
        .toDF()
        .withColumn(newColumnName, ca.untyped)

      val newColumns = i9.apply().to[Seq].map(_.name).map(dfWithNewColumn.col)

      val selected = dfWithNewColumn
        .select(newColumns: _*)
        .as[U](TypedExpressionEncoder[U])

      TypedDataset.create[U](selected)
    }
  }

  /**
    * Explodes a single column at a time. It only compiles if the type of column supports this operation.
    *
    * @example
    *
    * {{{
    *   case class X(i: Int, j: Array[Int])
    *   case class Y(i: Int, j: Int)
    *
    *   val f: TypedDataset[X] = ???
    *   val fNew: TypedDataset[Y] = f.explode('j).as[Y]
    * }}}
    * @param column the column we wish to explode
    */
  def explode[A, TRep <: HList, V[_], OutMod <: HList, OutModValues <: HList, Out]
  (column: Witness.Lt[Symbol])
  (implicit
   i0: TypedColumn.Exists[T, column.T, V[A]],
   i1: TypedEncoder[A],
   i2: CatalystExplodableCollection[V],
   i3: LabelledGeneric.Aux[T, TRep],
   i4: Modifier.Aux[TRep, column.T, V[A], A, OutMod],
   i5: Values.Aux[OutMod, OutModValues],
   i6: Tupler.Aux[OutModValues, Out],
   i7: TypedEncoder[Out]
  ): TypedDataset[Out] = {
    import org.apache.spark.sql.functions.{explode => sparkExplode}
    val df = dataset.toDF()

    val trans =
      df
        .withColumn(column.value.name, sparkExplode(df(column.value.name)))
        .as[Out](TypedExpressionEncoder[Out])
    TypedDataset.create[Out](trans)
  }

  /**
    * Explodes a single column at a time. It only compiles if the type of column supports this operation.
    *
    * @example
    *
    * {{{
    *   case class X(i: Int, j: Map[Int, Int])
    *   case class Y(i: Int, j: (Int, Int))
    *
    *   val f: TypedDataset[X] = ???
    *   val fNew: TypedDataset[Y] = f.explodeMap('j).as[Y]
    * }}}
    * @param column the column we wish to explode
    */
  def explodeMap[A, B, V[_, _], TRep <: HList, OutMod <: HList, OutModValues <: HList, Out]
  (column: Witness.Lt[Symbol])
  (implicit
   i0: TypedColumn.Exists[T, column.T, V[A, B]],
   i1: TypedEncoder[A],
   i2: TypedEncoder[B],
   i3: LabelledGeneric.Aux[T, TRep],
   i4: Modifier.Aux[TRep, column.T, V[A,B], (A, B), OutMod],
   i5: Values.Aux[OutMod, OutModValues],
   i6: Tupler.Aux[OutModValues, Out],
   i7: TypedEncoder[Out]
  ): TypedDataset[Out] = {
    import org.apache.spark.sql.functions.{explode => sparkExplode, struct => sparkStruct, col => sparkCol}
    val df = dataset.toDF()

    // select all columns, all original columns and [key, value] columns appeared after the map explode
    // .withColumn(column.value.name, sparkExplode(df(column.value.name))) in this case would not work
    // since the map explode produces two columns
    val columnNames = df.columns.toSeq
    val columnNamesRenamed = columnNames.map(c => s"frameless_$c")

    // preserve the original list of renamed columns
    val columns = columnNamesRenamed.map(sparkCol)

    val columnRenamed = s"frameless_${column.value.name}"
    // explode of a map adds "key" and "value" columns into the Row
    // this may cause col namings collision: row could already contain key / value columns
    // we rename the original Row columns to avoid this collision
    val dfr = df.toDF(columnNamesRenamed: _*)
    val exploded = dfr.select(sparkCol("*"), sparkExplode(dfr(columnRenamed)))
    val trans =
      exploded
        // map explode explodes it into [key, value] columns
        // the only way to put it into a column is to create a struct
        .withColumn(columnRenamed, sparkStruct(exploded("key"), exploded("value")))
        // selecting only original columns, we don't need [key, value] columns left in the DataFrame after the map explode
        .select(columns: _*)
        // rename columns back and form the result
        .toDF(columnNames: _*)
        .as[Out](TypedExpressionEncoder[Out])
    TypedDataset.create[Out](trans)
  }

  /**
    * Flattens a column of type Option[A]. Compiles only if the selected column is of type Option[A].
    *
    *
    * @example
    *
    * {{{
    *   case class X(i: Int, j: Option[Int])
    *   case class Y(i: Int, j: Int)
    *
    *   val f: TypedDataset[X] = ???
    *   val fNew: TypedDataset[Y] = f.flattenOption('j).as[Y]
    * }}}
    *
    * @param column the column we wish to flatten
    */
  def flattenOption[A, TRep <: HList, V[_], OutMod <: HList, OutModValues <: HList, Out]
  (column: Witness.Lt[Symbol])
  (implicit
   i0: TypedColumn.Exists[T, column.T, V[A]],
   i1: TypedEncoder[A],
   i2: V[A] =:= Option[A],
   i3: LabelledGeneric.Aux[T, TRep],
   i4: Modifier.Aux[TRep, column.T, V[A], A, OutMod],
   i5: Values.Aux[OutMod, OutModValues],
   i6: Tupler.Aux[OutModValues, Out],
   i7: TypedEncoder[Out]
  ): TypedDataset[Out] = {
    val df = dataset.toDF()
    val trans = df.filter(df(column.value.name).isNotNull).
      as[Out](TypedExpressionEncoder[Out])

    TypedDataset.create[Out](trans)
  }
}

object TypedDataset {
  def create[A](data: Seq[A])
    (implicit
      encoder: TypedEncoder[A],
      sqlContext: SparkSession
    ): TypedDataset[A] = {
      val dataset = sqlContext.createDataset(data)(TypedExpressionEncoder[A])

      TypedDataset.create[A](dataset)
    }

  def create[A](data: RDD[A])
    (implicit
      encoder: TypedEncoder[A],
      sqlContext: SparkSession
    ): TypedDataset[A] = {
      val dataset = sqlContext.createDataset(data)(TypedExpressionEncoder[A])

      TypedDataset.create[A](dataset)
    }

  def create[A: TypedEncoder](dataset: Dataset[A]): TypedDataset[A] =
    createUnsafe(dataset.toDF())

  /**
    * Creates a [[frameless.TypedDataset]] from a Spark [[org.apache.spark.sql.DataFrame]].
    * Note that the names and types need to align!
    *
    * This is an unsafe operation: If the schemas do not align,
    * the error will be captured at runtime (not during compilation).
    */
  def createUnsafe[A: TypedEncoder](df: DataFrame): TypedDataset[A] = {
    val e = TypedEncoder[A]
    val output: Seq[Attribute] = df.queryExecution.analyzed.output

    val targetFields = TypedExpressionEncoder.targetStructType(e)
    val targetColNames: Seq[String] = targetFields.map(_.name)

    if (output.size != targetFields.size) {
      throw new IllegalStateException(
        s"Unsupported creation of TypedDataset with ${targetFields.size} column(s) " +
          s"from a DataFrame with ${output.size} columns. " +
          "Try to `select()` the proper columns in the right order before calling `create()`.")
    }

    // Adapt names if they are not the same (note: types still might not match)
    val shouldReshape = output.zip(targetColNames).exists {
      case (expr, colName) => expr.name != colName
    }
    val canSelect = targetColNames.toSet.subsetOf(output.map(_.name).toSet)

    val reshaped = if (shouldReshape && canSelect) {
      df.select(targetColNames.head, targetColNames.tail:_*)
    } else if (shouldReshape) {
      df.toDF(targetColNames: _*)
    } else {
      df
    }

    new TypedDataset[A](reshaped.as[A](TypedExpressionEncoder[A]))
  }

  /** Prefer `TypedDataset.create` over `TypedDataset.unsafeCreate` unless you
    * know what you are doing. */
  @deprecated("Prefer TypedDataset.create over TypedDataset.unsafeCreate", "0.3.0")
  def unsafeCreate[A: TypedEncoder](dataset: Dataset[A]): TypedDataset[A] = {
    new TypedDataset[A](dataset)
  }
}


================================================
FILE: dataset/src/main/scala/frameless/TypedDatasetForwarded.scala
================================================
package frameless

import java.util

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.streaming.DataStreamWriter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, DataFrameWriter, SQLContext, SparkSession}
import org.apache.spark.storage.StorageLevel

import scala.util.Random

/** This trait implements [[TypedDataset]] methods that have the same signature
  * than their `Dataset` equivalent. Each method simply forwards the call to the
  * underlying `Dataset`.
  *
  * Documentation marked "apache/spark" is thanks to apache/spark Contributors
  * at https://github.com/apache/spark, licensed under Apache v2.0 available at
  * http://www.apache.org/licenses/LICENSE-2.0
  */
trait TypedDatasetForwarded[T] { self: TypedDataset[T] =>

  override def toString: String =
    dataset.toString

  /**
    * Returns a `SparkSession` from this [[TypedDataset]].
    */
  def sparkSession: SparkSession =
    dataset.sparkSession

  /**
    * Returns a `SQLContext` from this [[TypedDataset]].
    */
  def sqlContext: SQLContext =
    dataset.sqlContext

  /**
    * Returns the schema of this Dataset.
    *
    * apache/spark
    */
  def schema: StructType =
    dataset.schema

  /** Prints the schema of the underlying `Dataset` to the console in a nice tree format.
    *
    * apache/spark
   */
  def printSchema(): Unit =
    dataset.printSchema()

  /** Prints the plans (logical and physical) to the console for debugging purposes.
    *
    * apache/spark
   */
  def explain(extended: Boolean = false): Unit =
    dataset.explain(extended)

  /**
    * Returns a `QueryExecution` from this [[TypedDataset]].
    *
    * It is the primary workflow for executing relational queries using Spark.  Designed to allow easy
    * access to the intermediate phases of query execution for developers.
    *
    * apache/spark
    */
  def queryExecution: QueryExecution =
    dataset.queryExecution

  /** Converts this strongly typed collection of data to generic Dataframe.  In contrast to the
    * strongly typed objects that Dataset operations work on, a Dataframe returns generic Row
    * objects that allow fields to be accessed by ordinal or name.
    *
    * apache/spark
    */
  def toDF(): DataFrame =
    dataset.toDF()

  /** Converts this [[TypedDataset]] to an RDD.
    *
    * apache/spark
    */
  def rdd: RDD[T] =
    dataset.rdd

  /** Returns a new [[TypedDataset]] that has exactly `numPartitions` partitions.
    *
    * apache/spark
    */
  def repartition(numPartitions: Int): TypedDataset[T] =
    TypedDataset.create(dataset.repartition(numPartitions))


  /**
    * Get the [[TypedDataset]]'s current storage level, or StorageLevel.NONE if not persisted.
    *
    * apache/spark
    */
  def storageLevel(): StorageLevel =
    dataset.storageLevel

  /**
    * Returns the content of the [[TypedDataset]] as a Dataset of JSON strings.
    *
    * apache/spark
    */
  def toJSON: TypedDataset[String] =
    TypedDataset.create(dataset.toJSON)

  /**
    * Interface for saving the content of the non-streaming [[TypedDataset]] out into external storage.
    *
    * apache/spark
    */
  def write: DataFrameWriter[T] =
    dataset.write

  /**
    * Interface for saving the content of the streaming Dataset out into external storage.
    *
    * apache/spark
    */
  def writeStream: DataStreamWriter[T] =
    dataset.writeStream
    
  /** Returns a new [[TypedDataset]] that has exactly `numPartitions` partitions.
    * Similar to coalesce defined on an RDD, this operation results in a narrow dependency, e.g.
    * if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of
    * the 100 new partitions will claim 10 of the current partitions.
    *
    * apache/spark
    */
  def coalesce(numPartitions: Int): TypedDataset[T] =
    TypedDataset.create(dataset.coalesce(numPartitions))

  /**
    * Returns an `Array` that contains all column names in this [[TypedDataset]].
    */
  def columns: Array[String] =
    dataset.columns

  /** Concise syntax for chaining custom transformations.
    *
    * apache/spark
    */
  def transform[U](t: TypedDataset[T] => TypedDataset[U]): TypedDataset[U] =
    t(this)

  /** Returns a new Dataset by taking the first `n` rows. The difference between this function
    * and `head` is that `head` is an action and returns an array (by triggering query execution)
    * while `limit` returns a new Dataset.
    *
    * apache/spark
    */
  def limit(n: Int): TypedDataset[T] =
    TypedDataset.create(dataset.limit(n))

  /** Returns a new [[TypedDataset]] by sampling a fraction of records.
    *
    * apache/spark
    */
  def sample(withReplacement: Boolean, fraction: Double, seed: Long = Random.nextLong()): TypedDataset[T] =
    TypedDataset.create(dataset.sample(withReplacement, fraction, seed))

  /** Returns a new [[TypedDataset]] that contains only the unique elements of this [[TypedDataset]].
    *
    * Note that, equality checking is performed directly on the encoded representation of the data
    * and thus is not affected by a custom `equals` function defined on `T`.
    *
    * apache/spark
    */
  def distinct: TypedDataset[T] =
    TypedDataset.create(dataset.distinct())

  /**
    * Returns a best-effort snapshot of the files that compose this [[TypedDataset]]. This method simply
    * asks each constituent BaseRelation for its respective files and takes the union of all results.
    * Depending on the source relations, this may not find all input files. Duplicates are removed.
    *
    * apache/spark
    */

  def inputFiles: Array[String] =
    dataset.inputFiles

  /**
    * Returns true if the `collect` and `take` methods can be run locally
    * (without any Spark executors).
    *
    * apache/spark
    */
  def isLocal: Boolean =
    dataset.isLocal

  /**
    * Returns true if this [[TypedDataset]] contains one or more sources that continuously
    * return data as it arrives. A [[TypedDataset]] that reads data from a streaming source
    * must be executed as a `StreamingQuery` using the `start()` method in
    * `DataStreamWriter`. Methods that return a single answer, e.g. `count()` or
    * `collect()`, will throw an `AnalysisException` when there is a streaming
    * source present.
    *
    * apache/spark
    */
  def isStreaming: Boolean =
    dataset.isStreaming

  /** Returns a new [[TypedDataset]] that contains only the elements of this [[TypedDataset]] that are also
    * present in `other`.
    *
    * Note that, equality checking is performed directly on the encoded representation of the data
    * and thus is not affected by a custom `equals` function defined on `T`.
    *
    * apache/spark
    */
  def intersect(other: TypedDataset[T]): TypedDataset[T] =
    TypedDataset.create(dataset.intersect(other.dataset))

  /**
    * Randomly splits this [[TypedDataset]] with the provided weights.
    * Weights for splits, will be normalized if they don't sum to 1.
    *
    * apache/spark
    */
  // $COVERAGE-OFF$ We can not test this method because it is non-deterministic.
  def randomSplit(weights: Array[Double]): Array[TypedDataset[T]] =
    dataset.randomSplit(weights).map(TypedDataset.create[T])
  // $COVERAGE-ON$

  /**
    * Randomly splits this [[TypedDataset]] with the provided weights.
    * Weights for splits, will be normalized if they don't sum to 1.
    *
    * apache/spark
    */
  def randomSplit(weights: Array[Double], seed: Long): Array[TypedDataset[T]] =
    dataset.randomSplit(weights, seed).map(TypedDataset.create[T])

  /**
    * Returns a Java list that contains randomly split [[TypedDataset]] with the provided weights.
    * Weights for splits, will be normalized if they don't sum to 1.
    *
    * apache/spark
    */
  def randomSplitAsList(weights: Array[Double], seed: Long): util.List[TypedDataset[T]] = {
    val values = randomSplit(weights, seed)
    java.util.Arrays.asList(values: _*)
  }


  /** Returns a new Dataset containing rows in this Dataset but not in another Dataset.
    * This is equivalent to `EXCEPT` in SQL.
    *
    * Note that, equality checking is performed directly on the encoded representation of the data
    * and thus is not affected by a custom `equals` function defined on `T`.
    *
    * apache/spark
    */
  def except(other: TypedDataset[T]): TypedDataset[T] =
    TypedDataset.create(dataset.except(other.dataset))

  /** Persist this [[TypedDataset]] with the default storage level (`MEMORY_AND_DISK`).
    *
    * apache/spark
    */
  def cache(): TypedDataset[T] =
    TypedDataset.create(dataset.cache())

  /** Persist this [[TypedDataset]] with the given storage level.
    * @param newLevel One of: `MEMORY_ONLY`, `MEMORY_AND_DISK`, `MEMORY_ONLY_SER`,
    *   `MEMORY_AND_DISK_SER`, `DISK_ONLY`, `MEMORY_ONLY_2`, `MEMORY_AND_DISK_2`, etc.
    *
    * apache/spark
    */
  def persist(newLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK): TypedDataset[T] =
    TypedDataset.create(dataset.persist(newLevel))

  /** Mark the [[TypedDataset]] as non-persistent, and remove all blocks for it from memory and disk.
    * @param blocking Whether to block until all blocks are deleted.
    *
    * apache/spark
    */
  def unpersist(blocking: Boolean = false): TypedDataset[T] =
    TypedDataset.create(dataset.unpersist(blocking))

  // $COVERAGE-OFF$ We do not test deprecated method since forwarded methods are tested.
  @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0")
  def map[U: TypedEncoder](func: T => U): TypedDataset[U] =
    deserialized.map(func)

  @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0")
  def mapPartitions[U: TypedEncoder](func: Iterator[T] => Iterator[U]): TypedDataset[U] =
    deserialized.mapPartitions(func)

  @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0")
  def flatMap[U: TypedEncoder](func: T => TraversableOnce[U]): TypedDataset[U] =
    deserialized.flatMap(func)

  @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0")
  def filter(func: T => Boolean): TypedDataset[T] =
    deserialized.filter(func)

  @deprecated("deserialized methods have moved to a separate section to highlight their runtime overhead", "0.4.0")
  def reduceOption[F[_]: SparkDelay](func: (T, T) => T): F[Option[T]] =
    deserialized.reduceOption(func)
  // $COVERAGE-ON$

  /** Methods on `TypedDataset[T]` that go through a full serialization and
    * deserialization of `T`, and execute outside of the Catalyst runtime.
    *
    * @example The correct way to do a projection on a single column is to
    *          use the `select` method as follows:
    *
    *          {{{
    *           ds: TypedDataset[(String, String, String)] -> ds.select(ds('_2)).run()
    *          }}}
    *
    *          Spark provides an alternative way to obtain the same resulting `Dataset`,
    *          using the `map` method:
    *
    *          {{{
    *           ds: TypedDataset[(String, String, String)] -> ds.deserialized.map(_._2).run()
    *          }}}
    *
    *          This second approach is however substantially slower than the first one,
    *          and should be avoided as possible. Indeed, under the hood this `map` will
    *          deserialize the entire `Tuple3` to an full JVM object, call the apply
    *          method of the `_._2` closure on it, and serialize the resulting String back
    *          to its Catalyst representation.
    */
  object deserialized {
    /** Returns a new [[TypedDataset]] that contains the result of applying `func` to each element.
      *
      * apache/spark
      */
    def map[U: TypedEncoder](func: T => U): TypedDataset[U] =
      TypedDataset.create(self.dataset.map(func)(TypedExpressionEncoder[U]))

    /** Returns a new [[TypedDataset]] that contains the result of applying `func` to each partition.
      *
      * apache/spark
      */
    def mapPartitions[U: TypedEncoder](func: Iterator[T] => Iterator[U]): TypedDataset[U] =
      TypedDataset.create(self.dataset.mapPartitions(func)(TypedExpressionEncoder[U]))

    /** Returns a new [[TypedDataset]] by first applying a function to all elements of this [[TypedDataset]],
      * and then flattening the results.
      *
      * apache/spark
      */
    def flatMap[U: TypedEncoder](func: T => TraversableOnce[U]): TypedDataset[U] =
      TypedDataset.create(self.dataset.flatMap(func)(TypedExpressionEncoder[U]))

    /** Returns a new [[TypedDataset]] that only contains elements where `func` returns `true`.
      *
      * apache/spark
      */
    def filter(func: T => Boolean): TypedDataset[T] =
      TypedDataset.create(self.dataset.filter(func))

    /** Optionally reduces the elements of this [[TypedDataset]] using the specified binary function. The given
      * `func` must be commutative and associative or the result may be non-deterministic.
      *
      * Differs from `Dataset#reduce` by wrapping its result into an `Option` and an effect-suspending `F`.
      */
    def reduceOption[F[_]](func: (T, T) => T)(implicit F: SparkDelay[F]): F[Option[T]] =
      F.delay {
        try {
          Option(self.dataset.reduce(func))
        } catch {
          case _: UnsupportedOperationException => None
        }
      }(self.dataset.sparkSession)
  }
}


================================================
FILE: dataset/src/main/scala/frameless/TypedEncoder.scala
================================================
package frameless

import java.math.BigInteger

import java.util.Date

import java.time.{ Duration, Instant, Period, LocalDate }

import java.sql.Timestamp

import scala.reflect.ClassTag

import org.apache.spark.sql.FramelessInternals
import org.apache.spark.sql.FramelessInternals.UserDefinedType
import org.apache.spark.sql.{ reflection => ScalaReflection }
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.objects._
import org.apache.spark.sql.catalyst.util.{
  ArrayBasedMapData,
  DateTimeUtils,
  GenericArrayData
}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

import shapeless._
import shapeless.ops.hlist.IsHCons

abstract class TypedEncoder[T](
    implicit
    val classTag: ClassTag[T])
    extends Serializable {
  def nullable: Boolean

  def jvmRepr: DataType
  def catalystRepr: DataType

  /**
   * From Catalyst representation to T
   */
  def fromCatalyst(path: Expression): Expression

  /**
   * T to Catalyst representation
   */
  def toCatalyst(path: Expression): Expression
}

// Waiting on scala 2.12
// @annotation.implicitAmbiguous(msg =
// """TypedEncoder[${T}] can be obtained from automatic type class derivation, using the implicit Injection[${T}, ?] or using the implicit UserDefinedType[${T}] in scope.
// To desambigious this resolution you need to either:
//   - Remove the implicit Injection[${T}, ?] from scope
//   - Remove the implicit UserDefinedType[${T}] from scope
//   - import TypedEncoder.usingInjection
//   - import TypedEncoder.usingDerivation
//   - import TypedEncoder.usingUserDefinedType
// """)
object TypedEncoder {
  def apply[T: TypedEncoder]: TypedEncoder[T] = implicitly[TypedEncoder[T]]

  implicit val stringEncoder: TypedEncoder[String] = new TypedEncoder[String] {
    def nullable: Boolean = false

    def jvmRepr: DataType = FramelessInternals.objectTypeFor[String]
    def catalystRepr: DataType = StringType

    def toCatalyst(path: Expression): Expression =
      StaticInvoke(classOf[UTF8String], catalystRepr, "fromString", path :: Nil)

    def fromCatalyst(path: Expression): Expression =
      Invoke(path, "toString", jvmRepr)

    override val toString = "stringEncoder"
  }

  implicit val booleanEncoder: TypedEncoder[Boolean] =
    new TypedEncoder[Boolean] {
      def nullable: Boolean = false

      def jvmRepr: DataType = BooleanType
      def catalystRepr: DataType = BooleanType

      def toCatalyst(path: Expression): Expression = path
      def fromCatalyst(path: Expression): Expression = path
    }

  implicit val intEncoder: TypedEncoder[Int] = new TypedEncoder[Int] {
    def nullable: Boolean = false

    def jvmRepr: DataType = IntegerType
    def catalystRepr: DataType = IntegerType

    def toCatalyst(path: Expression): Expression = path
    def fromCatalyst(path: Expression): Expression = path

    override def toString = "intEncoder"
  }

  implicit val longEncoder: TypedEncoder[Long] = new TypedEncoder[Long] {
    def nullable: Boolean = false

    def jvmRepr: DataType = LongType
    def catalystRepr: DataType = LongType

    def toCatalyst(path: Expression): Expression = path
    def fromCatalyst(path: Expression): Expression = path
  }

  implicit val shortEncoder: TypedEncoder[Short] = new TypedEncoder[Short] {
    def nullable: Boolean = false

    def jvmRepr: DataType = ShortType
    def catalystRepr: DataType = ShortType

    def toCatalyst(path: Expression): Expression = path
    def fromCatalyst(path: Expression): Expression = path
  }

  implicit val charEncoder: TypedEncoder[Char] = new TypedEncoder[Char] {

    // tricky because while Char is primitive type, Spark doesn't support it
    implicit val charAsString: Injection[java.lang.Character, String] =
      new Injection[java.lang.Character, String] {
        def apply(a: java.lang.Character): String = String.valueOf(a)

        def invert(b: String): java.lang.Character = {
          require(b.length == 1)
          b.charAt(0)
        }
      }

    val underlying = usingInjection[java.lang.Character, String]

    def nullable: Boolean = false

    // this line fixes underlying encoder
    def jvmRepr: DataType =
      FramelessInternals.objectTypeFor[java.lang.Character]

    def catalystRepr: DataType = StringType

    def toCatalyst(path: Expression): Expression = underlying.toCatalyst(path)

    def fromCatalyst(path: Expression): Expression =
      underlying.fromCatalyst(path)
  }

  implicit val byteEncoder: TypedEncoder[Byte] = new TypedEncoder[Byte] {
    def nullable: Boolean = false

    def jvmRepr: DataType = ByteType
    def catalystRepr: DataType = ByteType

    def toCatalyst(path: Expression): Expression = path
    def fromCatalyst(path: Expression): Expression = path
  }

  implicit val floatEncoder: TypedEncoder[Float] = new TypedEncoder[Float] {
    def nullable: Boolean = false

    def jvmRepr: DataType = FloatType
    def catalystRepr: DataType = FloatType

    def toCatalyst(path: Expression): Expression = path
    def fromCatalyst(path: Expression): Expression = path
  }

  implicit val doubleEncoder: TypedEncoder[Double] = new TypedEncoder[Double] {
    def nullable: Boolean = false

    def jvmRepr: DataType = DoubleType
    def catalystRepr: DataType = DoubleType

    def toCatalyst(path: Expression): Expression = path
    def fromCatalyst(path: Expression): Expression = path
  }

  implicit val bigDecimalEncoder: TypedEncoder[BigDecimal] =
    new TypedEncoder[BigDecimal] {
      def nullable: Boolean = false

      def jvmRepr: DataType = ScalaReflection.dataTypeFor[BigDecimal]
      def catalystRepr: DataType = DecimalType.SYSTEM_DEFAULT

      def toCatalyst(path: Expression): Expression =
        StaticInvoke(
          Decimal.getClass,
          DecimalType.SYSTEM_DEFAULT,
          "apply",
          path :: Nil
        )

      def fromCatalyst(path: Expression): Expression =
        Invoke(path, "toBigDecimal", jvmRepr)

      override def toString: String = "bigDecimalEncoder"
    }

  implicit val javaBigDecimalEncoder: TypedEncoder[java.math.BigDecimal] =
    new TypedEncoder[java.math.BigDecimal] {
      def nullable: Boolean = false

      def jvmRepr: DataType = ScalaReflection.dataTypeFor[java.math.BigDecimal]
      def catalystRepr: DataType = DecimalType.SYSTEM_DEFAULT

      def toCatalyst(path: Expression): Expression =
        StaticInvoke(
          Decimal.getClass,
          DecimalType.SYSTEM_DEFAULT,
          "apply",
          path :: Nil
        )

      def fromCatalyst(path: Expression): Expression =
        Invoke(path, "toJavaBigDecimal", jvmRepr)

      override def toString: String = "javaBigDecimalEncoder"
    }

  implicit val bigIntEncoder: TypedEncoder[BigInt] = new TypedEncoder[BigInt] {
    def nullable: Boolean = false

    def jvmRepr: DataType = ScalaReflection.dataTypeFor[BigInt]
    def catalystRepr: DataType = DecimalType(DecimalType.MAX_PRECISION, 0)

    def toCatalyst(path: Expression): Expression =
      StaticInvoke(
        Decimal.getClass,
        catalystRepr,
        "apply",
        path :: Nil
      )

    def fromCatalyst(path: Expression): Expression =
      Invoke(path, "toScalaBigInt", jvmRepr)

    override def toString: String = "bigIntEncoder"
  }

  implicit val javaBigIntEncoder: TypedEncoder[BigInteger] =
    new TypedEncoder[BigInteger] {
      def nullable: Boolean = false

      def jvmRepr: DataType = ScalaReflection.dataTypeFor[BigInteger]
      def catalystRepr: DataType = DecimalType(DecimalType.MAX_PRECISION, 0)

      def toCatalyst(path: Expression): Expression =
        StaticInvoke(
          Decimal.getClass,
          catalystRepr,
          "apply",
          path :: Nil
        )

      def fromCatalyst(path: Expression): Expression =
        Invoke(path, "toJavaBigInteger", jvmRepr)

      override def toString: String = "javaBigIntEncoder"
    }

  implicit val sqlDate: TypedEncoder[SQLDate] = new TypedEncoder[SQLDate] {
    def nullable: Boolean = false

    def jvmRepr: DataType = ScalaReflection.dataTypeFor[SQLDate]
    def catalystRepr: DataType = DateType

    def toCatalyst(path: Expression): Expression =
      Invoke(path, "days", DateType)

    def fromCatalyst(path: Expression): Expression =
      StaticInvoke(
        staticObject = SQLDate.getClass,
        dataType = jvmRepr,
        functionName = "apply",
        arguments = path :: Nil,
        propagateNull = true
      )
  }

  implicit val timestampEncoder: TypedEncoder[Timestamp] =
    new TypedEncoder[Timestamp] {
      def nullable: Boolean = false

      def jvmRepr: DataType = ScalaReflection.dataTypeFor[Timestamp]
      def catalystRepr: DataType = TimestampType

      def toCatalyst(path: Expression): Expression =
        StaticInvoke(
          DateTimeUtils.getClass,
          TimestampType,
          "fromJavaTimestamp",
          path :: Nil,
          returnNullable = false
        )

      def fromCatalyst(path: Expression): Expression =
        StaticInvoke(
          staticObject = DateTimeUtils.getClass,
          dataType = jvmRepr,
          functionName = "toJavaTimestamp",
          arguments = path :: Nil,
          propagateNull = true
        )

      override def toString: String = "timestampEncoder"
    }

  implicit val dateEncoder: TypedEncoder[Date] = new TypedEncoder[Date] {
    def nullable: Boolean = false

    def jvmRepr: DataType = ScalaReflection.dataTypeFor[Date]
    def catalystRepr: DataType = TimestampType

    private val instantRepr = ScalaReflection.dataTypeFor[Instant]

    def toCatalyst(path: Expression): Expression =
      timeInstant.toCatalyst(Invoke(path, "toInstant", instantRepr))

    def fromCatalyst(path: Expression): Expression =
      StaticInvoke(
        staticObject = classOf[Date],
        dataType = jvmRepr,
        functionName = "from",
        arguments = timeInstant.fromCatalyst(path) :: Nil,
        propagateNull = true
      )

    override def toString: String = "dateEncoder"
  }

  implicit val sqlDateEncoder: TypedEncoder[java.sql.Date] =
    new TypedEncoder[java.sql.Date] {
      def nullable: Boolean = false

      def jvmRepr: DataType = ScalaReflection.dataTypeFor[java.sql.Date]
      def catalystRepr: DataType = DateType

      def toCatalyst(path: Expression): Expression =
        StaticInvoke(
          staticObject = DateTimeUtils.getClass,
          dataType = catalystRepr,
          functionName = "fromJavaDate",
          arguments = path :: Nil,
          propagateNull = true
        )

      private val localDateRepr = ScalaReflection.dataTypeFor[LocalDate]

      def fromCatalyst(path: Expression): Expression = {
        val toLocalDate = StaticInvoke(
          staticObject = DateTimeUtils.getClass,
          dataType = localDateRepr,
          functionName = "daysToLocalDate",
          arguments = path :: Nil,
          propagateNull = true
        )

        StaticInvoke(
          staticObject = classOf[java.sql.Date],
          dataType = jvmRepr,
          functionName = "valueOf",
          arguments = toLocalDate :: Nil,
          propagateNull = true
        )
      }

      override def toString: String = "sqlDateEncoder"
    }

  implicit val sqlTimestamp: TypedEncoder[SQLTimestamp] =
    new TypedEncoder[SQLTimestamp] {
      def nullable: Boolean = false

      def jvmRepr: DataType = ScalaReflection.dataTypeFor[SQLTimestamp]
      def catalystRepr: DataType = TimestampType

      def toCatalyst(path: Expression): Expression =
        Invoke(path, "us", TimestampType)

      def fromCatalyst(path: Expression): Expression =
        StaticInvoke(
          staticObject = SQLTimestamp.getClass,
          dataType = jvmRepr,
          functionName = "apply",
          arguments = path :: Nil,
          propagateNull = true
        )
    }

  /** java.time Encoders, Spark uses https://github.com/apache/spark/blob/v3.2.0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala for encoding / decoding. */
  implicit val timeInstant: TypedEncoder[Instant] = new TypedEncoder[Instant] {
    def nullable: Boolean = false

    def jvmRepr: DataType = ScalaReflection.dataTypeFor[Instant]
    def catalystRepr: DataType = TimestampType

    def toCatalyst(path: Expression): Expression =
      StaticInvoke(
        DateTimeUtils.getClass,
        TimestampType,
        "instantToMicros",
        path :: Nil,
        returnNullable = false
      )

    def fromCatalyst(path: Expression): Expression =
      StaticInvoke(
        staticObject = DateTimeUtils.getClass,
        dataType = jvmRepr,
        functionName = "microsToInstant",
        arguments = path :: Nil,
        propagateNull = true
      )
  }

  /**
   * DayTimeIntervalType and YearMonthIntervalType in Spark 3.2.0.
   * We maintain Spark 3.x cross compilation and handle Duration and Period as an injections to be compatible with Spark versions < 3.2
   * See
   *  * https://github.com/apache/spark/blob/v3.2.0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala#L1031-L1047
   *  * https://github.com/apache/spark/blob/v3.2.0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala#L1075-L1087
   */
  // DayTimeIntervalType
  implicit val timeDurationInjection: Injection[Duration, Long] =
    Injection(_.toMillis, Duration.ofMillis)

  // YearMonthIntervalType
  implicit val timePeriodInjection: Injection[Period, Int] =
    Injection(_.getDays, Period.ofDays)

  implicit val timePeriodEncoder: TypedEncoder[Period] =
    TypedEncoder.usingInjection

  implicit val timeDurationEncoder: TypedEncoder[Duration] =
    TypedEncoder.usingInjection

  implicit def arrayEncoder[T: ClassTag](
      implicit
      i0: Lazy[RecordFieldEncoder[T]]
    ): TypedEncoder[Array[T]] =
    new TypedEncoder[Array[T]] {
      private lazy val encodeT = i0.value.encoder

      def nullable: Boolean = false

      lazy val jvmRepr: DataType = i0.value.jvmRepr match {
        case ByteType => BinaryType
        case _        => FramelessInternals.objectTypeFor[Array[T]]
      }

      lazy val catalystRepr: DataType = i0.value.jvmRepr match {
        case ByteType => BinaryType
        case _        => ArrayType(encodeT.catalystRepr, encodeT.nullable)
      }

      def toCatalyst(path: Expression): Expression = {
        val enc = i0.value

        enc.jvmRepr match {
          case IntegerType | LongType | DoubleType | FloatType | ShortType |
              BooleanType =>
            StaticInvoke(
              classOf[UnsafeArrayData],
              catalystRepr,
              "fromPrimitiveArray",
              path :: Nil
            )

          case ByteType => path

          case _ =>
            MapObjects(enc.toCatalyst, path, enc.jvmRepr, encodeT.nullable)
        }
      }

      def fromCatalyst(path: Expression): Expression =
        encodeT.jvmRepr match {
          case IntegerType => Invoke(path, "toIntArray", jvmRepr)
          case LongType    => Invoke(path, "toLongArray", jvmRepr)
          case DoubleType  => Invoke(path, "toDoubleArray", jvmRepr)
          case FloatType   => Invoke(path, "toFloatArray", jvmRepr)
          case ShortType   => Invoke(path, "toShortArray", jvmRepr)
          case BooleanType => Invoke(path, "toBooleanArray", jvmRepr)

          case ByteType => path

          case _ =>
            Invoke(
              MapObjects(
                i0.value.fromCatalyst,
                path,
                encodeT.catalystRepr,
                encodeT.nullable
              ),
              "array",
              jvmRepr
            )
        }

      override def toString: String = s"arrayEncoder($jvmRepr)"
    }

  implicit def collectionEncoder[C[X] <: Seq[X], T](
      implicit
      i0: Lazy[RecordFieldEncoder[T]],
      i1: ClassTag[C[T]]
    ): TypedEncoder[C[T]] = new TypedEncoder[C[T]] {
    private lazy val encodeT = i0.value.encoder

    def nullable: Boolean = false

    def jvmRepr: DataType = FramelessInternals.objectTypeFor[C[T]](i1)

    def catalystRepr: DataType =
      ArrayType(encodeT.catalystRepr, encodeT.nullable)

    def toCatalyst(path: Expression): Expression = {
      val enc = i0.value

      if (ScalaReflection.isNativeType(enc.jvmRepr)) {
        NewInstance(classOf[GenericArrayData], path :: Nil, catalystRepr)
      } else {
        MapObjects(enc.toCatalyst, path, enc.jvmRepr, encodeT.nullable)
      }
    }

    def fromCatalyst(path: Expression): Expression =
      MapObjects(
        i0.value.fromCatalyst,
        path,
        encodeT.catalystRepr,
        encodeT.nullable,
        Some(i1.runtimeClass) // This will cause MapObjects to build a collection of type C[_] directly
      )

    override def toString: String = s"collectionEncoder($jvmRepr)"
  }

  /**
   * @param i1 implicit lazy `RecordFieldEncoder[T]` to encode individual elements of the set.
   * @param i2 implicit `ClassTag[Set[T]]` to provide runtime information about the set type.
   * @tparam T the element type of the set.
   * @return a `TypedEncoder` instance for `Set[T]`.
   */
  implicit def setEncoder[T](
      implicit
      i1: shapeless.Lazy[RecordFieldEncoder[T]],
      i2: ClassTag[Set[T]]
    ): TypedEncoder[Set[T]] = {
    implicit val inj: Injection[Set[T], Seq[T]] = Injection(_.toSeq, _.toSet)

    TypedEncoder.usingInjection
  }

  /**
   * @tparam A the key type
   * @tparam B the value type
   * @param i0 the keys encoder
   * @param i1 the values encoder
   */
  implicit def mapEncoder[A: NotCatalystNullable, B](
      implicit
      i0: Lazy[RecordFieldEncoder[A]],
      i1: Lazy[RecordFieldEncoder[B]]
    ): TypedEncoder[Map[A, B]] = new TypedEncoder[Map[A, B]] {
    def nullable: Boolean = false

    def jvmRepr: DataType = FramelessInternals.objectTypeFor[Map[A, B]]

    private lazy val encodeA = i0.value.encoder
    private lazy val encodeB = i1.value.encoder

    lazy val catalystRepr: DataType =
      MapType(encodeA.catalystRepr, encodeB.catalystRepr, encodeB.nullable)

    def fromCatalyst(path: Expression): Expression = {
      val keyArrayType = ArrayType(encodeA.catalystRepr, containsNull = false)

      val keyData = Invoke(
        MapObjects(
          i0.value.fromCatalyst,
          Invoke(path, "keyArray", keyArrayType),
          encodeA.catalystRepr
        ),
        "array",
        FramelessInternals.objectTypeFor[Array[Any]]
      )

      val valueArrayType = ArrayType(encodeB.catalystRepr, encodeB.nullable)

      val valueData = Invoke(
        MapObjects(
          i1.value.fromCatalyst,
          Invoke(path, "valueArray", valueArrayType),
          encodeB.catalystRepr
        ),
        "array",
        FramelessInternals.objectTypeFor[Array[Any]]
      )

      StaticInvoke(
        ArrayBasedMapData.getClass,
        jvmRepr,
        "toScalaMap",
        keyData :: valueData :: Nil
      )
    }

    def toCatalyst(path: Expression): Expression = {
      val encA = i0.value
      val encB = i1.value

      ExternalMapToCatalyst(
        path,
        encA.jvmRepr,
        encA.toCatalyst,
        false,
        encB.jvmRepr,
        encB.toCatalyst,
        encodeB.nullable
      )
    }

    override def toString = s"mapEncoder($jvmRepr)"
  }

  implicit def optionEncoder[A](
      implicit
      underlying: TypedEncoder[A]
    ): TypedEncoder[Option[A]] =
    new TypedEncoder[Option[A]] {
      def nullable: Boolean = true

      def jvmRepr: DataType =
        FramelessInternals.objectTypeFor[Option[A]](classTag)

      def catalystRepr: DataType = underlying.catalystRepr

      def toCatalyst(path: Expression): Expression = {
        // for primitive types we must manually unbox the value of the object
        underlying.jvmRepr match {
          case IntegerType =>
            Invoke(
              UnwrapOption(
                ScalaReflection.dataTypeFor[java.lang.Integer],
                path
              ),
              "intValue",
              IntegerType
            )

          case LongType =>
            Invoke(
              UnwrapOption(ScalaReflection.dataTypeFor[java.lang.Long], path),
              "longValue",
              LongType
            )

          case DoubleType =>
            Invoke(
              UnwrapOption(ScalaReflection.dataTypeFor[java.lang.Double], path),
              "doubleValue",
              DoubleType
            )

          case FloatType =>
            Invoke(
              UnwrapOption(ScalaReflection.dataTypeFor[java.lang.Float], path),
              "floatValue",
              FloatType
            )

          case ShortType =>
            Invoke(
              UnwrapOption(ScalaReflection.dataTypeFor[java.lang.Short], path),
              "shortValue",
              ShortType
            )

          case ByteType =>
            Invoke(
              UnwrapOption(ScalaReflection.dataTypeFor[java.lang.Byte], path),
              "byteValue",
              ByteType
            )

          case BooleanType =>
            Invoke(
              UnwrapOption(
                ScalaReflection.dataTypeFor[java.lang.Boolean],
                path
              ),
              "booleanValue",
              BooleanType
            )

          case _ =>
            underlying.toCatalyst(UnwrapOption(underlying.jvmRepr, path))
        }
      }

      def fromCatalyst(path: Expression): Expression =
        WrapOption(underlying.fromCatalyst(path), underlying.jvmRepr)
    }

  /** Encodes things using injection if there is one defined */
  implicit def usingInjection[A: ClassTag, B](
      implicit
      inj: Injection[A, B],
      trb: TypedEncoder[B]
    ): TypedEncoder[A] =
    new TypedEncoder[A] {
      def nullable: Boolean = trb.nullable
      def jvmRepr: DataType = FramelessInternals.objectTypeFor[A](classTag)
      def catalystRepr: DataType = trb.catalystRepr

      def fromCatalyst(path: Expression): Expression = {
        val bexpr = trb.fromCatalyst(path)
        Invoke(Literal.fromObject(inj), "invert", jvmRepr, Seq(bexpr))
      }

      def toCatalyst(path: Expression): Expression =
        trb.toCatalyst(
          Invoke(Literal.fromObject(inj), "apply", trb.jvmRepr, Seq(path))
        )
    }

  /** Encodes things as records if there is no Injection defined */
  implicit def usingDerivation[F, G <: HList, H <: HList](
      implicit
      i0: LabelledGeneric.Aux[F, G],
      i1: DropUnitValues.Aux[G, H],
      i2: IsHCons[H],
      i3: Lazy[RecordEncoderFields[H]],
      i4: Lazy[NewInstanceExprs[G]],
      i5: ClassTag[F]
    ): TypedEncoder[F] = new RecordEncoder[F, G, H]

  /** Encodes things using a Spark SQL's User Defined Type (UDT) if there is one defined in implicit */
  implicit def usingUserDefinedType[
      A >: Null: UserDefinedType: ClassTag
    ]: TypedEncoder[A] = {
    val udt = implicitly[UserDefinedType[A]]
    val udtInstance =
      NewInstance(udt.getClass, Nil, dataType = ObjectType(udt.getClass))

    new TypedEncoder[A] {
      def nullable: Boolean = false
      def jvmRepr: DataType = ObjectType(udt.userClass)
      def catalystRepr: DataType = udt

      def toCatalyst(path: Expression): Expression =
        Invoke(udtInstance, "serialize", udt, Seq(path))

      def fromCatalyst(path: Expression): Expression =
        Invoke(udtInstance, "deserialize", ObjectType(udt.userClass), Seq(path))
    }
  }

  object injections extends InjectionEnum
}


================================================
FILE: dataset/src/main/scala/frameless/TypedExpressionEncoder.scala
================================================
package frameless

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.analysis.GetColumnByOrdinal
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.{BoundReference, CreateNamedStruct, If}
import org.apache.spark.sql.types.StructType

object TypedExpressionEncoder {

  /** In Spark, DataFrame has always schema of StructType
    *
    * DataFrames of primitive types become records 
    * with a single field called "value" set in ExpressionEncoder.
    */
  def targetStructType[A](encoder: TypedEncoder[A]): StructType =
   encoder.catalystRepr match {
      case x: StructType =>
        if (encoder.nullable) StructType(x.fields.map(_.copy(nullable = true)))
        else x

      case dt => new StructType().add("value", dt, nullable = encoder.nullable)
    }

  def apply[T](implicit encoder: TypedEncoder[T]): Encoder[T] = {
    val in = BoundReference(0, encoder.jvmRepr, encoder.nullable)

    val (out, serializer) = encoder.toCatalyst(in) match {
      case it @ If(_, _, _: CreateNamedStruct) => {
        val out = GetColumnByOrdinal(0, encoder.catalystRepr)

        out -> it
      }

      case other => {
        val out = GetColumnByOrdinal(0, encoder.catalystRepr)

        out -> other
      }
    }

    new ExpressionEncoder[T](
      objSerializer = serializer,
      objDeserializer = encoder.fromCatalyst(out),
      clsTag = encoder.classTag
    )
  }
}


================================================
FILE: dataset/src/main/scala/frameless/With.scala
================================================
package frameless

/** Compute the intersection of two types:
  *
  * - With[A, A] = A
  * - With[A, B] = A with B (when A != B)
  *
  * This type function is needed to prevent IDEs from infering large types
  * with shape `A with A with ... with A`. These types could be confusing for
  * both end users and IDE's type checkers.
  */
trait With[A, B] { type Out }

object With extends LowPrioWith {
  implicit def combine[A, B]: Aux[A, B, A with B] = of[A, B, A with B]
}

private[frameless] sealed trait LowPrioWith {
  type Aux[A, B, W] = With[A, B] { type Out = W }

  protected[this] val theInstance = new With[Any, Any] {}

  protected[this] def of[A, B, W]: With[A, B] { type Out = W } =
    theInstance.asInstanceOf[Aux[A, B, W]]

  implicit def identity[T]: Aux[T, T, T] = of[T, T, T]
}


================================================
FILE: dataset/src/main/scala/frameless/functions/AggregateFunctions.scala
================================================
package frameless
package functions

import org.apache.spark.sql.FramelessInternals.expr
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.{functions => sparkFunctions}
import frameless.syntax._

import scala.annotation.nowarn

trait AggregateFunctions {
  /** Aggregate function: returns the number of items in a group.
    *
    * apache/spark
    */
  def count[T](): TypedAggregate[T, Long] =
    sparkFunctions.count(sparkFunctions.lit(1)).typedAggregate

  /** Aggregate function: returns the number of items in a group for which the selected column is not null.
    *
    * apache/spark
    */
  def count[T](column: TypedColumn[T, _]): TypedAggregate[T, Long] =
    sparkFunctions.count(column.untyped).typedAggregate

  /** Aggregate function: returns the number of distinct items in a group.
    *
    * apache/spark
    */
  def countDistinct[T](column: TypedColumn[T, _]): TypedAggregate[T, Long] =
    sparkFunctions.countDistinct(column.untyped).typedAggregate

  /** Aggregate function: returns the approximate number of distinct items in a group.
    */
  def approxCountDistinct[T](column: TypedColumn[T, _]): TypedAggregate[T, Long] =
    sparkFunctions.approx_count_distinct(column.untyped).typedAggregate

  /** Aggregate function: returns the approximate number of distinct items in a group.
    *
    * @param rsd maximum estimation error allowed (default = 0.05)
    *
    * apache/spark
    */
  def approxCountDistinct[T](column: TypedColumn[T, _], rsd: Double): TypedAggregate[T, Long] =
    sparkFunctions.approx_count_distinct(column.untyped, rsd).typedAggregate

  /** Aggregate function: returns a list of objects with duplicates.
    *
    * apache/spark
    */
  def collectList[T, A: TypedEncoder](column: TypedColumn[T, A]): TypedAggregate[T, Vector[A]] =
    sparkFunctions.collect_list(column.untyped).typedAggregate

  /** Aggregate function: returns a set of objects with duplicate elements eliminated.
    *
    * apache/spark
    */
  def collectSet[T, A: TypedEncoder](column: TypedColumn[T, A]): TypedAggregate[T, Vector[A]] =
    sparkFunctions.collect_set(column.untyped).typedAggregate

  /** Aggregate function: returns the sum of all values in the given column.
    *
    * apache/spark
    */
  def sum[A, T, Out](column: TypedColumn[T, A])(
    implicit
    summable: CatalystSummable[A, Out],
    oencoder: TypedEncoder[Out],
    aencoder: TypedEncoder[A]
  ): TypedAggregate[T, Out] = {
    val zeroExpr = Literal.create(summable.zero, TypedEncoder[A].catalystRepr)
    val sumExpr = expr(sparkFunctions.sum(column.untyped))
    val sumOrZero = Coalesce(Seq(sumExpr, zeroExpr))

    new TypedAggregate[T, Out](sumOrZero)
  }

  /** Aggregate function: returns the sum of distinct values in the column.
    *
    * apache/spark
    */
  @nowarn // supress sparkFunctions.sumDistinct call which is used to maintain Spark 3.1.x backwards compat
  def sumDistinct[A, T, Out](column: TypedColumn[T, A])(
    implicit
    summable: CatalystSummable[A, Out],
    oencoder: TypedEncoder[Out],
    aencoder: TypedEncoder[A]
  ): TypedAggregate[T, Out] = {
    val zeroExpr = Literal.create(summable.zero, TypedEncoder[A].catalystRepr)
    val sumExpr = expr(sparkFunctions.sumDistinct(column.untyped))
    val sumOrZero = Coalesce(Seq(sumExpr, zeroExpr))

    new TypedAggregate[T, Out](sumOrZero)
  }

  /** Aggregate function: returns the average of the values in a group.
    *
    * apache/spark
    */
  def avg[A, T, Out](column: TypedColumn[T, A])(
    implicit
    averageable: CatalystAverageable[A, Out],
    oencoder: TypedEncoder[Out]
  ): TypedAggregate[T, Out] = {
    new TypedAggregate[T, Out](sparkFunctions.avg(column.untyped))
  }

  /** Aggregate function: returns the unbiased variance of the values in a group.
    *
    * @note In Spark variance always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#186]]
    *
    * apache/spark
    */
  def variance[A: CatalystVariance, T](column: TypedColumn[T, A]): TypedAggregate[T, Double] =
    sparkFunctions.variance(column.untyped).typedAggregate

  /** Aggregate function: returns the sample standard deviation.
    *
    * @note In Spark stddev always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#155]]
    *
    * apache/spark
    */
  def stddev[A: CatalystVariance, T](column: TypedColumn[T, A]): TypedAggregate[T, Double] =
    sparkFunctions.stddev(column.untyped).typedAggregate

  /**
    * Aggregate function: returns the standard deviation of a column by population.
    *
    * @note In Spark stddev always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#L143]]
    *
    *       apache/spark
    */
  def stddevPop[A, T](column: TypedColumn[T, A])(implicit ev: CatalystCast[A, Double]): TypedAggregate[T, Option[Double]] = {
    new TypedAggregate[T, Option[Double]](
      sparkFunctions.stddev_pop(column.cast[Double].untyped)
    )
  }

  /**
    * Aggregate function: returns the standard deviation of a column by sample.
    *
    * @note In Spark stddev always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#L160]]
    *
    *       apache/spark
    */
  def stddevSamp[A, T](column: TypedColumn[T, A])(implicit ev: CatalystCast[A, Double] ): TypedAggregate[T, Option[Double]] = {
    new TypedAggregate[T, Option[Double]](
      sparkFunctions.stddev_samp(column.cast[Double].untyped)
    )
  }

  /** Aggregate function: returns the maximum value of the column in a group.
    *
    * apache/spark
    */
  def max[A: CatalystOrdered, T](column: TypedColumn[T, A]): TypedAggregate[T, A] = {
    implicit val c = column.uencoder
    sparkFunctions.max(column.untyped).typedAggregate
  }

  /** Aggregate function: returns the minimum value of the column in a group.
    *
    * apache/spark
    */
  def min[A: CatalystOrdered, T](column: TypedColumn[T, A]): TypedAggregate[T, A] = {
    implicit val c = column.uencoder
    sparkFunctions.min(column.untyped).typedAggregate
  }

  /** Aggregate function: returns the first value in a group.
    *
    * The function by default returns the first values it sees. It will return the first non-null
    * value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
    *
    * apache/spark
    */
  def first[A, T](column: TypedColumn[T, A]): TypedAggregate[T, A] = {
    sparkFunctions.first(column.untyped).typedAggregate(column.uencoder)
  }

  /**
    * Aggregate function: returns the last value in a group.
    *
    * The function by default returns the last values it sees. It will return the last non-null
    * value it sees when ignoreNulls is set to true. If all values are null, then null is returned.
    *
    * apache/spark
    */
  def last[A, T](column: TypedColumn[T, A]): TypedAggregate[T, A] = {
    implicit val c = column.uencoder
    sparkFunctions.last(column.untyped).typedAggregate
  }

  /**
    * Aggregate function: returns the Pearson Correlation Coefficient for two columns.
    *
    * @note In Spark corr always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Corr.scala#L95]]
    *
    *       apache/spark
    */
  def corr[A, B, T](column1: TypedColumn[T, A], column2: TypedColumn[T, B])
    (implicit
      i0: CatalystCast[A, Double],
      i1: CatalystCast[B, Double]
    ): TypedAggregate[T, Option[Double]] = {
      new TypedAggregate[T, Option[Double]](
        sparkFunctions.corr(column1.cast[Double].untyped, column2.cast[Double].untyped)
      )
    }

  /**
    * Aggregate function: returns the covariance of two collumns.
    *
    * @note In Spark covar_pop always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Covariance.scala#L82]]
    *
    *       apache/spark
    */
  def covarPop[A, B, T](column1: TypedColumn[T, A], column2: TypedColumn[T, B])
    (implicit
      i0: CatalystCast[A, Double],
      i1: CatalystCast[B, Double]
    ): TypedAggregate[T, Option[Double]] = {
      new TypedAggregate[T, Option[Double]](
        sparkFunctions.covar_pop(column1.cast[Double].untyped, column2.cast[Double].untyped)
      )
    }

  /**
    * Aggregate function: returns the covariance of two columns.
    *
    * @note In Spark covar_samp always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Covariance.scala#L93]]
    *
    *       apache/spark
    */
  def covarSamp[A, B, T](column1: TypedColumn[T, A], column2: TypedColumn[T, B])
    (implicit
      i0: CatalystCast[A, Double],
      i1: CatalystCast[B, Double]
    ): TypedAggregate[T, Option[Double]] = {
      new TypedAggregate[T, Option[Double]](
        sparkFunctions.covar_samp(column1.cast[Double].untyped, column2.cast[Double].untyped)
      )
    }


  /**
    * Aggregate function: returns the kurtosis of a column.
    *
    * @note In Spark kurtosis always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#L220]]
    *
    *       apache/spark
    */
  def kurtosis[A, T](column: TypedColumn[T, A])(implicit ev: CatalystCast[A, Double]): TypedAggregate[T, Option[Double]] = {
    new TypedAggregate[T, Option[Double]](
      sparkFunctions.kurtosis(column.cast[Double].untyped)
    )
  }

  /**
    * Aggregate function: returns the skewness of a column.
    *
    * @note In Spark skewness always returns Double
    *       [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala#L200]]
    *
    *       apache/spark
    */
  def skewness[A, T](column: TypedColumn[T, A])(implicit ev: CatalystCast[A, Double]): TypedAggregate[T, Option[Double]] = {
    new TypedAggregate[T, Option[Double]](
      sparkFunctions.skewness(column.cast[Double].untyped)
    )
  }
}


================================================
FILE: dataset/src/main/scala/frameless/functions/Lit.scala
================================================
package frameless.functions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.catalyst.expressions.{Expression, NonSQLExpression}
import org.apache.spark.sql.types.DataType

private[frameless] case class Lit[T <: AnyVal](
    dataType: DataType,
    nullable: Boolean,
    show: () => String,
    catalystExpr: Expression // must be a generated Expression from a literal TypedEncoder's toCatalyst function
) extends Expression with NonSQLExpression {
  override def toString: String = s"FramelessLit(${show()})"

  lazy val codegen = {
    val ctx = new CodegenContext()
    val eval = genCode(ctx)

    val codeBody =
      s"""
      public scala.Function1<InternalRow, Object> generate(Object[] references) {
        return new LiteralEvalImpl(references);
      }

      class LiteralEvalImpl extends scala.runtime.AbstractFunction1<InternalRow, Object> {
        private final Object[] references;
        ${ctx.declareMutableStates()}
        ${ctx.declareAddedFunctions()}

        public LiteralEvalImpl(Object[] references) {
          this.references = references;
          ${ctx.initMutableStates()}
        }

        public java.lang.Object apply(java.lang.Object z) {
          InternalRow ${ctx.INPUT_ROW} = (InternalRow) z;
          ${eval.code}
          return ${eval.isNull} ? ((Object)null) : ((Object)${eval.value});
        }
      }
    """

    val code = CodeFormatter.stripOverlappingComments(
      new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())
    )

    val (clazz, _) = CodeGenerator.compile(code)
    val codegen =
      clazz.generate(ctx.references.toArray).asInstanceOf[InternalRow => AnyRef]
    codegen
  }

  def eval(input: InternalRow): Any = codegen(input)
  
  def children: Seq[Expression] = Nil

  protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = catalystExpr.genCode(ctx)

  protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = this

  override val foldable: Boolean = catalystExpr.foldable
}


================================================
FILE: dataset/src/main/scala/frameless/functions/NonAggregateFunctions.scala
================================================
package frameless
package functions

import org.apache.spark.sql.{Column, functions => sparkFunctions}

import scala.annotation.nowarn
import scala.util.matching.Regex

trait NonAggregateFunctions {
  /** Non-Aggregate function: calculates the SHA-2 digest of a binary column and returns the value as a 40 character hex string
    *
    * apache/spark
    */
  def sha2[T](column: AbstractTypedColumn[T, Array[Byte]], numBits: Int): column.ThisType[T, String] =
    column.typed(sparkFunctions.sha2(column.untyped, numBits))

  /** Non-Aggregate function: calculates the SHA-1 digest of a binary column and returns the value as a 40 character hex string
    *
    * apache/spark
    */
  def sha1[T](column: AbstractTypedColumn[T, Array[Byte]]): column.ThisType[T, String] =
    column.typed(sparkFunctions.sha1(column.untyped))

  /** Non-Aggregate function: returns a cyclic redundancy check value of a binary column as long.
    *
    * apache/spark
    */
  def crc32[T](column: AbstractTypedColumn[T, Array[Byte]]): column.ThisType[T, Long] =
    column.typed(sparkFunctions.crc32(column.untyped))
  /**
    * Non-Aggregate function: returns the negated value of column.
    *
    * apache/spark
    */
  def negate[A, B, T](column: AbstractTypedColumn[T,A])(
    implicit i0: CatalystNumericWithJavaBigDecimal[A, B],
    i1: TypedEncoder[B]
  ): column.ThisType[T,B] =
    column.typed(sparkFunctions.negate(column.untyped))

  /**
    * Non-Aggregate function: logical not.
    *
    * apache/spark
    */
  def not[T](column: AbstractTypedColumn[T,Boolean]): column.ThisType[T,Boolean] =
    column.typed(sparkFunctions.not(column.untyped))

  /**
    * Non-Aggregate function: Convert a number in a string column from one base to another.
    *
    * apache/spark
    */
  def conv[T](column: AbstractTypedColumn[T,String], fromBase: Int, toBase: Int): column.ThisType[T,String] =
    column.typed(sparkFunctions.conv(column.untyped,fromBase,toBase))

  /** Non-Aggregate function: Converts an angle measured in radians to an approximately equivalent angle measured in degrees.
    *
    * apache/spark
    */
  def degrees[A,T](column: AbstractTypedColumn[T,A]): column.ThisType[T,Double] =
    column.typed(sparkFunctions.degrees(column.untyped))

  /** Non-Aggregate function: returns the ceiling of a numeric column
    *
    * apache/spark
    */
  def ceil[A, B, T](column: AbstractTypedColumn[T, A])
    (implicit
      i0: CatalystRound[A, B],
      i1: TypedEncoder[B]
    ): column.ThisType[T, B] =
      column.typed(sparkFunctions.ceil(column.untyped))(i1)

  /** Non-Aggregate function: returns the floor of a numeric column
    *
    * apache/spark
    */
  def floor[A, B, T](column: AbstractTypedColumn[T, A])
   (implicit
    i0: CatalystRound[A, B],
    i1: TypedEncoder[B]
   ): column.ThisType[T, B] =
    column.typed(sparkFunctions.floor(column.untyped))(i1)

  /** Non-Aggregate function: unsigned shift the the given value numBits right. If given long, will return long else it will return an integer.
    *
    * apache/spark
    */
  @nowarn // supress sparkFunctions.shiftRightUnsigned call which is used to maintain Spark 3.1.x backwards compat
  def shiftRightUnsigned[A, B, T](column: AbstractTypedColumn[T, A], numBits: Int)
    (implicit
      i0: CatalystBitShift[A, B],
      i1: TypedEncoder[B]
    ): column.ThisType[T, B] =
      column.typed(sparkFunctions.shiftRightUnsigned(column.untyped, numBits))

  /** Non-Aggregate function: shift the the given value numBits right. If given long, will return long else it will return an integer.
    *
    * apache/spark
    */
  @nowarn // supress sparkFunctions.shiftReft call which is used to maintain Spark 3.1.x backwards compat
  def shiftRight[A, B, T](column: AbstractTypedColumn[T, A], numBits: Int)
    (implicit
      i0: CatalystBitShift[A, B],
      i1: TypedEncoder[B]
    ): column.ThisType[T, B] =
      column.typed(sparkFunctions.shiftRight(column.untyped, numBits))

  /** Non-Aggregate function: shift the the given value numBits left. If given long, will return long else it will return an integer.
    *
    * apache/spark
    */
  @nowarn // supress sparkFunctions.shiftLeft call which is used to maintain Spark 3.1.x backwards compat
  def shiftLeft[A, B, T](column: AbstractTypedColumn[T, A], numBits: Int)
    (implicit
      i0: CatalystBitShift[A, B],
      i1: TypedEncoder[B]
    ): column.ThisType[T, B] =
    column.typed(sparkFunctions.shiftLeft(column.untyped, numBits))
  
  /** Non-Aggregate function: returns the absolute value of a numeric column
    *
    * apache/spark
    */
  def abs[A, B, T](column: AbstractTypedColumn[T, A])
    (implicit
     i0: CatalystNumericWithJavaBigDecimal[A, B],
     i1: TypedEncoder[B]
    ): column.ThisType[T, B] =
      column.typed(sparkFunctions.abs(column.untyped))(i1)

  /** Non-Aggregate function: Computes the cosine of the given value.
    *
    * Spark will expect a Double value for this expression. See:
    *   [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def cos[A, T](column: AbstractTypedColumn[T, A])
    (implicit i0: CatalystCast[A, Double]): column.ThisType[T, Double] =
      column.typed(sparkFunctions.cos(column.cast[Double].untyped))

  /** Non-Aggregate function: Computes the hyperbolic cosine of the given value.
    *
    * Spark will expect a Double value for this expression. See:
    *   [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def cosh[A, T](column: AbstractTypedColumn[T, A])
    (implicit i0: CatalystCast[A, Double]): column.ThisType[T, Double] =
      column.typed(sparkFunctions.cosh(column.cast[Double].untyped))

  /** Non-Aggregate function: Computes the signum of the given value.
    *
    * Spark will expect a Double value for this expression. See:
    *   [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def signum[A, T](column: AbstractTypedColumn[T, A])
                  (implicit i0: CatalystCast[A, Double]): column.ThisType[T, Double] =
    column.typed(sparkFunctions.signum(column.cast[Double].untyped))

  /** Non-Aggregate function: Computes the sine of the given value.
    *
    * Spark will expect a Double value for this expression. See:
    *   [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def sin[A, T](column: AbstractTypedColumn[T, A])
    (implicit i0: CatalystCast[A, Double]): column.ThisType[T, Double] =
      column.typed(sparkFunctions.sin(column.cast[Double].untyped))

  /** Non-Aggregate function: Computes the hyperbolic sine of the given value.
    *
    * Spark will expect a Double value for this expression. See:
    *   [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def sinh[A, T](column: AbstractTypedColumn[T, A])
    (implicit i0: CatalystCast[A, Double]): column.ThisType[T, Double] =
      column.typed(sparkFunctions.sinh(column.cast[Double].untyped))

  /** Non-Aggregate function: Computes the tangent of the given column.
    *
    * Spark will expect a Double value for this expression. See:
    *   [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def tan[A, T](column: AbstractTypedColumn[T, A])
    (implicit i0: CatalystCast[A, Double]): column.ThisType[T, Double] =
      column.typed(sparkFunctions.tan(column.cast[Double].untyped))

  /** Non-Aggregate function: Computes the hyperbolic tangent of the given value.
    *
    * Spark will expect a Double value for this expression. See:
    *   [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def tanh[A, T](column: AbstractTypedColumn[T, A])
    (implicit i0: CatalystCast[A, Double]): column.ThisType[T, Double] =
      column.typed(sparkFunctions.tanh(column.cast[Double].untyped))

  /** Non-Aggregate function: returns the acos of a numeric column
    *
    * Spark will expect a Double value for this expression. See:
    *   [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def acos[A, T](column: AbstractTypedColumn[T, A])
    (implicit i0: CatalystCast[A, Double]): column.ThisType[T, Double] =
      column.typed(sparkFunctions.acos(column.cast[Double].untyped))

  /** Non-Aggregate function: returns true if value is contained with in the array in the specified column
    *
    * apache/spark
    */
  def arrayContains[C[_]: CatalystCollection, A, T](column: AbstractTypedColumn[T, C[A]], value: A): column.ThisType[T, Boolean] =
    column.typed(sparkFunctions.array_contains(column.untyped, value))

  /** Non-Aggregate function: returns the atan of a numeric column
    *
    * Spark will expect a Double value for this expression. See:
    *   [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def atan[A, T](column: AbstractTypedColumn[T,A])
    (implicit i0: CatalystCast[A, Double]): column.ThisType[T, Double] =
      column.typed(sparkFunctions.atan(column.cast[Double].untyped))

  /** Non-Aggregate function: returns the asin of a numeric column
    *
    * Spark will expect a Double value for this expression. See:
    *   [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def asin[A, T](column: AbstractTypedColumn[T, A])
    (implicit i0: CatalystCast[A, Double]): column.ThisType[T, Double] =
      column.typed(sparkFunctions.asin(column.cast[Double].untyped))

  /** Non-Aggregate function: returns the angle theta from the conversion of rectangular coordinates (x, y) to
    * polar coordinates (r, theta).
    *
    * Spark will expect a Double value for this expression. See:
    *   [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def atan2[A, B, T](l: TypedColumn[T, A], r: TypedColumn[T, B])
    (implicit
      i0: CatalystCast[A, Double],
      i1: CatalystCast[B, Double]
    ): TypedColumn[T, Double] =
      r.typed(sparkFunctions.atan2(l.cast[Double].untyped, r.cast[Double].untyped))

  /** Non-Aggregate function: returns the angle theta from the conversion of rectangular coordinates (x, y) to
    * polar coordinates (r, theta).
    *
    * Spark will expect a Double value for this expression. See:
    *   [[https://github.com/apache/spark/blob/4a3c09601ba69f7d49d1946bb6f20f5cfe453031/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala#L67]]
    * apache/spark
    */
  def atan2[A, B, T](l: TypedAggregate[T, A], r: TypedAggregate[T, B])
    (implicit
      i0: CatalystCast[A, Double],
      i1: CatalystCast[B, Double]
    ): TypedAggregate[T, Double] =
      r.typed(sparkFunctions.atan2(l.cast[Double].untyped, r.cast[Double].untyped))

  def atan2[B, T](l: Double, r: TypedColumn[T, B])
    (implicit i0: CatalystCast[B, Double]): TypedColumn[T, Double] =
      atan2(r.lit(l), r)

  def atan2[A, T](l: TypedColumn[T, A], r: Double)
    (implicit i0: CatalystCast[A, Double]): TypedColumn[T, Double] =
      atan2(l, l.lit(r))

  def atan2[B, T](l: Double, r: TypedAggregate[T, B])
    (implicit i0: CatalystCast[B, Double]): TypedAggregate[T, Double] =
      atan2(r.lit(l), r)

  def atan2[A, T](l: TypedAggregate[T, A], r: Double)
    (implicit i0: CatalystCast[A, Double]): TypedAggregate[T, Double] =
      atan2(l, l.lit(r))

  /** Non-Aggregate function: returns the square root value of a numeric column.
    *
    * apache/spark
    */
  def sqrt[A, T](column: AbstractTypedColumn[T, A])
                (implicit i0: CatalystCast[A, Double]): column.ThisType[T, Double] =
    column.typed(sparkFunctions.sqrt(column.cast[Double].untyped))

  /** Non-Aggregate function: returns the cubic root value of a numeric column.
    *
    * apache/spark
    */
  def cbrt[A, T](column: AbstractTypedColumn[T, A])
                (implicit i0: CatalystCast[A, Double]): column.ThisType[T, Double] =
    column.typed(sparkFunctions.cbrt(column.cast[Double].untyped))

  /** Non-Aggregate function: returns the exponential value of a numeric column.
    *
    * apache/spark
    */
  def exp[A, T](column: AbstractTypedColumn[T, A])
               (implicit i0: CatalystCast[A, Double]): column.ThisType[T, Double] =
    column.typed(sparkFunctions.exp(column.cast[Double].untyped))

  /** Non-Aggregate function: Returns the value of the column `e` rounded to 0 decimal places with HALF_UP round mode.
    *
    * apache/spark
    */
  def round[A, B, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystNumericWithJavaBigDecimal[A, B], i1: TypedEncoder[B]
  ): column.ThisType[T, B] =
    column.typed(sparkFunctions.round(column.untyped))(i1)

  /** Non-Aggregate function: Round the value of `e` to `scale` decimal places with HALF_UP round mode
    * if `scale` is greater than or equal to 0 or at integral part when `scale` is less than 0.
    *
    * apache/spark
    */
  def round[A, B, T](column: AbstractTypedColumn[T, A], scale: Int)(
    implicit i0: CatalystNumericWithJavaBigDecimal[A, B], i1: TypedEncoder[B]
  ): column.ThisType[T, B] =
    column.typed(sparkFunctions.round(column.untyped, scale))(i1)

  /** Non-Aggregate function: Bankers Rounding - returns the rounded to 0 decimal places value with HALF_EVEN round mode
    *  of a numeric column.
    *
    * apache/spark
    */
  def bround[A, B, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystNumericWithJavaBigDecimal[A, B], i1: TypedEncoder[B]
  ): column.ThisType[T, B] =
    column.typed(sparkFunctions.bround(column.untyped))(i1)

  /** Non-Aggregate function: Bankers Rounding - returns the rounded to `scale` decimal places value with HALF_EVEN round mode
    *  of a numeric column. If `scale` is greater than or equal to 0 or at integral part when `scale` is less than 0.
    *
    * apache/spark
    */
  def bround[A, B, T](column: AbstractTypedColumn[T, A], scale: Int)(
    implicit i0: CatalystNumericWithJavaBigDecimal[A, B], i1: TypedEncoder[B]
  ): column.ThisType[T, B] =
    column.typed(sparkFunctions.bround(column.untyped, scale))(i1)

  /**
    * Computes the natural logarithm of the given value.
    *
    * apache/spark
    */
  def log[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.log(column.untyped))

  /**
    * Returns the first argument-base logarithm of the second argument.
    *
    * apache/spark
    */
  def log[A, T](base: Double, column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.log(base, column.untyped))

  /**
    * Computes the logarithm of the given column in base 2.
    *
    * apache/spark
    */
  def log2[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.log2(column.untyped))

  /**
    * Computes the natural logarithm of the given value plus one.
    *
    * apache/spark
    */
  def log1p[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.log1p(column.untyped))

  /**
    * Computes the logarithm of the given column in base 10.
    *
    * apache/spark
    */
  def log10[A, T](column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.log10(column.untyped))


  /**
    * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.
    *
    * apache/spark
    */
  def hypot[A, T](column: AbstractTypedColumn[T, A], column2: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.hypot(column.untyped, column2.untyped))

  /**
    * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.
    *
    * apache/spark
    */
  def hypot[A, T](column: AbstractTypedColumn[T, A], l: Double)(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.hypot(column.untyped, l))

  /**
    * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow.
    *
    * apache/spark
    */
  def hypot[A, T](l: Double, column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.hypot(l, column.untyped))

  /**
    * Returns the value of the first argument raised to the power of the second argument.
    *
    * apache/spark
    */
  def pow[A, T](column: AbstractTypedColumn[T, A], column2: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.pow(column.untyped, column2.untyped))

  /**
    * Returns the value of the first argument raised to the power of the second argument.
    *
    * apache/spark
    */
  def pow[A, T](column: AbstractTypedColumn[T, A], l: Double)(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.pow(column.untyped, l))

  /**
    * Returns the value of the first argument raised to the power of the second argument.
    *
    * apache/spark
    */
  def pow[A, T](l: Double, column: AbstractTypedColumn[T, A])(
    implicit i0: CatalystCast[A, Double]
  ): column.ThisType[T, Double] =
    column.typed(sparkFunctions.pow(l, column.untyped))

  /**
    * Returns the positive value of dividend mod divisor.
    *
    * apache/spark
    */
  def pmod[A, T](column: AbstractTypedColumn[T, A], column2: AbstractTypedColumn[T, A])(
    implicit i0: TypedEncoder[A]
  ): column.ThisType[T, A] =
    column.typed(sparkFunctions.pmod(column.untyped, column2.untyped))


  /** Non-Aggregate function: Returns the string representation of the binary value of the given long
    * column. For example, bin("12") returns "1100".
    *
    * apache/spark
    */
  def bin[T](column: AbstractTypedColumn[T, Long]): column.ThisType[T, String] =
    column.typed(sparkFunctions.bin(column.untyped))

  /**
    * Calculates the MD5 digest of a binary column and returns the value
    * as a 32 character hex string.
    *
    * apache/spark
    */
  def md5[T, A](column: AbstractTypedColumn[T, A])(implicit  i0: TypedEncoder[A]): column.ThisType[T, String] =
    column.typed(sparkFunctions.md5(column.untyped))

  /**
    * Computes the factorial of the given value.
    *
    * apache/spark
    */
  def factorial[T](column: AbstractTypedColumn[T, Long])(implicit  i0: TypedEncoder[Long]): column.ThisType[T, Long] =
    column.typed(sparkFunctions.factorial(column.untyped))

  /** Non-Aggregate function: Computes bitwise NOT.
    *
    * apache/spark
    */
  @nowarn // supress sparkFunctions.bitwiseNOT call which is used to maintain Spark 3.1.x backwards compat
  def bitwiseNOT[A: CatalystBitwise, T](column: AbstractTypedColumn[T, A]): column.ThisType[T, A] =
    column.typed(sparkFunctions.bitwiseNOT(column.untyped))(column.uencoder)

  /** Non-Aggregate function: file name of the current Spark task. Empty string if row did not originate from
    * a file
    *
    * apache/spark
    */
  def inputFileName[T](): TypedColumn[T, String] =
    new TypedColumn[T, String](sparkFunctions.input_file_name())

  /** Non-Aggregate function: generates monotonically increasing id
    *
    * apache/spark
    */
  def monotonicallyIncreasingId[T](): TypedColumn[T, Long] = {
    new TypedColumn[T, Long](sparkFunctions.monotonically_increasing_id())
  }

  /** Non-Aggregate function: Evaluates a list of conditions and returns one of multiple
    * possible result expressions. If none match, otherwise is returned
    * {{{
    *   when(ds('boolField), ds('a))
    *     .when(ds('otherBoolField), lit(123))
    *     .otherwise(ds('b))
    * }}}
    * apache/spark
    */
  def when[T, A](condition: AbstractTypedColumn[T, Boolean], value: AbstractTypedColumn[T, A]): When[T, A] =
    new When[T, A](condition, value)

  class When[T, A] private (untypedC: Column) {
    private[functions] def this(condition: AbstractTypedColumn[T, Boolean], value: AbstractTypedColumn[T, A]) =
      this(sparkFunctions.when(condition.untyped, value.untyped))

    def when(condition: AbstractTypedColumn[T, Boolean], value: AbstractTypedColumn[T, A]): When[T, A] =
      new When[T, A](untypedC.when(condition.untyped, value.untyped))

    def otherwise(value: AbstractTypedColumn[T, A]): value.ThisType[T, A] =
      value.typed(untypedC.otherwise(value.untyped))(value.uencoder)
  }

  //////////////////////////////////////////////////////////////////////////////////////////////
  // String functions
  //////////////////////////////////////////////////////////////////////////////////////////////


  /** Non-Aggregate function: takes the first letter of a string column and returns the ascii int value in a new column
    *
    * apache/spark
    */
  def ascii[T](column: AbstractTypedColumn[T, String]): column.ThisType[T, Int] =
    column.typed(sparkFunctions.ascii(column.untyped))

  /** Non-Aggregate function: Computes the BASE64 encoding of a binary column and returns it as a string column.
    * This is the reverse of unbase64.
    *
    * apache/spark
    */
  def base64[T](column: AbstractTypedColumn[T, Array[Byte]]): column.ThisType[T, String] =
    column.typed(sparkFunctions.base64(column.untyped))

  /** Non-Aggregate function: Decodes a BASE64 encoded string column and returns it as a binary column.
    * This is the reverse of base64.
    *
    * apache/spark
    */
  def unbase64[T](column: AbstractTypedColumn[T, String]): column.ThisType[T, Array[Byte]] =
    column.typed(sparkFunctions.unbase64(column.untyped))

  /** Non-Aggregate function: Concatenates multiple input string columns together into a single string column.
    * @note varargs make it harder to generalize so we overload the method for [[TypedColumn]] and [[TypedAggregate]]
    *
    * apache/spark
    */
  def concat[T](columns: TypedColumn[T, String]*): TypedColumn[T, String] =
    new TypedColumn(sparkFunctions.concat(columns.map(_.untyped): _*))

  /** Non-Aggregate function: Concatenates multiple input string columns together into a single string column.
    * @note varargs make it harder to generalize so we overload the method for [[TypedColumn]] and [[TypedAggregate]]
    *
    * apache/spark
    */
  def concat[T](columns: TypedAggregate[T, String]*): TypedAggregate[T, String] =
    new TypedAggregate(sparkFunctions.concat(columns.map(_.untyped): _*))

  /** Non-Aggregate function: Concatenates multiple input string columns together into a single string column,
    * using the given separator.
    * @note varargs make it harder to generalize so we overload the method for [[TypedColumn]] and [[TypedAggregate]]
    *
    * apache/spark
    */
  def concatWs[T](sep: String, columns: TypedAggregate[T, String]*): TypedAggregate[T, String] =
    new TypedAggregate(sparkFunctions.concat_ws(sep, columns.map(_.untyped): _*))

  /** Non-Aggregate function: Concatenates multiple input string columns together into a single string column,
    * using the given separator.
    * @note varargs make it harder to generalize so we overload the method for [[TypedColumn]] and [[TypedAggregate]]
    *
    * apache/spark
    */
  def concatWs[T](sep: String, columns: TypedColumn[T, String]*): TypedColumn[T, String] =
    new TypedColumn(sparkFunctions.concat_ws(sep, columns.map(_.untyped): _*))

  /** Non-Aggregate function: Locates the position of the first occurrence of substring column
    * in given string
    *
    * @note The position is not zero based, but 1 based index. Returns 0 if substr
    * could not be found in str.
    *
    * apache/spark
    */
  def instr[T](str: AbstractTypedColumn[T, String], substring: String): str.ThisType[T, Int] =
    str.typed(sparkFunctions.instr(str.untyped, substring))

  /** Non-Aggregate function: Computes the length of a given string.
    *
    * apache/spark
    */
  //TODO: Also for binary
  def length[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Int] =
    str.typed(sparkFunctions.length(str.untyped))

  /** Non-Aggregate function: Computes the Levenshtein distance of the two given string columns.
    *
    * apache/spark
    */
  def levenshtein[T](l: TypedColumn[T, String], r: TypedColumn[T, String]): TypedColumn[T, Int] =
    l.typed(sparkFunctions.levenshtein(l.untyped, r.untyped))

  /** Non-Aggregate function: Computes the Levenshtein distance of the two given string columns.
    *
    * apache/spark
    */
  def levenshtein[T](l: TypedAggregate[T, String], r: TypedAggregate[T, String]): TypedAggregate[T, Int] =
    l.typed(sparkFunctions.levenshtein(l.untyped, r.untyped))

  /** Non-Aggregate function: Converts a string column to lower case.
    *
    * apache/spark
    */
  def lower[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, String] =
    str.typed(sparkFunctions.lower(str.untyped))

  /** Non-Aggregate function: Left-pad the string column with pad to a length of len. If the string column is longer
    * than len, the return value is shortened to len characters.
    *
    * apache/spark
    */
  def lpad[T](str: AbstractTypedColumn[T, String],
              len: Int,
              pad: String): str.ThisType[T, String] =
    str.typed(sparkFunctions.lpad(str.untyped, len, pad))

  /** Non-Aggregate function: Trim the spaces from left end for the specified string value.
    *
    * apache/spark
    */
  def ltrim[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, String] =
    str.typed(sparkFunctions.ltrim(str.untyped))

  /** Non-Aggregate function: Replace all substrings of the specified string value that match regexp with rep.
    *
    * apache/spark
    */
  def regexpReplace[T](str: AbstractTypedColumn[T, String],
                       pattern: Regex,
                       replacement: String): str.ThisType[T, String] =
    str.typed(sparkFunctions.regexp_replace(str.untyped, pattern.regex, replacement))


  /** Non-Aggregate function: Reverses the string column and returns it as a new string column.
    *
    * apache/spark
    */
  def reverse[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, String] =
    str.typed(sparkFunctions.reverse(str.untyped))

  /** Non-Aggregate function: Right-pad the string column with pad to a length of len.
    * If the string column is longer than len, the return value is shortened to len characters.
    *
    * apache/spark
    */
  def rpad[T](str: AbstractTypedColumn[T, String], len: Int, pad: String): str.ThisType[T, String] =
    str.typed(sparkFunctions.rpad(str.untyped, len, pad))

  /** Non-Aggregate function: Trim the spaces from right end for the specified string value.
    *
    * apache/spark
    */
  def rtrim[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, String] =
    str.typed(sparkFunctions.rtrim(str.untyped))

  /** Non-Aggregate function: Substring starts at `pos` and is of length `len`
    *
    * apache/spark
    */
  //TODO: Also for byte array
  def substring[T](str: AbstractTypedColumn[T, String], pos: Int, len: Int): str.ThisType[T, String] =
    str.typed(sparkFunctions.substring(str.untyped, pos, len))

  /** Non-Aggregate function: Trim the spaces from both ends for the specified string column.
    *
    * apache/spark
    */
  def trim[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, String] =
    str.typed(sparkFunctions.trim(str.untyped))

  /** Non-Aggregate function: Converts a string column to upper case.
    *
    * apache/spark
    */
  def upper[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, String] =
    str.typed(sparkFunctions.upper(str.untyped))

  //////////////////////////////////////////////////////////////////////////////////////////////
  // DateTime functions
  //////////////////////////////////////////////////////////////////////////////////////////////

  /** Non-Aggregate function: Extracts the year as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#year` by wrapping it's result into an `Option`.
    *
    * apache/spark
    */
  def year[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] =
    str.typed(sparkFunctions.year(str.untyped))

  /** Non-Aggregate function: Extracts the quarter as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#quarter` by wrapping it's result into an `Option`.
    *
    * apache/spark
    */
  def quarter[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] =
    str.typed(sparkFunctions.quarter(str.untyped))

  /** Non-Aggregate function Extracts the month as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#month` by wrapping it's result into an `Option`.
    *
    * apache/spark
    */
  def month[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] =
    str.typed(sparkFunctions.month(str.untyped))

  /** Non-Aggregate function: Extracts the day of the week as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#dayofweek` by wrapping it's result into an `Option`.
    *
    * apache/spark
    */
  def dayofweek[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] =
    str.typed(sparkFunctions.dayofweek(str.untyped))

  /** Non-Aggregate function: Extracts the day of the month as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#dayofmonth` by wrapping it's result into an `Option`.
    *
    * apache/spark
    */
  def dayofmonth[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] =
    str.typed(sparkFunctions.dayofmonth(str.untyped))

  /** Non-Aggregate function: Extracts the day of the year as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#dayofyear` by wrapping it's result into an `Option`.
    *
    * apache/spark
    */
  def dayofyear[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] =
    str.typed(sparkFunctions.dayofyear(str.untyped))

  /** Non-Aggregate function: Extracts the hours as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#hour` by wrapping it's result into an `Option`.
    *
    * apache/spark
    */
  def hour[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] =
    str.typed(sparkFunctions.hour(str.untyped))

  /** Non-Aggregate function: Extracts the minutes as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#minute` by wrapping it's result into an `Option`.
    *
    * apache/spark
    */
  def minute[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] =
    str.typed(sparkFunctions.minute(str.untyped))

  /** Non-Aggregate function: Extracts the seconds as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#second` by wrapping it's result into an `Option`.
    *
    * apache/spark
    */
  def second[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] =
    str.typed(sparkFunctions.second(str.untyped))

  /** Non-Aggregate function: Extracts the week number as an integer from a given date/timestamp/string.
    *
    * Differs from `Column#weekofyear` by wrapping it's result into an `Option`.
    *
    * apache/spark
    */
  def weekofyear[T](str: AbstractTypedColumn[T, String]): str.ThisType[T, Option[Int]] =
    str.typed(sparkFunctions.weekofyear(str.untyped))
}


================================================
FILE: dataset/src/main/scala/frameless/functions/Udf.scala
================================================
package frameless
package functions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression, NonSQLExpression}
import org.apache.spark.sql.catalyst.expressions.codegen._
import Block._
import org.apache.spark.sql.types.DataType
import shapeless.syntax.std.tuple._

/** Documentation marked "apache/spark" is thanks to apache/spark Contributors
  * at https://github.com/apache/spark, licensed under Apache v2.0 available at
  * http://www.apache.org/licenses/LICENSE-2.0
  */
trait Udf {

  /** Defines a user-defined function of 1 arguments as user-defined function (UDF).
    * The data types are automatically inferred based on the function's signature.
    *
    * apache/spark
    */
  def udf[T, A, R: TypedEncoder](f: A => R):
    TypedColumn[T, A] => TypedColumn[T, R] = {
    u =>
      val scalaUdf = FramelessUdf(f, List(u), TypedEncoder[R])
      new TypedColumn[T, R](scalaUdf)
  }

  /** Defines a user-defined function of 2 arguments as user-defined function (UDF).
    * The data types are automatically inferred based on the function's signature.
    *
    * apache/spark
    */
  def udf[T, A1, A2, R: TypedEncoder](f: (A1,A2) => R):
    (TypedColumn[T, A1], TypedColumn[T, A2]) => TypedColumn[T, R] = {
    case us =>
      val scalaUdf = FramelessUdf(f, us.toList[UntypedExpression[T]], TypedEncoder[R])
      new TypedColumn[T, R](scalaUdf)
    }

  /** Defines a user-defined function of 3 arguments as user-defined function (UDF).
    * The data types are automatically inferred based on the function's signature.
    *
    * apache/spark
    */
  def udf[T, A1, A2, A3, R: TypedEncoder](f: (A1,A2,A3) => R):
  (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3]) => TypedColumn[T, R] = {
    case us =>
      val scalaUdf = FramelessUdf(f, us.toList[UntypedExpression[T]], TypedEncoder[R])
      new TypedColumn[T, R](scalaUdf)
    }

  /** Defines a user-defined function of 4 arguments as user-defined function (UDF).
    * The data types are automatically inferred based on the function's signature.
    *
    * apache/spark
    */
  def udf[T, A1, A2, A3, A4, R: TypedEncoder](f: (A1,A2,A3,A4) => R):
    (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4]) => TypedColumn[T, R] = {
    case us =>
      val scalaUdf = FramelessUdf(f, us.toList[UntypedExpression[T]], TypedEncoder[R])
      new TypedColumn[T, R](scalaUdf)
    }

  /** Defines a user-defined function of 5 arguments as user-defined function (UDF).
    * The data types are automatically inferred based on the function's signature.
    *
    * apache/spark
    */
  def udf[T, A1, A2, A3, A4, A5, R: TypedEncoder](f: (A1,A2,A3,A4,A5) => R):
    (TypedColumn[T, A1], TypedColumn[T, A2], TypedColumn[T, A3], TypedColumn[T, A4], TypedColumn[T, A5]) => TypedColumn[T, R] = {
    case us =>
      val scalaUdf = FramelessUdf(f, us.toList[UntypedExpression[T]], TypedEncoder[R])
      new TypedColumn[T, R](scalaUdf)
    }
}

/**
  * NB: Implementation detail, isn't intended to be directly used.
  *
  * Our own implementation of `ScalaUDF` from Catalyst compatible with [[TypedEncoder]].
  */
case class FramelessUdf[T, R](
  function: AnyRef,
  encoders: Seq[TypedEncoder[_]],
  children: Seq[Expression],
  rencoder: TypedEncoder[R]
) extends Expression with NonSQLExpression {

  override def nullable: Boolean = rencoder.nullable
  override def toString: String = s"FramelessUdf(${children.mkString(", ")})"

  lazy val evalCode = {
    val ctx = new CodegenContext()
    val eval = genCode(ctx)

    val codeBody = s"""
      public scala.Function1<InternalRow, Object> generate(Object[] references) {
        return new FramelessUdfEvalImpl(references);
      }

      class FramelessUdfEvalImpl extends scala.runtime.AbstractFunction1<InternalRow, Object> {
        private final Object[] references;
        ${ctx.declareMutableStates()}
        ${ctx.declareAddedFunctions()}

        public FramelessUdfEvalImpl(Object[] references) {
          this.references = references;
          ${ctx.initMutableStates()}
        }

        public java.lang.Object apply(java.lang.Object z) {
          InternalRow ${ctx.INPUT_ROW} = (InternalRow) z;
          ${eval.code}
          return ${eval.isNull} ? ((Object)null) : ((Object)${eval.value});
        }
      }
    """

    val code = CodeFormatter.stripOverlappingComments(
      new CodeAndComment(codeBody, ctx.getPlaceHolderToComments()))

    val (clazz, _) = CodeGenerator.compile(code)
    val codegen = clazz.generate(ctx.references.toArray).asInstanceOf[InternalRow => AnyRef]

    codegen
  }

  def eval(input: InternalRow): Any = {
    evalCode(input)
  }

  def dataType: DataType = rencoder.catalystRepr

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    ctx.references += this

    // save reference to `function` field from `FramelessUdf` to call it later
    val framelessUdfClassName = classOf[FramelessUdf[_, _]].getName
    val funcClassName = s"scala.Function${children.size}"
    val funcExpressionIdx = ctx.references.size - 1
    val funcTerm = ctx.addMutableState(funcClassName, ctx.freshName("udf"),
      v => s"$v = ($funcClassName)((($framelessUdfClassName)references" +
        s"[$funcExpressionIdx]).function());")

    val (argsCode, funcArguments) = encoders.zip(children).map {
      case (encoder, child) =>
        val eval = child.genCode(ctx)
        val codeTpe = CodeGenerator.boxedType(encoder.jvmRepr)
        val argTerm = ctx.freshName("arg")
        val convert = s"${eval.code}\n$codeTpe $argTerm = ${eval.isNull} ? (($codeTpe)null) : (($codeTpe)(${eval.value}));"

        (convert, argTerm)
    }.unzip

    val internalTpe = CodeGenerator.boxedType(rencoder.jvmRepr)
    val internalTerm = ctx.addMutableState(internalTpe, ctx.freshName("internal"))
    val internalNullTerm = ctx.addMutableState("boolean", ctx.freshName("internalNull"))
    // CTw - can't inject the term, may have to duplicate old code for parity
    val internalExpr = Spark2_4_LambdaVariable(internalTerm, internalNullTerm, rencoder.jvmRepr, true)

    val resultEval = rencoder.toCatalyst(internalExpr).genCode(ctx)

    ev.copy(code = code"""
      ${argsCode.mkString("\n")}

      $internalTerm =
        ($internalTpe)$funcTerm.apply(${funcArguments.mkString(", ")});
      $internalNullTerm = $internalTerm == null;

      ${resultEval.code}
      """,
      value = resultEval.value,
      isNull = resultEval.isNull
    )
  }

  protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = copy(children = newChildren)
}

case class Spark2_4_LambdaVariable(
                           value: String,
                           isNull: String,
                           dataType: DataType,
                           nullable: Boolean = true) extends LeafExpression with NonSQLExpression {

  private val accessor: (InternalRow, Int) => Any = InternalRow.getAccessor(dataType)

  // Interpreted execution of `LambdaVariable` always get the 0-index element from input row.
  override def eval(input: InternalRow): Any = {
    assert(input.numFields == 1,
      "The input row of interpreted LambdaVariable should have only 1 field.")
    if (nullable && input.isNullAt(0)) {
      null
    } else {
      accessor(input, 0)
    }
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val isNullValue = if (nullable) {
      JavaCode.isNullVariable(isNull)
    } else {
      FalseLiteral
    }
    ExprCode(value = JavaCode.variable(value, dataType), isNull = isNullValue)
  }
}

object FramelessUdf {
  // Spark needs case class with `children` field to mutate it
  def apply[T, R](
    function: AnyRef,
    cols: Seq[UntypedExpression[T]],
    rencoder: TypedEncoder[R]
  ): FramelessUdf[T, R] = FramelessUdf(
    function = function,
    encoders = cols.map(_.uencoder).toList,
    children = cols.map(x => x.uencoder.fromCatalyst(x.expr)).toList,
    rencoder = rencoder
  )
}


================================================
FILE: dataset/src/main/scala/frameless/functions/UnaryFunctions.scala
================================================
package frameless
package functions

import org.apache.spark.sql.{Column, functions => sparkFunctions}

import scala.math.Ordering

trait UnaryFunctions {
  /** Returns length of array
    *
    * apache/spark
    */
  def size[T, A, V[_] : CatalystSizableCollection](column: TypedColumn[T, V[A]]): TypedColumn[T, Int] =
    new TypedColumn[T, Int](implicitly[CatalystSizableCollection[V]].sizeOp(column.untyped))

  /** Returns length of Map
    *
    * apache/spark
    */
  def size[T, A, B](column: TypedColumn[T, Map[A, B]]): TypedColumn[T, Int] =
    new TypedColumn[T, Int](sparkFunctions.size(column.untyped))

  /** Sorts the input array for the given column in ascending order, according to
    * the natural ordering of the array elements.
    *
    * apache/spark
    */
  def sortAscending[T, A: Ordering, V[_] : CatalystSortableCollection](column: TypedColumn[T, V[A]]): TypedColumn[T, V[A]] =
    new TypedColumn[T, V[A]](implicitly[CatalystSortableCollection[V]].sortOp(column.untyped, sortAscending = true))(column.uencoder)

  /** Sorts the input array for the given column in descending order, according to
    * the natural ordering of the array elements.
    *
    * apache/spark
    */
  def sortDescending[T, A: Ordering, V[_] : CatalystSortableCollection](column: TypedColumn[T, V[A]]): TypedColumn[T, V[A]] =
    new TypedColumn[T, V[A]](implicitly[CatalystSortableCollection[V]].sortOp(column.untyped, sortAscending = false))(column.uencoder)


  /** Creates a new row for each element in the given collection. The column types
    * eligible for this operation are constrained by CatalystExplodableCollection.
    *
    * apache/spark
    */
  @deprecated("Use explode() from the TypedDataset instead. This method will result in " +
    "runtime error if applied to two columns in the same select statement.", "0.6.2")
  def explode[T, A: TypedEncoder, V[_] : CatalystExplodableCollection](column: TypedColumn[T, V[A]]): TypedColumn[T, A] =
    new TypedColumn[T, A](sparkFunctions.explode(column.untyped))
}

trait CatalystSizableCollection[V[_]] {
  def sizeOp(col: Column): Column
}

object CatalystSizableCollection {
  implicit def sizableVector: CatalystSizableCollection[Vector] = new CatalystSizableCollection[Vector] {
    def sizeOp(col: Column): Column = sparkFunctions.size(col)
  }

  implicit def sizableArray: CatalystSizableCollection[Array] = new CatalystSizableCollection[Array] {
    def sizeOp(col: Column): Column = sparkFunctions.size(col)
  }

  implicit def sizableList: CatalystSizableCollection[List] = new CatalystSizableCollection[List] {
    def sizeOp(col: Column): Column = sparkFunctions.size(col)
  }

}

trait CatalystExplodableCollection[V[_]]

object CatalystExplodableCollection {
  implicit def explodableVector: CatalystExplodableCollection[Vector] = new CatalystExplodableCollection[Vector] {}
  implicit def explodableArray: CatalystExplodableCollection[Array] = new CatalystExplodableCollection[Array] {}
  implicit def explodableList: CatalystExplodableCollection[List] = new CatalystExplodableCollection[List] {}
  implicit def explodableSeq: CatalystExplodableCollection[Seq] = new CatalystExplodableCollection[Seq] {}
}

trait CatalystSortableCollection[V[_]] {
  def sortOp(col: Column, sortAscending: Boolean): Column
}

object CatalystSortableCollection {
  implicit def sortableVector: CatalystSortableCollection[Vector] = new CatalystSortableCollection[Vector] {
    def sortOp(col: Column, sortAscending: Boolean): Column = sparkFunctions.sort_array(col, sortAscending)
  }

  implicit def sortableArray: CatalystSortableCollection[Array] = new CatalystSortableCollection[Array] {
    def sortOp(col: Column, sortAscending: Boolean): Column = sparkFunctions.sort_array(col, sortAscending)
  }

  implicit def sortableList: CatalystSortableCollection[List] = new CatalystSortableCollection[List] {
    def sortOp(col: Column, sortAscending: Boolean): Column = sparkFunctions.sort_array(col, sortAscending)
  }
}


================================================
FILE: dataset/src/main/scala/frameless/functions/package.scala
================================================
package frameless

import scala.reflect.ClassTag

import shapeless._
import shapeless.labelled.FieldType
import shapeless.ops.hlist.IsHCons
import shapeless.ops.record.{ Keys, Values }

import org.apache.spark.sql.{ reflection => ScalaReflection }
import org.apache.spark.sql.catalyst.expressions.Literal

package object functions extends Udf with UnaryFunctions {

  object aggregate extends AggregateFunctions
  object nonAggregate extends NonAggregateFunctions

  /**
   * Creates a [[frameless.TypedAggregate]] of literal value. If A is to be encoded using an Injection make
   * sure the injection instance is in scope.
   *
   * apache/spark
   */
  def litAggr[A, T](
      value: A
    )(implicit
      i0: TypedEncoder[A],
      i1: Refute[IsValueClass[A]]
    ): TypedAggregate[T, A] =
    new TypedAggregate[T, A](lit(value).expr)

  /**
   * Creates a [[frameless.TypedColumn]] of literal value. If A is to be encoded using an Injection make
   * sure the injection instance is in scope.
   *
   * apache/spark
   *
   * @tparam A the literal value type
   * @tparam T the row type
   */
  def lit[A, T](
      value: A
    )(implicit
      encoder: TypedEncoder[A]
    ): TypedColumn[T, A] = {

    if (
      ScalaReflection.isNativeType(
        encoder.jvmRepr
      ) && encoder.catalystRepr == encoder.jvmRepr
    ) {
      val expr = Literal(value, encoder.catalystRepr)

      new TypedColumn(expr)
    } else {
      val expr = new Literal(value, encoder.jvmRepr)

      new TypedColumn[T, A](
        Lit(
          dataType = encoder.catalystRepr,
          nullable = encoder.nullable,
          show = () => value.toString,
          catalystExpr = encoder.toCatalyst(expr)
        )
      )
    }
  }

  /**
   * Creates a [[frameless.TypedColumn]] of literal value
   * for a Value class `A`.
   *
   * @tparam A the value class
   * @tparam T the row type
   */
  def litValue[
      A: IsValueClass,
      T,
      G <: ::[_, HNil],
      H <: ::[_ <: FieldType[_ <: Symbol, _], HNil],
      K <: Symbol,
      V,
      KS <: ::[_ <: Symbol, HNil],
      VS <: HList
    ](value: A
    )(implicit
      i0: LabelledGeneric.Aux[A, G],
      i1: DropUnitValues.Aux[G, H],
      i2: IsHCons.Aux[H, _ <: FieldType[K, V], HNil],
      i3: Keys.Aux[H, KS],
      i4: Values.Aux[H, VS],
      i5: IsHCons.Aux[KS, K, HNil],
      i6: IsHCons.Aux[VS, V, HNil],
      i7: TypedEncoder[V],
      i8: ClassTag[A]
    ): TypedColumn[T, A] = {
    val expr = {
      val field: H = i1(i0.to(value))
      val v: V = i6.head(i4(field))

      new Literal(v, i7.jvmRepr)
    }

    implicit val enc: TypedEncoder[A] =
      RecordFieldEncoder.valueClass[A, G, H, K, V, KS].encoder

    new TypedColumn[T, A](
      Lit(
        dataType = i7.catalystRepr,
        nullable = i7.nullable,
        show = () => value.toString,
        i7.toCatalyst(expr)
      )
    )
  }

  /**
   * Creates a [[frameless.TypedColumn]] of literal value
   * for an optional Value class `A`.
   *
   * @tparam A the value class
   * @tparam T the row type
   */
  def litValue[
      A: IsValueClass,
      T,
      G <: ::[_, HNil],
      H <: ::[_ <: FieldType[_ <: Symbol, _], HNil],
      K <: Symbol,
      V,
      KS <: ::[_ <: Symbol, HNil],
      VS <: HList
    ](value: Option[A]
    )(implicit
      i0: LabelledGeneric.Aux[A, G],
      i1: DropUnitValues.Aux[G, H],
      i2: IsHCons.Aux[H, _ <: FieldType[K, V], HNil],
      i3: Keys.Aux[H, KS],
      i4: Values.Aux[H, VS],
      i5: IsHCons.Aux[KS, K, HNil],
      i6: IsHCons.Aux[VS, V, HNil],
      i7: TypedEncoder[V],
      i8: ClassTag[A]
    ): TypedColumn[T, Option[A]] = {
    val expr = value match {
      case Some(some) => {
        val field: H = i1(i0.to(some))
        val v: V = i6.head(i4(field))

        new Literal(v, i7.jvmRepr)
      }

      case _ =>
        Literal.create(null, i7.jvmRepr)
    }

    implicit val enc: TypedEncoder[A] =
      RecordFieldEncoder.valueClass[A, G, H, K, V, KS].encoder

    new TypedColumn[T, Option[A]](
      Lit(
        dataType = i7.catalystRepr,
        nullable = true,
        show = () => value.toString,
        i7.toCatalyst(expr)
      )
    )
  }
}


================================================
FILE: dataset/src/main/scala/frameless/ops/AggregateTypes.scala
================================================
package frameless
package ops

import shapeless._

/** A type class to extract the column types out of an HList of [[frameless.TypedAggregate]].
  *
  * @note This type class is mostly a workaround to issue with slow implicit derivation for Comapped.
  * @example
  * {{{
  *   type U = TypedAggregate[T,A] :: TypedAggregate[T,B] :: TypedAggregate[T,C] :: HNil
  *   type Out = A :: B :: C :: HNil
  * }}}
  */
trait AggregateTypes[V, U <: HList] {
  type Out <: HList
}

object AggregateTypes {
  type Aux[V, U <: HList, Out0 <: HList] = AggregateTypes[V, U] {type Out = Out0}

  implicit def deriveHNil[T]: AggregateTypes.Aux[T, HNil, HNil] = new AggregateTypes[T, HNil] { type Out = HNil }

  implicit def deriveCons1[T, H, TT <: HList, V <: HList](
    implicit tail: AggregateTypes.Aux[T, TT, V]
  ): AggregateTypes.Aux[T, TypedAggregate[T, H] :: TT, H :: V] =
    new AggregateTypes[T, TypedAggregate[T, H] :: TT] {type Out = H :: V}
}


================================================
FILE: dataset/src/main/scala/frameless/ops/As.scala
================================================
package frameless
package ops

import shapeless.{::, Generic, HList, Lazy}

/** Evidence for correctness of `TypedDataset[T].as[U]` */
class As[T, U] private (implicit val encoder: TypedEncoder[U])

object As extends LowPriorityAs {

  final class Equiv[A, B] private[ops] ()

  implicit def equivIdentity[A] = new Equiv[A, A]

  implicit def deriveAs[A, B]
    (implicit
      i0: TypedEncoder[B],
      i1: Equiv[A, B]
    ): As[A, B] = new As[A, B]

}

trait LowPriorityAs {

  import As.Equiv

  implicit def equivHList[AH, AT <: HList, BH, BT <: HList]
    (implicit
      i0: Lazy[Equiv[AH, BH]],
      i1: Equiv[AT, BT]
    ): Equiv[AH :: AT, BH :: BT] = new Equiv[AH :: AT, BH :: BT]

  implicit def equivGeneric[A, B, R, S]
    (implicit
      i0: Generic.Aux[A, R],
      i1: Generic.Aux[B, S],
      i2: Lazy[Equiv[R, S]]
    ): Equiv[A, B] = new Equiv[A, B]

}


================================================
FILE: dataset/src/main/scala/frameless/ops/ColumnTypes.scala
================================================
package frameless
package ops

import shapeless._

/** A type class to extract the column types out of an HList of [[frameless.TypedColumn]].
  *
  * @note This type class is mostly a workaround to issue with slow implicit derivation for Comapped.
  * @example
  * {{{
  *   type U = TypedColumn[T,A] :: TypedColumn[T,B] :: TypedColumn[T,C] :: HNil
  *   type Out = A :: B :: C :: HNil
  * }}}
  */
trait ColumnTypes[T, U <: HList] {
  type Out <: HList
}

object ColumnTypes {
  type Aux[T, U <: HList, Out0 <: HList] = ColumnTypes[T, U] {type Out = Out0}

  implicit def deriveHNil[T]: ColumnTypes.Aux[T, HNil, HNil] = new ColumnTypes[T, HNil] { type Out = HNil }

  implicit def deriveCons[T, H, TT <: HList, V <: HList](
    implicit tail: ColumnTypes.Aux[T, TT, V]
  ): ColumnTypes.Aux[T, TypedColumn[T, H] :: TT, H :: V] =
    new ColumnTypes[T, TypedColumn[T, H] :: TT] {type Out = H :: V}
}


================================================
FILE: dataset/src/main/scala/frameless/ops/GroupByOps.scala
================================================
package frameless
package ops

import org.apache.spark.sql.catalyst.analysis.UnresolvedAlias
import org.apache.spark.sql.catalyst.plans.logical.Project
import org.apache.spark.sql.{Column, Dataset, FramelessInternals, RelationalGroupedDataset}
import shapeless._
import shapeless.ops.hlist.{Length, Mapped, Prepend, ToList, ToTraversable, Tupler}

class GroupedByManyOps[T, TK <: HList, K <: HList, KT]
  (self: TypedDataset[T], groupedBy: TK)
  (implicit
    i0: ColumnTypes.Aux[T, TK, K],
    i1: ToTraversable.Aux[TK, List, UntypedExpression[T]],
    i3: Tupler.Aux[K, KT]
  ) extends AggregatingOps[T, TK, K, KT](self, groupedBy, (dataset, cols) => dataset.groupBy(cols: _*)) {
  object agg extends ProductArgs {
    def applyProduct[TC <: HList, C <: HList, Out0 <: HList, Out1]
      (columns: TC)
      (implicit
        i3: AggregateTypes.Aux[T, TC, C],
        i4: Prepend.Aux[K, C, Out0],
        i5: Tupler.Aux[Out0, Out1],
        i6: TypedEncoder[Out1],
        i7: ToTraversable.Aux[TC, List, UntypedExpression[T]]
      ): TypedDataset[Out1] = {
        aggregate[TC, Out1](columns)
      }
  }
}

class GroupedBy1Ops[K1, V](
  self: TypedDataset[V],
  g1: TypedColumn[V, K1]
) {
  private def underlying = new GroupedByManyOps(self, g1 :: HNil)
  private implicit def eg1 = g1.uencoder

  def agg[U1](c1: TypedAggregate[V, U1]): TypedDataset[(K1, U1)] = {
    implicit val e1 = c1.uencoder
    underlying.agg(c1)
  }

  def agg[U1, U2](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2]): TypedDataset[(K1, U1, U2)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder
    underlying.agg(c1, c2)
  }

  def agg[U1, U2, U3](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3]): TypedDataset[(K1, U1, U2, U3)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder
    underlying.agg(c1, c2, c3)
  }

  def agg[U1, U2, U3, U4](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4]): TypedDataset[(K1, U1, U2, U3, U4)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder
    underlying.agg(c1, c2, c3, c4)
  }

  def agg[U1, U2, U3, U4, U5](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4], c5: TypedAggregate[V, U5]): TypedDataset[(K1, U1, U2, U3, U4, U5)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder; implicit val e5 = c5.uencoder
    underlying.agg(c1, c2, c3, c4, c5)
  }

  /** Methods on `TypedDataset[T]` that go through a full serialization and
    * deserialization of `T`, and execute outside of the Catalyst runtime.
    */
  object deserialized {
    def mapGroups[U: TypedEncoder](f: (K1, Iterator[V]) => U): TypedDataset[U] = {
      underlying.deserialized.mapGroups(AggregatingOps.tuple1(f))
    }

    def flatMapGroups[U: TypedEncoder](f: (K1, Iterator[V]) => TraversableOnce[U]): TypedDataset[U] = {
      underlying.deserialized.flatMapGroups(AggregatingOps.tuple1(f))
    }
  }

  def pivot[P: CatalystPivotable](pivotColumn: TypedColumn[V, P]): PivotNotValues[V, TypedColumn[V,K1] :: HNil, P] =
    PivotNotValues(self, g1 :: HNil, pivotColumn)
}


class GroupedBy2Ops[K1, K2, V](
  self: TypedDataset[V],
  g1: TypedColumn[V, K1],
  g2: TypedColumn[V, K2]
) {
  private def underlying = new GroupedByManyOps(self, g1 :: g2 :: HNil)
  private implicit def eg1 = g1.uencoder
  private implicit def eg2 = g2.uencoder

  def agg[U1](c1: TypedAggregate[V, U1]): TypedDataset[(K1, K2, U1)] = {
    implicit val e1 = c1.uencoder
    underlying.agg(c1)
  }

  def agg[U1, U2](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2]): TypedDataset[(K1, K2, U1, U2)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder
    underlying.agg(c1, c2)
  }

  def agg[U1, U2, U3](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3]): TypedDataset[(K1, K2, U1, U2, U3)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder
    underlying.agg(c1, c2, c3)
  }

  def agg[U1, U2, U3, U4](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4]): TypedDataset[(K1, K2, U1, U2, U3, U4)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder
    underlying.agg(c1 , c2 , c3 , c4)
  }

  def agg[U1, U2, U3, U4, U5](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4], c5: TypedAggregate[V, U5]): TypedDataset[(K1, K2, U1, U2, U3, U4, U5)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder; implicit val e5 = c5.uencoder
    underlying.agg(c1, c2, c3, c4, c5)
  }


  /** Methods on `TypedDataset[T]` that go through a full serialization and
    * deserialization of `T`, and execute outside of the Catalyst runtime.
    */
  object deserialized {
    def mapGroups[U: TypedEncoder](f: ((K1, K2), Iterator[V]) => U): TypedDataset[U] = {
      underlying.deserialized.mapGroups(f)
    }

    def flatMapGroups[U: TypedEncoder](f: ((K1, K2), Iterator[V]) => TraversableOnce[U]): TypedDataset[U] = {
      underlying.deserialized.flatMapGroups(f)
    }
  }

  def pivot[P: CatalystPivotable](pivotColumn: TypedColumn[V, P]):
    PivotNotValues[V, TypedColumn[V,K1] :: TypedColumn[V, K2] :: HNil, P] =
      PivotNotValues(self, g1 :: g2 :: HNil, pivotColumn)
}

private[ops] abstract class AggregatingOps[T, TK <: HList, K <: HList, KT]
  (self: TypedDataset[T], groupedBy: TK, groupingFunc: (Dataset[T], Seq[Column]) => RelationalGroupedDataset)
  (implicit
    i0: ColumnTypes.Aux[T, TK, K],
    i1: ToTraversable.Aux[TK, List, UntypedExpression[T]],
    i2: Tupler.Aux[K, KT]
  ) {
  def aggregate[TC <: HList, Out1](columns: TC)
  (implicit
    i7: TypedEncoder[Out1],
    i8: ToTraversable.Aux[TC, List, UntypedExpression[T]]
  ): TypedDataset[Out1] = {
    def expr(c: UntypedExpression[T]): Column = new Column(c.expr)

    val groupByExprs = groupedBy.toList[UntypedExpression[T]].map(expr)
    val aggregates =
      if (retainGroupColumns) columns.toList[UntypedExpression[T]].map(expr)
      else groupByExprs ++ columns.toList[UntypedExpression[T]].map(expr)

    val aggregated =
      groupingFunc(self.dataset, groupByExprs)
        .agg(aggregates.head, aggregates.tail: _*)
        .as[Out1](TypedExpressionEncoder[Out1])

    TypedDataset.create[Out1](aggregated)
  }

  /** Methods on `TypedDataset[T]` that go through a full serialization and
    * deserialization of `T`, and execute outside of the Catalyst runtime.
    */
  object deserialized {
    def mapGroups[U: TypedEncoder](
      f: (KT, Iterator[T]) => U
    )(implicit e: TypedEncoder[KT]): TypedDataset[U] = {
      val func = (key: KT, it: Iterator[T]) => Iterator(f(key, it))
      flatMapGroups(func)
    }

    def flatMapGroups[U: TypedEncoder](
      f: (KT, Iterator[T]) => TraversableOnce[U]
    )(implicit e: TypedEncoder[KT]): TypedDataset[U] = {
      implicit val tendcoder = self.encoder

      val cols = groupedBy.toList[UntypedExpression[T]]
      val logicalPlan = FramelessInternals.logicalPlan(self.dataset)
      val withKeyColumns = logicalPlan.output ++ cols.map(_.expr).map(UnresolvedAlias(_))
      val withKey = Project(withKeyColumns, logicalPlan)
      val executed = FramelessInternals.executePlan(self.dataset, withKey)
      val keyAttributes = executed.analyzed.output.takeRight(cols.size)
      val dataAttributes = executed.analyzed.output.dropRight(cols.size)

      val mapGroups = MapGroups(
        f,
        keyAttributes,
        dataAttributes,
        executed.analyzed
      )(TypedExpressionEncoder[KT], TypedExpressionEncoder[T], TypedExpressionEncoder[U])

      val groupedAndFlatMapped = FramelessInternals.mkDataset(
        self.dataset.sqlContext,
        mapGroups,
        TypedExpressionEncoder[U]
      )

      TypedDataset.create(groupedAndFlatMapped)
    }
  }

  private def retainGroupColumns: Boolean = {
    self.dataset.sqlContext.getConf("spark.sql.retainGroupColumns", "true").toBoolean
  }

  def pivot[P: CatalystPivotable](pivotColumn: TypedColumn[T, P]): PivotNotValues[T, TK, P] =
    PivotNotValues(self, groupedBy, pivotColumn)
}

private[ops] object AggregatingOps {
  /** Utility function to help Spark with serialization of closures */
  def tuple1[K1, V, U](f: (K1, Iterator[V]) => U): (Tuple1[K1], Iterator[V]) => U = {
    (x: Tuple1[K1], it: Iterator[V]) => f(x._1, it)
  }
}

/** Represents a typed Pivot operation.
  */
final case class Pivot[T, GroupedColumns <: HList, PivotType, Values <: HList](
  ds: TypedDataset[T],
  groupedBy: GroupedColumns,
  pivotedBy: TypedColumn[T, PivotType],
  values: Values
) {

  object agg extends ProductArgs {
    def applyProduct[AggrColumns <: HList, AggrColumnTypes <: HList, GroupedColumnTypes <: HList, NumValues <: Nat, TypesForPivotedValues <: HList, TypesForPivotedValuesOpt <: HList, OutAsHList <: HList, Out]
      (aggrColumns: AggrColumns)
      (implicit
        i0: AggregateTypes.Aux[T, AggrColumns, AggrColumnTypes],
        i1: ColumnTypes.Aux[T, GroupedColumns, GroupedColumnTypes],
        i2: Length.Aux[Values, NumValues],
        i3: Repeat.Aux[AggrColumnTypes, NumValues, TypesForPivotedValues],
        i4: Mapped.Aux[TypesForPivotedValues, Option, TypesForPivotedValuesOpt],
        i5: Prepend.Aux[GroupedColumnTypes, TypesForPivotedValuesOpt, OutAsHList],
        i6: Tupler.Aux[OutAsHList, Out],
        i7: TypedEncoder[Out]
      ): TypedDataset[Out] = {
        def mapAny[X](h: HList)(f: Any => X): List[X] =
          h match {
            case HNil    => Nil
            case x :: xs => f(x) :: mapAny(xs)(f)
          }

        val aggCols: Seq[Column] = mapAny(aggrColumns)(x => new Column(x.asInstanceOf[TypedAggregate[_,_]].expr))
        val tmp = ds.dataset.toDF()
          .groupBy(mapAny(groupedBy)(_.asInstanceOf[TypedColumn[_, _]].untyped): _*)
          .pivot(pivotedBy.untyped.toString, mapAny(values)(identity))
          .agg(aggCols.head, aggCols.tail:_*)
          .as[Out](TypedExpressionEncoder[Out])
        TypedDataset.create(tmp)
      }
  }
}

final case class PivotNotValues[T, GroupedColumns <: HList, PivotType](
  ds: TypedDataset[T],
  groupedBy: GroupedColumns,
  pivotedBy: TypedColumn[T, PivotType]
) extends ProductArgs {

  def onProduct[Values <: HList](values: Values)(
    implicit validValues: ToList[Values, PivotType] // validValues: FilterNot.Aux[Values, PivotType, HNil] // did not work
  ): Pivot[T, GroupedColumns, PivotType, Values] = Pivot(ds, groupedBy, pivotedBy, values)
}


================================================
FILE: dataset/src/main/scala/frameless/ops/RelationalGroupsOps.scala
================================================
package frameless
package ops

import org.apache.spark.sql.{Column, Dataset, RelationalGroupedDataset}
import shapeless.ops.hlist.{Mapped, Prepend, ToTraversable, Tupler}
import shapeless.{::, HList, HNil, ProductArgs}

/**
  * @param groupingFunc functions used to group elements, can be cube or rollup
  * @tparam T the original `TypedDataset's` type T
  * @tparam TK all columns chosen for aggregation
  * @tparam K individual columns' types as HList
  * @tparam KT individual columns' types as Tuple
  */
private[ops] abstract class RelationalGroupsOps[T, TK <: HList, K <: HList, KT]
  (self: TypedDataset[T], groupedBy: TK, groupingFunc: (Dataset[T], Seq[Column]) => RelationalGroupedDataset)
  (implicit
    i0: ColumnTypes.Aux[T, TK, K],
    i1: ToTraversable.Aux[TK, List, UntypedExpression[T]],
    i2: Tupler.Aux[K, KT]
  ) extends AggregatingOps(self, groupedBy, groupingFunc){

  object agg extends ProductArgs {
    /**
      * @tparam TC   resulting columns after aggregation function
      * @tparam C    individual columns' types as HList
      * @tparam OptK columns' types mapped to Option
      * @tparam Out0 OptK columns appended to C
      * @tparam Out1 output type
      */
    def applyProduct[TC <: HList, C <: HList, OptK <: HList, Out0 <: HList, Out1]
    (columns: TC)
    (implicit
      i3: AggregateTypes.Aux[T, TC, C], // shares individual columns' types after agg function as HList
      i4: Mapped.Aux[K, Option, OptK], // maps all original columns' types to Option
      i5: Prepend.Aux[OptK, C, Out0], // concatenates Option columns with those resulting from applying agg function
      i6: Tupler.Aux[Out0, Out1], // converts resulting HList into Tuple for output type
      i7: TypedEncoder[Out1], // proof that there is `TypedEncoder` for the output type
      i8: ToTraversable.Aux[TC, List, UntypedExpression[T]] // allows converting this HList to ordinary List
    ): TypedDataset[Out1] = {
      aggregate[TC, Out1](columns)
    }
  }
}

private[ops] abstract class RelationalGroups1Ops[K1, V](self: TypedDataset[V], g1: TypedColumn[V, K1]) {
  protected def underlying: RelationalGroupsOps[V, ::[TypedColumn[V, K1], HNil], ::[K1, HNil], Tuple1[K1]]
  private implicit def eg1 = g1.uencoder

  def agg[U1](c1: TypedAggregate[V, U1]): TypedDataset[(Option[K1], U1)] = {
    implicit val e1 = c1.uencoder
    underlying.agg(c1)
  }

  def agg[U1, U2](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2]): TypedDataset[(Option[K1], U1, U2)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder
    underlying.agg(c1, c2)
  }

  def agg[U1, U2, U3](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3]): TypedDataset[(Option[K1], U1, U2, U3)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder
    underlying.agg(c1, c2, c3)
  }

  def agg[U1, U2, U3, U4](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4]): TypedDataset[(Option[K1], U1, U2, U3, U4)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder
    underlying.agg(c1, c2, c3, c4)
  }

  def agg[U1, U2, U3, U4, U5](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4], c5: TypedAggregate[V, U5]): TypedDataset[(Option[K1], U1, U2, U3, U4, U5)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder; implicit val e5 = c5.uencoder
    underlying.agg(c1, c2, c3, c4, c5)
  }

  /** Methods on `TypedDataset[T]` that go through a full serialization and
    * deserialization of `T`, and execute outside of the Catalyst runtime.
    */
  object deserialized {
    def mapGroups[U: TypedEncoder](f: (K1, Iterator[V]) => U): TypedDataset[U] = {
      underlying.deserialized.mapGroups(AggregatingOps.tuple1(f))
    }

    def flatMapGroups[U: TypedEncoder](f: (K1, Iterator[V]) => TraversableOnce[U]): TypedDataset[U] = {
      underlying.deserialized.flatMapGroups(AggregatingOps.tuple1(f))
    }
  }

  def pivot[P: CatalystPivotable](pivotColumn: TypedColumn[V, P]): PivotNotValues[V, TypedColumn[V,K1] :: HNil, P] =
    PivotNotValues(self, g1 :: HNil, pivotColumn)
}

private[ops] abstract class RelationalGroups2Ops[K1, K2, V](self: TypedDataset[V], g1: TypedColumn[V, K1], g2: TypedColumn[V, K2]) {
  protected def underlying: RelationalGroupsOps[V, ::[TypedColumn[V, K1], ::[TypedColumn[V, K2], HNil]], ::[K1, ::[K2, HNil]], (K1, K2)]
  private implicit def eg1 = g1.uencoder
  private implicit def eg2 = g2.uencoder

  def agg[U1](c1: TypedAggregate[V, U1]): TypedDataset[(Option[K1], Option[K2], U1)] = {
    implicit val e1 = c1.uencoder
    underlying.agg(c1)
  }

  def agg[U1, U2](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2]): TypedDataset[(Option[K1], Option[K2], U1, U2)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder
    underlying.agg(c1, c2)
  }

  def agg[U1, U2, U3](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3]): TypedDataset[(Option[K1], Option[K2], U1, U2, U3)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder
    underlying.agg(c1, c2, c3)
  }

  def agg[U1, U2, U3, U4](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4]): TypedDataset[(Option[K1], Option[K2], U1, U2, U3, U4)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder
    underlying.agg(c1 , c2 , c3 , c4)
  }

  def agg[U1, U2, U3, U4, U5](c1: TypedAggregate[V, U1], c2: TypedAggregate[V, U2], c3: TypedAggregate[V, U3], c4: TypedAggregate[V, U4], c5: TypedAggregate[V, U5]): TypedDataset[(Option[K1], Option[K2], U1, U2, U3, U4, U5)] = {
    implicit val e1 = c1.uencoder; implicit val e2 = c2.uencoder; implicit val e3 = c3.uencoder; implicit val e4 = c4.uencoder; implicit val e5 = c5.uencoder
    underlying.agg(c1, c2, c3, c4, c5)
  }

  /** Methods on `TypedDataset[T]` that go through a full serialization and
    * deserialization of `T`, and execute outside of the Catalyst runtime.
    */
  object deserialized {
    def mapGroups[U: TypedEncoder](f: ((K1, K2), Iterator[V]) => U): TypedDataset[U] = {
      underlying.deserialized.mapGroups(f)
    }

    def flatMapGroups[U: TypedEncoder](f: ((K1, K2), Iterator[V]) => TraversableOnce[U]): TypedDataset[U] = {
      underlying.deserialized.flatMapGroups(f)
    }
  }

  def pivot[P: CatalystPivotable](pivotColumn: TypedColumn[V, P]):
  PivotNotValues[V, TypedColumn[V,K1] :: TypedColumn[V, K2] :: HNil, P] =
    PivotNotValues(self, g1 :: g2 :: HNil, pivotColumn)
}

class RollupManyOps[T, TK <: HList, K <: HList, KT](self: TypedDataset[T], groupedBy: TK)
  (implicit
    i0: ColumnTypes.Aux[T, TK, K],
    i1: ToTraversable.Aux[TK, List, UntypedExpression[T]],
    i2: Tupler.Aux[K, KT]
  ) extends RelationalGroupsOps[T, TK, K, KT](self, groupedBy, (dataset, cols) => dataset.rollup(cols: _*))

class Rollup1Ops[K1, V](self: TypedDataset[V], g1: TypedColumn[V, K1]) extends RelationalGroups1Ops(self, g1) {
  override protected def underlying = new RollupManyOps(self, g1 :: HNil)
}

class Rollup2Ops[K1, K2, V](self: TypedDataset[V], g1: TypedColumn[V, K1], g2: TypedColumn[V, K2]) extends RelationalGroups2Ops(self, g1, g2) {
  override protected def underlying = new RollupManyOps(self, g1 :: g2 :: HNil)
}

class CubeManyOps[T, TK <: HList, K <: HList, KT](self: TypedDataset[T], groupedBy: TK)
  (implicit
    i0: ColumnTypes.Aux[T, TK, K],
    i1: ToTraversable.Aux[TK, List, UntypedExpression[T]],
    i2: Tupler.Aux[K, KT]
  ) extends RelationalGroupsOps[T, TK, K, KT](self, groupedBy, (dataset, cols) => dataset.cube(cols: _*))

class Cube1Ops[K1, V](self: TypedDataset[V], g1: TypedColumn[V, K1]) extends RelationalGroups1Ops(self, g1) {
  override protected def underlying = new CubeManyOps(self, g1 :: HNil)
}

class Cube2Ops[K1, K2, V](self: TypedDataset[V], g1: TypedColumn[V, K1], g2: TypedColumn[V, K2]) extends RelationalGroups2Ops(self, g1, g2) {
  override protected def underlying = new CubeManyOps(self, g1 :: g2 :: HNil)
}


================================================
FILE: dataset/src/main/scala/frameless/ops/Repeat.scala
================================================
package frameless
package ops

import shapeless.{HList, Nat, Succ}
import shapeless.ops.hlist.Prepend

/** Typeclass supporting repeating L-typed HLists N times.
  *
  * Repeat[Int :: String :: HNil, Nat._2].Out =:=
  * Int :: String :: Int :: String :: HNil
  *
  * By Jeremy Smith. To be replaced by `shapeless.ops.hlists.Repeat`
  * once (https://github.com/milessabin/shapeless/pull/730 is published.
  */
trait Repeat[L <: HList, N <: Nat] {
  type Out <: HList
}

object Repeat {
  type Aux[L <: HList, N <: Nat, Out0 <: HList] = Repeat[L, N] { type Out = Out0 }

  implicit def base[L <: HList]: Aux[L, Nat._1, L] = new Repeat[L, Nat._1] {
    type Out = L
  }

  implicit def succ[L <: HList, Prev <: Nat, PrevOut <: HList, P <: HList]
    (implicit
      i0: Aux[L, Prev, PrevOut],
      i1: Prepend.Aux[L, PrevOut, P]
    ): Aux[L, Succ[Prev], P] = new Repeat[L, Succ[Prev]] {
      type Out = P
    }
}


================================================
FILE: dataset/src/main/scala/frameless/ops/SmartProject.scala
================================================
package frameless
package ops

import shapeless.ops.hlist.ToTraversable
import shapeless.ops.record.{Keys, SelectAll, Values}
import shapeless.{HList, LabelledGeneric}

import scala.annotation.implicitNotFound

@implicitNotFound(msg = "Cannot prove that ${T} can be projected to ${U}. Perhaps not all member names and types of ${U} are the same in ${T}?")
case class SmartProject[T: TypedEncoder, U: TypedEncoder](apply: TypedDataset[T] => TypedDataset[U])

object SmartProject {
  /**
    * Proofs that there is a type-safe projection from a type T to another type U. It requires that:
    * (a) both T and U are Products for which a LabelledGeneric can be derived (e.g., case classes),
    * (b) all members of U have a corresponding member in T that has both the same name and type.
    *
    * @param i0 the LabelledGeneric derived for T
    * @param i1 the LabelledGeneric derived for U
    * @param i2 the keys of U
    * @param i3 selects all the values from T using the keys of U
    * @param i4 selects all the values of LabeledGeneric[U]
    * @param i5 proof that U and the projection of T have the same type
    * @param i6 allows for traversing the keys of U
    * @tparam T the original type T
    * @tparam U the projected type U
    * @tparam TRec shapeless' Record representation of T
    * @tparam TProj the projection of T using the keys of U
    * @tparam URec shapeless' Record representation of U
    * @tparam UVals the values of U as an HList
    * @tparam UKeys the keys of U as an HList
    * @return a projection if it exists
    */
  implicit def deriveProduct[T: TypedEncoder, U: TypedEncoder, TRec <: HList, TProj <: HList, URec <: HList, UVals <: HList, UKeys <: HList]
    (implicit
      i0: LabelledGeneric.Aux[T, TRec],
      i1: LabelledGeneric.Aux[U, URec],
      i2: Keys.Aux[URec, UKeys],
      i3: SelectAll.Aux[TRec, UKeys, TProj],
      i4: Values.Aux[URec, UVals],
      i5: UVals =:= TProj,
      i6: ToTraversable.Aux[UKeys, Seq, Symbol]
    ): SmartProject[T,U] = SmartProject[T, U]({ from =>
      val names = implicitly[Keys.Aux[URec, UKeys]].apply().to[Seq].map(_.name).map(from.dataset.col)
      TypedDataset.create(from.dataset.toDF().select(names: _*).as[U](TypedExpressionEncoder[U]))
    })
}


================================================
FILE: dataset/src/main/scala/frameless/syntax/package.scala
================================================
package frameless

package object syntax extends FramelessSyntax {
  implicit val DefaultSparkDelay: SparkDelay[Job] = Job.framelessSparkDelayForJob
}


================================================
FILE: dataset/src/main/scala/org/apache/spark/sql/FramelessInternals.scala
================================================
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.catalyst.expressions.{Alias, CreateStruct}
import org.apache.spark.sql.catalyst.expressions.{Expression, NamedExpression}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.types._
import org.apache.spark.sql.types.ObjectType
import scala.reflect.ClassTag

object FramelessInternals {
  def objectTypeFor[A](implicit classTag: ClassTag[A]): ObjectType = ObjectType(classTag.runtimeClass)

  def resolveExpr(ds: Dataset[_], colNames: Seq[String]): NamedExpression = {
    ds.toDF().queryExecution.analyzed.resolve(colNames, ds.sparkSession.sessionState.analyzer.resolver).getOrElse {
      throw new AnalysisException(
        s"""Cannot resolve column name "$colNames" among (${ds.schema.fieldNames.mkString(", ")})""")
    }
  }

  def expr(column: Column): Expression = column.expr

  def logicalPlan(ds: Dataset[_]): LogicalPlan = ds.logicalPlan

  def executePlan(ds: Dataset[_], plan: LogicalPlan): QueryExecution =
    ds.sparkSession.sessionState.executePlan(plan)

  def joinPlan(ds: Dataset[_], plan: LogicalPlan, leftPlan: LogicalPlan, rightPlan: LogicalPlan): LogicalPlan = {
    val joined = executePlan(ds, plan)
    val leftOutput = joined.analyzed.output.take(leftPlan.output.length)
    val rightOutput = joined.analyzed.output.takeRight(rightPlan.output.length)

    Project(List(
      Alias(CreateStruct(leftOutput), "_1")(),
      Alias(CreateStruct(rightOutput), "_2")()
    ), joined.analyzed)
  }

  def mkDataset[T](sqlContext: SQLContext, plan: LogicalPlan, encoder: Encoder[T]): Dataset[T] =
    new Dataset(sqlContext, plan, encoder)

  def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame =
    Dataset.ofRows(sparkSession, logicalPlan)

  // because org.apache.spark.sql.types.UserDefinedType is private[spark]
  type UserDefinedType[A >: Null] =  org.apache.spark.sql.types.UserDefinedType[A]

  // below only tested in SelfJoinTests.colLeft and colRight are equivalent to col outside of joins
  //  - via files (codegen) forces doGenCode eval.
  /** Expression to tag columns from the left hand side of join expression. */
  case class DisambiguateLeft[T](tagged: Expression) extends Expression with NonSQLExpression {
    def eval(input: InternalRow): Any = tagged.eval(input)
    def nullable: Boolean = false
    def children: Seq[Expression] = tagged :: Nil
    def dataType: DataType = tagged.dataType
    protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = tagged.genCode(ctx)
    protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = copy(newChildren.head)
  }

  /** Expression to tag columns from the right hand side of join expression. */
  case class DisambiguateRight[T](tagged: Expression) extends Expression with NonSQLExpression {
    def eval(input: InternalRow): Any = tagged.eval(input)
    def nullable: Boolean = false
    def children: Seq[Expression] = tagged :: Nil
    def dataType: DataType = tagged.dataType
    protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = tagged.genCode(ctx)
    protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = copy(newChildren.head)
  }
}


================================================
FILE: dataset/src/main/scala/org/apache/spark/sql/reflection/package.scala
================================================
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.ScalaReflection.{
  cleanUpReflectionObjects,
  getClassFromType,
  localTypeOf
}
import org.apache.spark.sql.types.{
  BinaryType,
  BooleanType,
  ByteType,
  CalendarIntervalType,
  DataType,
  Decimal,
  DecimalType,
  DoubleType,
  FloatType,
  IntegerType,
  LongType,
  NullType,
  ObjectType,
  ShortType
}
import org.apache.spark.unsafe.types.CalendarInterval

/**
 * Copy of spark's pre 3.4 reflection based encoding
 */
package object reflection {

  /**
   * copy of pre 3.5.0 isNativeType, https://issues.apache.org/jira/browse/SPARK-44343 removed it
   */
  def isNativeType(dt: DataType): Boolean = dt match {
    case NullType | BooleanType | ByteType | ShortType | IntegerType |
        LongType | FloatType | DoubleType | BinaryType | CalendarIntervalType =>
      true
    case _ => false
  }

  private object ScalaSubtypeLock

  val universe: scala.reflect.runtime.universe.type =
    scala.reflect.runtime.universe

  import universe._

  /**
   * Returns the Spark SQL DataType for a given scala type.  Where this is not an exact mapping
   * to a native type, an ObjectType is returned. Special handling is also used for Arrays including
   * those that hold primitive types.
   *
   * Unlike `schemaFor`, this function doesn't do any massaging of types into the Spark SQL type
   * system.  As a result, ObjectType will be returned for things like boxed Integers
   */
  def dataTypeFor[T: TypeTag]: DataType = dataTypeFor(localTypeOf[T])

  /**
   * Synchronize to prevent concurrent usage of `<:<` operator.
   * This operator is not thread safe in any current version of scala; i.e.
   * (2.11.12, 2.12.10, 2.13.0-M5).
   *
   * See https://github.com/scala/bug/issues/10766
   */
  private[sql] def isSubtype(tpe1: `Type`, tpe2: `Type`): Boolean = {
    ScalaSubtypeLock.synchronized {
      tpe1 <:< tpe2
    }
  }

  private def dataTypeFor(tpe: `Type`): DataType = cleanUpReflectionObjects {
    tpe.dealias match {
      case t if isSubtype(t, definitions.NullTpe)      => NullType
      case t if isSubtype(t, definitions.IntTpe)       => IntegerType
      case t if isSubtype(t, definitions.LongTpe)      => LongType
      case t if isSubtype(t, definitions.DoubleTpe)    => DoubleType
      case t if isSubtype(t, definitions.FloatTpe)     => FloatType
      case t if isSubtype(t, definitions.ShortTpe)     => ShortType
      case t if isSubtype(t, definitions.ByteTpe)      => ByteType
      case t if isSubtype(t, definitions.BooleanTpe)   => BooleanType
      case t if isSubtype(t, localTypeOf[Array[Byte]]) => BinaryType
      case t if isSubtype(t, localTypeOf[CalendarInterval]) =>
        CalendarIntervalType
      case t if isSubtype(t, localTypeOf[Decimal]) => DecimalType.SYSTEM_DEFAULT
      case _                                       =>
        /* original Spark code checked for scala.Array vs ObjectType,
           this (and associated code) isn't needed due to TypedEncoders arrayEncoder */
        val clazz = getClassFromType(tpe)
        ObjectType(clazz)
    }
  }

}


================================================
FILE: dataset/src/main/spark-3/frameless/MapGroups.scala
================================================
package frameless

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, MapGroups => SMapGroups}

object MapGroups {
  def apply[K: Encoder, T: Encoder, U: Encoder](
    func: (K, Iterator[T]) => TraversableOnce[U],
    groupingAttributes: Seq[Attribute],
    dataAttributes: Seq[Attribute],
    child: LogicalPlan
  ): LogicalPlan = SMapGroups(func, groupingAttributes, dataAttributes, child)
}


================================================
FILE: dataset/src/main/spark-3.4+/frameless/MapGroups.scala
================================================
package frameless

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, MapGroups => SMapGroups}

object MapGroups {
  def apply[K: Encoder, T: Encoder, U: Encoder](
    func: (K, Iterator[T]) => TraversableOnce[U],
    groupingAttributes: Seq[Attribute],
    dataAttributes: Seq[Attribute],
    child: LogicalPlan
  ): LogicalPlan =
    SMapGroups(
      func,
      groupingAttributes,
      dataAttributes,
      Seq(), // #698 - no order given
      child
    )
}


================================================
FILE: dataset/src/test/resources/log4j.properties
================================================
log4j.logger.akka.event.slf4j.Slf4jLogger=ERROR
log4j.logger.akka.event.slf4j=ERROR
log4j.logger.akka.remote.EndpointWriter=ERROR
log4j.logger.akka.remote.RemoteActorRefProvider$RemotingTerminator=ERROR
log4j.logger.com.anjuke.dm=ERROR
log4j.logger.io.netty.bootstrap.ServerBootstrap=ERROR
log4j.logger.io.netty.buffer.ByteBufUtil=ERROR
log4j.logger.io.netty.buffer.PooledByteBufAllocator=ERROR
log4j.logger.io.netty.channel.AbstractChannel=ERROR
log4j.logger.io.netty.channel.ChannelInitializer=ERROR
log4j.logger.io.netty.channel.ChannelOutboundBuffer=ERROR
log4j.logger.io.netty.channel.DefaultChannelPipeline=ERROR
log4j.logger.io.netty.channel.MultithreadEventLoopGroup=ERROR
log4j.logger.io.netty.channel.nio.AbstractNioChannel=ERROR
log4j.logger.io.netty.channel.nio.NioEventLoop=ERROR
log4j.logger.io.netty.channel.socket.nio.NioServerSocketChannel=ERROR
log4j.logger.io.netty.util.concurrent.DefaultPromise.rejectedExecution=ERROR
log4j.logger.io.netty.util.concurrent.DefaultPromise=ERROR
log4j.logger.io.netty.util.concurrent.GlobalEventExecutor=ERROR
log4j.logger.io.netty.util.concurrent.SingleThreadEventExecutor=ERROR
log4j.logger.io.netty.util.internal.logging.InternalLoggerFactory=ERROR
log4j.logger.io.netty.util.internal.PlatformDependent0=ERROR
log4j.logger.io.netty.util.internal.PlatformDependent=ERROR
log4j.logger.io.netty.util.internal.SystemPropertyUtil=ERROR
log4j.logger.io.netty.util.internal.ThreadLocalRandom=ERROR
log4j.logger.io.netty.util.NetUtil=ERROR
log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=ERROR
log4j.logger.org.apache.hadoop.conf.Configuration=ERROR
log4j.logger.org.apache.hadoop.fs.FileSystem=ERROR
log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=ERROR
log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
log4j.logger.org.apache.hadoop.mapred.JobConf=ERROR
log4j.logger.org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedPartitioner=ERROR
log4j.logger.org.apache.hadoop.metrics2.impl.MetricsSystemImpl=ERROR
log4j.logger.org.apache.hadoop.metrics2.lib.Interns=ERROR
log4j.logger.org.apache.hadoop.metrics2.lib.MetricsSourceBuilder=ERROR
log4j.logger.org.apache.hadoop.metrics2.lib.MutableMetricsFactory=ERROR
log4j.logger.org.apache.hadoop.security.authentication.util.KerberosName=ERROR
log4j.logger.org.apache.hadoop.security.Groups=ERROR
log4j.logger.org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback=ERROR
log4j.logger.org.apache.hadoop.security.SecurityUtil=ERROR
log4j.logger.org.apache.hadoop.security.ShellBasedUnixGroupsMapping=ERROR
log4j.logger.org.apache.hadoop.security.UserGroupInformation=ERROR
log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
log4j.logger.org.apache.hadoop.util.ShutdownHookManager=ERROR
log4j.logger.org.apache.spark.broadcast.TorrentBroadcast=ERROR
log4j.logger.org.apache.spark.ContextCleaner=ERROR
log4j.logger.org.apache.spark.executor.Executor=ERROR
log4j.logger.org.apache.spark.HeartbeatReceiver=ERROR
log4j.logger.org.apache.spark.HttpFileServer=ERROR
log4j.logger.org.apache.spark.HttpServer=ERROR
log4j.logger.org.apache.spark.MapOutputTrackerMaster=ERROR
log4j.logger.org.apache.spark.MapOutputTrackerMasterEndpoint=ERROR
log4j.logger.org.apache.spark.metrics.MetricsSystem=ERROR
log4j.logger.org.apache.spark.network.client.TransportClientFactory=ERROR
log4j.logger.org.apache.spark.network.netty.NettyBlockTransferService=ERROR
log4j.logger.org.apache.spark.network.protocol.MessageDecoder=ERROR
log4j.logger.org.apache.spark.network.protocol.MessageEncoder=ERROR
log4j.logger.org.apache.spark.network.server.OneForOneStreamManager=ERROR
log4j.logger.org.apache.spark.network.server.TransportServer=ERROR
log4j.logger.org.apache.spark.network.TransportContext=ERROR
log4j.logger.org.apache.spark.network.util.JavaUtils=ERROR
log4j.logger.org.apache.spark.rdd.CoGroupedRDD=ERROR
log4j.logger.org.apache.spark.rdd.SubtractedRDD=ERROR
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR
log4j.logger.org.apache.spark.rpc.akka.AkkaRpcEnv$$anonfun$actorRef$lzycompute$1$1$$anon$1=ERROR
log4j.logger.org.apache.spark.scheduler.DAGScheduler=ERROR
log4j.logger.org.apache.spark.scheduler.OutputCommitCoordinator$OutputCommitCoordinatorEndpoint=ERROR
log4j.logger.org.apache.spark.scheduler.TaskSchedulerImpl=ERROR
log4j.logger.org.apache.spark.scheduler.TaskSetManager=ERROR
log4j.logger.org.apache.spark.SecurityManager=ERROR
log4j.logger.org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter=ERROR
log4j.logger.org.apache.spark.SparkContext=ERROR
log4j.logger.org.apache.spark.SparkEnv=ERROR
log4j.logger.org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveReferences=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner=ERROR
log4j.logger.org.apache.spark.sql.catalyst.expressions.codegen.package$ExpressionCanonicalizer=ERROR
log4j.logger.org.apache.spark.sql.catalyst.optimizer.DefaultOptimizer=ERROR
log4j.logger.org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys=ERROR
log4j.logger.org.apache.spark.sql.execution.aggregate.SortBasedAggregate=ERROR
log4j.logger.org.apache.spark.sql.execution.aggregate.TungstenAggregate=ERROR
log4j.logger.org.apache.spark.sql.execution.Exchange=ERROR
log4j.logger.org.apache.spark.sql.execution.joins.ShuffledHashOuterJoin=ERROR
log4j.logger.org.apache.spark.sql.SQLContext$$anon$1=ERROR
log4j.logger.org.apache.spark.sql.SQLContext$$anon$2=ERROR
log4j.logger.org.apache.spark.SSLOptions=ERROR
log4j.logger.org.apache.spark.storage.BlockManager=ERROR
log4j.logger.org.apache.spark.storage.BlockManagerInfo=ERROR
log4j.logger.org.apache.spark.storage.BlockManagerMaster=ERROR
log4j.logger.org.apache.spark.storage.BlockManagerMasterEndpoint=ERROR
log4j.logger.org.apache.spark.storage.BlockManagerSlaveEndpoint=ERROR
log4j.logger.org.apache.spark.storage.DiskBlockManager=ERROR
log4j.logger.org.apache.spark.storage.MemoryStore=ERROR
log4j.logger.org.apache.spark.storage.ShuffleBlockFetcherIterator=ERROR
log4j.logger.org.apache.spark.ui.SparkUI=ERROR
log4j.logger.org.apache.spark.unsafe.map.BytesToBytesMap=ERROR
log4j.logger.org.apache.spark.unsafe.memory.TaskMemoryManager=ERROR
log4j.logger.org.apache.spark.util.AkkaUtils=ERROR
log4j.logger.org.apache.spark.util.ClosureCleaner=ERROR
log4j.logger.org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter=ERROR
log4j.logger.org.apache.spark.util.Utils=ERROR
log4j.logger.org.apache.spark=ERROR
log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.eclipse.jetty=ERROR
log4j.logger.org.spark-project.jetty.http.AbstractGenerator=ERROR
log4j.logger.org.spark-project.jetty.http.HttpGenerator=ERROR
log4j.logger.org.spark-project.jetty.http.MimeTypes=ERROR
log4j.logger.org.spark-project.jetty.io.AbstractBuffer=ERROR
log4j.logger.org.spark-project.jetty.io.nio=ERROR
log4j.logger.org.spark-project.jetty.server.AbstractConnector=ERROR
log4j.logger.org.spark-project.jetty.server.bio.SocketConnector=ERROR
log4j.logger.org.spark-project.jetty.server.handler.AbstractHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.ContextHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.ContextHandlerCollection=ERROR
log4j.logger.org.spark-project.jetty.server.handler.DefaultHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.ErrorHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.GzipHandler=ERROR
log4j.logger.org.spark-project.jetty.server.handler.ResourceHandler=ERROR
log4j.logger.org.spark-project.jetty.server.Server=ERROR
log4j.logger.org.spark-project.jetty.server=ERROR
log4j.logger.org.spark-project.jetty.servlet.DefaultServlet=ERROR
log4j.logger.org.spark-project.jetty.servlet.Holder=ERROR
log4j.logger.org.spark-project.jetty.servlet.ServletHandler=ERROR
log4j.logger.org.spark-project.jetty.servlet.ServletHolder=ERROR
log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
log4j.logger.org.spark-project.jetty.util.component.AggregateLifeCycle=ERROR
log4j.logger.org.spark-project.jetty.util.component.Container=ERROR
log4j.logger.org.spark-project.jetty.util.IO=ERROR
log4j.logger.org.spark-project.jetty.util.log=ERROR
log4j.logger.org.spark-project.jetty.util.resource.FileResource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.JarFileResource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.JarResource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.Resource=ERROR
log4j.logger.org.spark-project.jetty.util.resource.URLResource=ERROR
log4j.logger.org.spark-project.jetty.util.StringUtil=ERROR
log4j.logger.org.spark-project.jetty.util.thread.QueuedThreadPool=ERROR
log4j.logger.org.spark-project.jetty.util.thread.Timeout=ERROR
log4j.logger.org.spark-project.jetty=ERROR
log4j.logger.Remoting=ERROR

================================================
FILE: dataset/src/test/resources/log4j2.properties
================================================
# Set to debug or trace if log4j initialization is failing
status = warn

# Name of the configuration
name = ConsoleAppender

# Console appender configuration
appender.console.type = Console
appender.console.name = consoleLogger
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{YYYY-MM-dd HH:mm:ss} [%t] %-5p %c:%L - %m%n
appender.console.target = SYSTEM_OUT

# Root logger level
rootLogger.level = error

# Root logger referring to console appender
rootLogger.appenderRef.stdout.ref = consoleLogger

logger.spark.name = org.apache.spark
logger.spark.level = warn

logger.hadoop.name = org.apache.hadoop
logger.hadoop.level = warn

# To debug expressions:
#logger.codegen.name = org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator
#logger.codegen.level = debug

================================================
FILE: dataset/src/test/scala/frameless/AsTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

class AsTests extends TypedDatasetSuite {
  test("as[X2[A, B]]") {
    def prop[A, B](data: Vector[(A, B)])(
      implicit
      eab: TypedEncoder[(A, B)],
      ex2: TypedEncoder[X2[A, B]]
    ): Prop = {
      val dataset = TypedDataset.create(data)

      val dataset2 = dataset.as[X2[A,B]]().collect().run().toVector
      val data2 = data.map { case (a, b) => X2(a, b) }

      dataset2 ?= data2
    }

    check(forAll(prop[Int, Int] _))
    check(forAll(prop[String, String] _))
    check(forAll(prop[String, Int] _))
    check(forAll(prop[Long, Int] _))
    check(forAll(prop[Seq[Seq[Option[Seq[Long]]]], Seq[Int]] _))
    check(forAll(prop[Seq[Option[Seq[String]]], Seq[Int]] _))
  }

  test("as[X2[X2[A, B], C]") {
    def prop[A, B, C](data: Vector[(A, B, C)])(
      implicit
      eab: TypedEncoder[((A, B), C)],
      ex2: TypedEncoder[X2[X2[A, B], C]]
    ): Prop = {
      val data2 = data.map {
        case (a, b, c) => ((a, b), c)
      }
      val dataset = TypedDataset.create(data2)

      val dataset2 = dataset.as[X2[X2[A,B], C]]().collect().run().toVector
      val data3 = data2.map { case ((a, b), c) => X2(X2(a, b), c) }

      dataset2 ?= data3
    }

    check(forAll(prop[String, Int, Int] _))
    check(forAll(prop[String, Int, String] _))
    check(forAll(prop[String, String, Int] _))
    check(forAll(prop[Long, Int, String] _))
    check(forAll(prop[Seq[Seq[Option[Seq[Long]]]], Seq[Int], Option[Seq[Option[Int]]]] _))
    check(forAll(prop[Seq[Option[Seq[String]]], Seq[Int], Seq[Option[String]]] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/BitwiseTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import org.scalatest.matchers.should.Matchers

class BitwiseTests extends TypedDatasetSuite with Matchers {

  /**
    * providing instances with implementations for bitwise operations since in the tests
    * we need to check the results from frameless vs the results from normal scala operators
    * for Numeric it is easy to test since scala comes with Numeric typeclass but there seems
    * to be no equivalent typeclass for bitwise ops for Byte Short Int and Long types supported in Catalyst
    */
  trait CatalystBitwise4Tests[A]{
    def bitwiseAnd(a1: A, a2: A): A
    def bitwiseOr(a1: A, a2: A): A
    def bitwiseXor(a1: A, a2: A): A
    def &(a1: A, a2: A): A = bitwiseAnd(a1, a2)
    def |(a1: A, a2: A): A = bitwiseOr(a1, a2)
    def ^(a1: A, a2: A): A = bitwiseXor(a1, a2)
  }

  object CatalystBitwise4Tests {
    implicit val framelessbyteBitwise      : CatalystBitwise4Tests[Byte]       = new CatalystBitwise4Tests[Byte] {
      def bitwiseOr(a1: Byte, a2: Byte) : Byte = (a1 | a2).toByte
      def bitwiseAnd(a1: Byte, a2: Byte): Byte = (a1 & a2).toByte
      def bitwiseXor(a1: Byte, a2: Byte): Byte = (a1 ^ a2).toByte
    }
    implicit val framelessshortBitwise     : CatalystBitwise4Tests[Short]      = new CatalystBitwise4Tests[Short] {
      def bitwiseOr(a1: Short, a2: Short) : Short = (a1 | a2).toShort
      def bitwiseAnd(a1: Short, a2: Short): Short = (a1 & a2).toShort
      def bitwiseXor(a1: Short, a2: Short): Short = (a1 ^ a2).toShort
    }
    implicit val framelessintBitwise       : CatalystBitwise4Tests[Int]        = new CatalystBitwise4Tests[Int] {
      def bitwiseOr(a1: Int, a2: Int) : Int = a1 | a2
      def bitwiseAnd(a1: Int, a2: Int): Int = a1 & a2
      def bitwiseXor(a1: Int, a2: Int): Int = a1 ^ a2
    }
    implicit val framelesslongBitwise      : CatalystBitwise4Tests[Long]       = new CatalystBitwise4Tests[Long] {
      def bitwiseOr(a1: Long, a2: Long) : Long = a1 | a2
      def bitwiseAnd(a1: Long, a2: Long): Long = a1 & a2
      def bitwiseXor(a1: Long, a2: Long): Long = a1 ^ a2
    }

  }
  import CatalystBitwise4Tests._
  test("bitwiseAND") {
    def prop[A: TypedEncoder: CatalystBitwise](a: A, b: A)(
      implicit catalystBitwise4Tests: CatalystBitwise4Tests[A]
    ): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      val result = implicitly[CatalystBitwise4Tests[A]].bitwiseAnd(a, b)
      val resultSymbolic = implicitly[CatalystBitwise4Tests[A]].&(a, b)
      val got = df.select(df.col('a) bitwiseAND df.col('b)).collect().run()
      val gotSymbolic = df.select(df.col('a) & b).collect().run()
      val symbolicCol2Col = df.select(df.col('a) & df.col('b)).collect().run()
      val canCast = df.select(df.col('a).cast[Long] & 0L).collect().run()
      canCast should contain theSameElementsAs Seq.fill[Long](gotSymbolic.size)(0L)
      result ?= resultSymbolic
      symbolicCol2Col ?= (result :: Nil)
      got ?= (result :: Nil)
      gotSymbolic ?= (resultSymbolic :: Nil)
    }

    check(prop[Byte] _)
    check(prop[Short] _)
    check(prop[Int] _)
    check(prop[Long] _)
  }

  test("bitwiseOR") {
    def prop[A: TypedEncoder: CatalystBitwise](a: A, b: A)(
      implicit catalystBitwise4Tests: CatalystBitwise4Tests[A]
    ): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      val result = implicitly[CatalystBitwise4Tests[A]].bitwiseOr(a, b)
      val resultSymbolic = implicitly[CatalystBitwise4Tests[A]].|(a, b)
      val got = df.select(df.col('a) bitwiseOR df.col('b)).collect().run()
      val gotSymbolic = df.select(df.col('a) | b).collect().run()
      val symbolicCol2Col = df.select(df.col('a) | df.col('b)).collect().run()
      val canCast = df.select(df.col('a).cast[Long] | -1L).collect().run()
      canCast should contain theSameElementsAs Seq.fill[Long](gotSymbolic.size)(-1L)
      result ?= resultSymbolic
      symbolicCol2Col ?= (result :: Nil)
      got ?= (result :: Nil)
      gotSymbolic ?= (resultSymbolic :: Nil)
    }

    check(prop[Byte] _)
    check(prop[Short] _)
    check(prop[Int] _)
    check(prop[Long] _)
  }

  test("bitwiseXOR") {
    def prop[A: TypedEncoder: CatalystBitwise](a: A, b: A)(
      implicit catalystBitwise4Tests: CatalystBitwise4Tests[A]
    ): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      val result = implicitly[CatalystBitwise4Tests[A]].bitwiseXor(a, b)
      val resultSymbolic = implicitly[CatalystBitwise4Tests[A]].^(a, b)
      result ?= resultSymbolic
      val got = df.select(df.col('a) bitwiseXOR df.col('b)).collect().run()
      val gotSymbolic = df.select(df.col('a) ^ b).collect().run()
      val zeroes = df.select(df.col('a) ^ df.col('a)).collect().run()
      zeroes should contain theSameElementsAs Seq.fill[Long](gotSymbolic.size)(0L)
      got ?= (result :: Nil)
      gotSymbolic ?= (resultSymbolic :: Nil)
    }

    check(prop[Byte] _)
    check(prop[Short] _)
    check(prop[Int] _)
    check(prop[Long] _)
  }
}


================================================
FILE: dataset/src/test/scala/frameless/CastTests.scala
================================================
package frameless

import org.scalacheck.{Arbitrary, Gen, Prop}
import org.scalacheck.Prop._

class CastTests extends TypedDatasetSuite {

  def prop[A: TypedEncoder, B: TypedEncoder](f: A => B)(a: A)(
    implicit
    cast: CatalystCast[A, B]
  ): Prop = {
    val df = TypedDataset.create(X1(a) :: Nil)
    val got = df.select(df.col('a).cast[B]).collect().run()

    got ?= (f(a) :: Nil)
  }

  test("cast") {
    // numericToDecimal
    check(prop[BigDecimal, BigDecimal](identity) _)
    check(prop[Byte, BigDecimal](x => BigDecimal.valueOf(x.toLong)) _)
    check(prop[Double, BigDecimal](BigDecimal.valueOf) _)
    check(prop[Int, BigDecimal](x => BigDecimal.valueOf(x.toLong)) _)
    check(prop[Long, BigDecimal](BigDecimal.valueOf) _)
    check(prop[Short, BigDecimal](x => BigDecimal.valueOf(x.toLong)) _)

    // numericToByte
    check(prop[BigDecimal, Byte](_.toByte) _)
    check(prop[Byte, Byte](identity) _)
    check(prop[Double, Byte](_.toByte) _)
    check(prop[Int, Byte](_.toByte) _)
    check(prop[Long, Byte](_.toByte) _)
    check(prop[Short, Byte](_.toByte) _)

    // numericToDouble
    check(prop[BigDecimal, Double](_.toDouble) _)
    check(prop[Byte, Double](_.toDouble) _)
    check(prop[Double, Double](identity) _)
    check(prop[Int, Double](_.toDouble) _)
    check(prop[Long, Double](_.toDouble) _)
    check(prop[Short, Double](_.toDouble) _)

    // numericToInt
    check(prop[BigDecimal, Int](_.toInt) _)
    check(prop[Byte, Int](_.toInt) _)
    check(prop[Double, Int](_.toInt) _)
    check(prop[Int, Int](identity) _)
    check(prop[Long, Int](_.toInt) _)
    check(prop[Short, Int](_.toInt) _)

    // numericToLong
    check(prop[BigDecimal, Long](_.toLong) _)
    check(prop[Byte, Long](_.toLong) _)
    check(prop[Double, Long](_.toLong) _)
    check(prop[Int, Long](_.toLong) _)
    check(prop[Long, Long](identity) _)
    check(prop[Short, Long](_.toLong) _)

    // numericToShort
    check(prop[BigDecimal, Short](_.toShort) _)
    check(prop[Byte, Short](_.toShort) _)
    check(prop[Double, Short](_.toShort) _)
    check(prop[Int, Short](_.toShort) _)
    check(prop[Long, Short](_.toShort) _)
    check(prop[Short, Short](identity) _)

    // castToString
    // TODO compare without trailing zeros
    // check(prop[BigDecimal, String](_.toString()) _)
    check(prop[Byte, String](_.toString) _)
    check(prop[Double, String](_.toString) _)
    check(prop[Int, String](_.toString) _)
    check(prop[Long, String](_.toString) _)
    check(prop[Short, String](_.toString) _)

    // stringToBoolean
    val trueStrings = Set("t", "true", "y", "yes", "1")
    val falseStrings = Set("f", "false", "n", "no", "0")

    def stringToBoolean(str: String): Option[Boolean] = {
      if (trueStrings(str)) Some(true)
      else if (falseStrings(str)) Some(false)
      else None
    }

    val stringToBooleanGen = Gen.oneOf(
      Gen.oneOf(trueStrings.toSeq),
      Gen.oneOf(falseStrings.toSeq),
      Arbitrary.arbitrary[String]
    )

    check(forAll(stringToBooleanGen)(prop(stringToBoolean)))

    // xxxToBoolean
    check(prop[BigDecimal, Boolean](_ != BigDecimal(0)) _)
    check(prop[Byte, Boolean](_ != 0) _)
    check(prop[Double, Boolean](_ != 0) _)
    check(prop[Int, Boolean](_ != 0) _)
    check(prop[Long, Boolean](_ != 0L) _)
    check(prop[Short, Boolean](_ != 0) _)

    // booleanToNumeric
    check(prop[Boolean, BigDecimal](x => if (x) BigDecimal(1) else BigDecimal(0)) _)
    check(prop[Boolean, Byte](x => if (x) 1 else 0) _)
    check(prop[Boolean, Double](x => if (x) 1.0f else 0.0f) _)
    check(prop[Boolean, Int](x => if (x) 1 else 0) _)
    check(prop[Boolean, Long](x => if (x) 1L else 0L) _)
    check(prop[Boolean, Short](x => if (x) 1 else 0) _)
  }

}


================================================
FILE: dataset/src/test/scala/frameless/ColTests.scala
================================================
package frameless

import shapeless.test.illTyped

import org.scalacheck.Prop
import org.scalacheck.Prop._

class ColTests extends TypedDatasetSuite {
  test("col") {
    val x4 = TypedDataset.create[X4[Int, String, Long, Boolean]](Nil)
    val t4 = TypedDataset.create[(Int, String, Long, Boolean)](Nil)

    x4.col('a)
    t4.col('_1)

    x4.col[Int]('a)
    t4.col[Int]('_1)

    illTyped("x4.col[String]('a)", "No column .* of type String in frameless.X4.*")

    x4.col('b)
    t4.col('_2)

    x4.col[String]('b)
    t4.col[String]('_2)

    illTyped("x4.col[Int]('b)", "No column .* of type Int in frameless.X4.*")

    ()
  }

  test("colMany") {
    type X2X2 = X2[X2[Int, String], X2[Long, Boolean]]
    val x2x2 = TypedDataset.create[X2X2](Nil)

    val aa: TypedColumn[X2X2, Int] = x2x2.colMany('a, 'a)
    val ab: TypedColumn[X2X2, String] = x2x2.colMany('a, 'b)
    val ba: TypedColumn[X2X2, Long] = x2x2.colMany('b, 'a)
    val bb: TypedColumn[X2X2, Boolean] = x2x2.colMany('b, 'b)

    illTyped("x2x2.colMany('a, 'c)")
    illTyped("x2x2.colMany('a, 'a, 'a)")
  }

  test("select colMany") {
    def prop[A: TypedEncoder](x: X2[X2[A, A], A]): Prop = {
      val df = TypedDataset.create(x :: Nil)
      val got = df.select(df.colMany('a, 'a)).collect().run()

      got ?= (x.a.a :: Nil)
    }

    check(prop[Int] _)
    check(prop[X2[Int, Int]] _)
    check(prop[X2[X2[Int, Int], Int]] _)
  }
}


================================================
FILE: dataset/src/test/scala/frameless/CollectTests.scala
================================================
package frameless

import frameless.CollectTests.{ prop, propArray }
import org.apache.spark.sql.SparkSession
import org.scalacheck.Prop
import org.scalacheck.Prop._
import scala.reflect.ClassTag

class CollectTests extends TypedDatasetSuite {
  test("collect()") {
    check(forAll(propArray[Int] _))
    check(forAll(propArray[Long] _))
    check(forAll(propArray[Boolean] _))
    check(forAll(propArray[Float] _))
    check(forAll(propArray[String] _))
    check(forAll(propArray[Byte] _))
    check(forAll(propArray[Option[Int]] _))
    check(forAll(propArray[Option[Long]] _))
    check(forAll(propArray[Option[Double]] _))
    check(forAll(propArray[Option[Float]] _))
    check(forAll(propArray[Option[Short]] _))
    check(forAll(propArray[Option[Byte]] _))
    check(forAll(propArray[Option[Boolean]] _))
    check(forAll(propArray[Option[String]] _))

    check(forAll(prop[X2[Int, Int]] _))
    check(forAll(prop[X2[String, String]] _))
    check(forAll(prop[X2[String, Int]] _))
    check(forAll(prop[X2[Long, Int]] _))

    check(forAll(prop[X2[X2[Int, String], Boolean]] _))
    check(forAll(prop[Tuple1[Option[Int]]] _))

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[Float] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Char] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Boolean] _))
    check(forAll(prop[String] _))
    check(forAll(prop[SQLDate] _))
    check(forAll(prop[SQLTimestamp] _))
    check(forAll(prop[Option[Int]] _))
    check(forAll(prop[Option[Long]] _))
    check(forAll(prop[Option[Double]] _))
    check(forAll(prop[Option[Float]] _))
    check(forAll(prop[Option[Short]] _))
    check(forAll(prop[Option[Byte]] _))
    check(forAll(prop[Option[Boolean]] _))
    check(forAll(prop[Option[String]] _))
    check(forAll(prop[Option[SQLDate]] _))
    check(forAll(prop[Option[SQLTimestamp]] _))

    check(forAll(prop[Vector[Int]] _))
    check(forAll(prop[List[Int]] _))
    check(forAll(prop[Seq[Int]] _))
    check(forAll(prop[Vector[Char]] _))
    check(forAll(prop[List[Char]] _))
    check(forAll(prop[Seq[Char]] _))
    check(forAll(prop[Set[Char]] _))
    check(forAll(prop[Seq[Seq[Seq[Char]]]] _))
    check(forAll(prop[Seq[Option[String]]] _))
    check(forAll(prop[Seq[Map[String, Long]]] _))
    check(forAll(prop[Seq[Map[String, X2[Option[Long], Vector[Boolean]]]]] _))
    check(forAll(prop[Option[Int]] _))
    check(forAll(prop[Vector[X2[Int, Int]]] _))

    check(forAll(prop[X1[Vector[Food]]] _))
    check(forAll(prop[X1[Vector[X1[Food]]]] _))
    check(forAll(prop[X1[Vector[X1[Int]]]] _))

    // TODO this doesn't work, and never worked...
    // check(forAll(prop[X1[Option[X1[Option[Int]]]]] _))

    check(forAll(prop[UdtEncodedClass] _))
    check(forAll(prop[Option[UdtEncodedClass]] _))
    check(forAll(prop[X1[UdtEncodedClass]] _))
    check(forAll(prop[X2[Int, UdtEncodedClass]] _))
    check(forAll(prop[(Long, UdtEncodedClass)] _))
  }
}

object CollectTests {
  import frameless.syntax._

  def prop[A: TypedEncoder : ClassTag](data: Vector[A])(implicit c: SparkSession): Prop =
    TypedDataset.create(data).collect().run().toVector ?= data

  def propArray[A: TypedEncoder : ClassTag](data: Vector[X1[Array[A]]])(implicit c: SparkSession): Prop =
    Prop(TypedDataset.create(data).collect().run().toVector.zip(data).forall {
      case (X1(l), X1(r)) => l.sameElements(r)
    })
}


================================================
FILE: dataset/src/test/scala/frameless/ColumnTests.scala
================================================
package frameless

import java.util.Date
import java.math.BigInteger

import java.time.{ Instant, LocalDate, Period, Duration }
import java.time.temporal.ChronoUnit

import java.sql.{ Date => SqlDate, Timestamp }

import scala.math.Ordering.Implicits._
import scala.util.Try

import org.scalacheck.{ Arbitrary, Gen, Prop }, Arbitrary.arbitrary, Prop._

import org.scalatest.matchers.should.Matchers

import shapeless.test.illTyped

final class ColumnTests extends TypedDatasetSuite with Matchers {

  implicit val timestampArb: Arbitrary[Timestamp] = Arbitrary {
    OrderingImplicits.arbInstant.arbitrary.map { i =>
      Timestamp from i.truncatedTo(ChronoUnit.MILLIS)
    }
  }

  implicit val dateArb: Arbitrary[Date] = Arbitrary {
    OrderingImplicits.arbInstant.arbitrary.map(Date from _)
  }

  private implicit object OrderingImplicits {
    implicit val sqlDateOrdering: Ordering[SQLDate] = Ordering.by(_.days)

    implicit val sqlTimestmapOrdering: Ordering[SQLTimestamp] =
      Ordering.by(_.us)

    implicit val periodOrdering: Ordering[Period] =
      Ordering.by(p => (p.getYears, p.getMonths, p.getDays))

    /**
     * DateTimeUtils.instantToMicros supports dates starting 1970-01-01T00:00:00Z, which is Instant.EPOCH.
     * This function also overflows on Instant.MAX, to be sure it never overflows we use Instant.MAX / 4.
     * For implementation details check the org.apache.spark.sql.catalyst.util.DateTimeUtils.instantToMicros function details.
     */
    val genInstant = Gen.choose[Instant](
      Instant.EPOCH,
      Instant.ofEpochMilli(Instant.MAX.getEpochSecond / 4)
    )
    implicit val arbInstant: Arbitrary[Instant] = Arbitrary(genInstant)

    implicit val arbDuration: Arbitrary[Duration] = Arbitrary(
      genInstant.map(i => Duration.ofMillis(i.toEpochMilli))
    )

    implicit val arbPeriod: Arbitrary[Period] = Arbitrary(
      Gen.chooseNum(0, Int.MaxValue).map(l => Period.of(l, l, l))
    )
  }

  test("select('a < 'b, 'a <= 'b, 'a > 'b, 'a >= 'b)") {
    import OrderingImplicits._

    def prop[A: TypedEncoder: CatalystOrdered: Ordering](a: A, b: A): Prop = {
      val dataset = TypedDataset.create(X2(a, b) :: Nil)
      val A = dataset.col('a)
      val B = dataset.col('b)

      val dataset2 = dataset
        .selectMany(
          A < B,
          A < b, // One test uses columns, other uses literals
          A <= B,
          A <= b,
          A > B,
          A > b,
          A >= B,
          A >= b
        )
        .collect()
        .run()
        .toVector

      dataset2 ?= Vector(
        (a < b, a < b, a <= b, a <= b, a > b, a > b, a >= b, a >= b)
      )
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Boolean] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Float] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[SQLDate] _))
    check(forAll(prop[SQLTimestamp] _))
    check(forAll(prop[String] _))
    check(forAll(prop[Instant] _))
    check(forAll(prop[Duration] _))
    check(forAll(prop[Period] _))
  }

  test("between") {
    import OrderingImplicits._
    def prop[A: TypedEncoder: CatalystOrdered: Ordering](
        a: A,
        b: A,
        c: A
      ): Prop = {
      val dataset = TypedDataset.create(X3(a, b, c) :: Nil)
      val A = dataset.col('a)
      val B = dataset.col('b)
      val C = dataset.col('c)

      val isBetweeen = dataset
        .selectMany(A.between(B, C), A.between(b, c))
        .collect()
        .run()
        .toVector
      val result = b <= a && a <= c

      isBetweeen ?= Vector((result, result))
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Boolean] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Float] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[SQLDate] _))
    check(forAll(prop[SQLTimestamp] _))
    check(forAll(prop[String] _))
    check(forAll(prop[Instant] _))
    check(forAll(prop[Duration] _))
    check(forAll(prop[Period] _))
  }

  test("toString") {
    val t = TypedDataset.create((1, 2) :: Nil)
    t('_1).toString ?= t.dataset.col("_1").toString()
  }

  test("boolean and / or") {
    val spark = session
    import spark.implicits._

    check {
      forAll { (s: Seq[X3[Boolean, Boolean, Boolean]]) =>
        val ds = TypedDataset.create(s)

        val typedBoolean = ds
          .select(
            ds('a) && ds('b) || ds('c),
            ds('a).and(ds('b)).or(ds('c))
          )
          .collect()
          .run()
          .toList

        val untypedDs = ds.toDF()
        val untypedBoolean = untypedDs
          .select(
            untypedDs("a") && untypedDs("b") || untypedDs("c"),
            untypedDs("a").and(untypedDs("b")).or(untypedDs("c"))
          )
          .as[(Boolean, Boolean)]
          .collect()
          .toList

        typedBoolean ?= untypedBoolean
      }
    }
  }

  test("substr") {
    val spark = session
    import spark.implicits._

    check {
      forAll { (a: String, b: Int, c: Int) =>
        val ds = TypedDataset.create(X3(a, b, c) :: Nil)

        val typedSubstr =
          ds.select(ds('a).substr(ds('b), ds('c))).collect().run().toList

        val untypedDs = ds.toDF()
        val untypedSubstr = untypedDs
          .select(untypedDs("a").substr(untypedDs("b"), untypedDs("c")))
          .as[String]
          .collect()
          .toList

        typedSubstr ?= untypedSubstr
      }
    }

    check {
      forAll { (a: String, b: Int, c: Int) =>
        val ds = TypedDataset.create(X1(a) :: Nil)

        val typedSubstr = ds.select(ds('a).substr(b, c)).collect().run().toList

        val untypedDs = ds.toDF()
        val untypedSubstr = untypedDs
          .select(untypedDs("a").substr(b, c))
          .as[String]
          .collect()
          .toList

        typedSubstr ?= untypedSubstr
      }
    }

    val ds1 = TypedDataset.create((1, false, 2.0) :: Nil)
    illTyped("""ds1.select(ds1('_1).substr(0, 5))""")
    illTyped("""ds1.select(ds1('_2).substr(0, 5))""")
    illTyped("""ds1.select(ds1('_3).substr(0, 5))""")
    illTyped("""ds1.select(ds1('_1).substr(ds1('_2), ds1('_3)))""")
  }

  test("like") {
    val spark = session
    import spark.implicits._

    check {
      forAll { (a: String, b: String) =>
        val ds = TypedDataset.create(X2(a, b) :: Nil)

        val typedLike =
          ds.select(ds('a).like(a), ds('b).like(a)).collect().run().toList

        val untypedDs = ds.toDF()
        val untypedLike = untypedDs
          .select(untypedDs("a").like(a), untypedDs("b").like(a))
          .as[(Boolean, Boolean)]
          .collect()
          .toList

        typedLike ?= untypedLike
      }
    }

    val ds = TypedDataset.create((1, false, 2.0) :: Nil)
    illTyped("""ds.select(ds('_1).like("foo"))""")
    illTyped("""ds.select(ds('_2).like("foo"))""")
    illTyped("""ds.select(ds('_3).like("foo"))""")
  }

  test("rlike") {
    val spark = session
    import spark.implicits._

    val regex = Gen.nonEmptyListOf(arbitrary[Char]).map(_.mkString).suchThat {
      str => Try(str.r).isSuccess
    }

    check {
      forAll(regex, arbitrary[String]) { (a, b) =>
        val ds = TypedDataset.create(X2(a, b) :: Nil)

        val typedLike = ds
          .select(ds('a).rlike(a), ds('b).rlike(a), ds('a).rlike(".*"))
          .collect()
          .run()
          .toList

        val untypedDs = ds.toDF()
        val untypedLike = untypedDs
          .select(
            untypedDs("a").rlike(a),
            untypedDs("b").rlike(a),
            untypedDs("a").rlike(".*")
          )
          .as[(Boolean, Boolean, Boolean)]
          .collect()
          .toList

        typedLike ?= untypedLike
      }
    }

    val ds = TypedDataset.create((1, false, 2.0) :: Nil)
    illTyped("""ds.select(ds('_1).rlike("foo"))""")
    illTyped("""ds.select(ds('_2).rlike("foo"))""")
    illTyped("""ds.select(ds('_3).rlike("foo"))""")
  }

  test("contains") {
    val spark = session
    import spark.implicits._

    check {
      forAll { (a: String, b: String) =>
        val ds = TypedDataset.create(X2(a, b) :: Nil)

        val typedContains = ds
          .select(ds('a).contains(ds('b)), ds('b).contains(a))
          .collect()
          .run()
          .toList

        val untypedDs = ds.toDF()
        val untypedContains = untypedDs
          .select(
            untypedDs("a").contains(untypedDs("b")),
            untypedDs("b").contains(a)
          )
          .as[(Boolean, Boolean)]
          .collect()
          .toList

        typedContains ?= untypedContains
      }
    }

    val ds = TypedDataset.create((1, false, 2.0) :: Nil)
    illTyped("""ds.select(ds('_1).contains("foo"))""")
    illTyped("""ds.select(ds('_2).contains("foo"))""")
    illTyped("""ds.select(ds('_3).contains("foo"))""")
  }

  test("startsWith") {
    val spark = session
    import spark.implicits._

    check {
      forAll { (a: String, b: String) =>
        val ds = TypedDataset.create(X2(a, b) :: Nil)

        val typedStartsWith = ds
          .select(ds('a).startsWith(ds('b)), ds('b).startsWith(a))
          .collect()
          .run()
          .toList

        val untypedDs = ds.toDF()
        val untypedStartsWith = untypedDs
          .select(
            untypedDs("a").startsWith(untypedDs("b")),
            untypedDs("b").startsWith(a)
          )
          .as[(Boolean, Boolean)]
          .collect()
          .toList

        typedStartsWith ?= untypedStartsWith
      }
    }

    val ds = TypedDataset.create((1, false, 2.0) :: Nil)
    illTyped("""ds.select(ds('_1).startsWith("foo"))""")
    illTyped("""ds.select(ds('_2).startsWith("foo"))""")
    illTyped("""ds.select(ds('_3).startsWith("foo"))""")
  }

  test("endsWith") {
    val spark = session
    import spark.implicits._

    check {
      forAll { (a: String, b: String) =>
        val ds = TypedDataset.create(X2(a, b) :: Nil)
        val typedStartsWith = ds
          .select(ds('a).endsWith(ds('b)), ds('b).endsWith(a))
          .collect()
          .run()
          .toList

        val untypedDs = ds.toDF()
        val untypedStartsWith = untypedDs
          .select(
            untypedDs("a").endsWith(untypedDs("b")),
            untypedDs("b").endsWith(a)
          )
          .as[(Boolean, Boolean)]
          .collect()
          .toList

        typedStartsWith ?= untypedStartsWith
      }
    }

    val ds = TypedDataset.create((1, false, 2.0) :: Nil)
    illTyped("""ds.select(ds('_1).endsWith("foo"))""")
    illTyped("""ds.select(ds('_2).endsWith("foo"))""")
    illTyped("""ds.select(ds('_3).endsWith("foo"))""")
  }

  test("getOrElse") {
    def prop[A: TypedEncoder](a: A, opt: Option[A]) = {
      val dataset = TypedDataset.create(X2(a, opt) :: Nil)

      val defaulted: (A, A) = dataset
        .select(dataset('b).getOrElse(dataset('a)), dataset('b).getOrElse(a))
        .collect()
        .run()
        .toList
        .head

      defaulted ?= (opt.getOrElse(a) -> opt.getOrElse(a))
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Boolean] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Float] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[SQLDate] _))
    check(forAll(prop[SQLTimestamp] _))
    check(forAll(prop[Date] _))
    check(forAll(prop[Timestamp] _))
    check(forAll(prop[String] _))

    // Scalacheck is too slow
    check(prop[BigInt](BigInt(Long.MaxValue).+(BigInt(Long.MaxValue)), None))
    check(prop[BigInt](BigInt("0"), Some(BigInt(Long.MaxValue))))
    check(
      prop[BigInt](
        BigInt(Long.MinValue).-(BigInt(Long.MinValue)),
        Some(BigInt("0"))
      )
    )

    check(
      prop[BigInteger](
        BigInteger
          .valueOf(Long.MaxValue)
          .add(BigInteger.valueOf(Long.MaxValue)),
        None
      )
    )

    check(
      prop[BigInteger](
        BigInteger.valueOf(0L),
        Some(BigInteger.valueOf(Long.MaxValue))
      )
    )

    check(
      prop[BigInteger](
        BigInteger
          .valueOf(Long.MinValue)
          .subtract(BigInteger.valueOf(Long.MinValue)),
        Some(BigInteger.valueOf(0L))
      )
    )
  }

  test("Consistency with Spark internal date/time representation") {
    val ts = Timestamp.from(Instant parse "1990-01-01T01:00:00.000Z")
    val date = Date.from(Instant parse "1991-01-01T02:00:00.000Z")

    val sqlDate = SqlDate.valueOf(LocalDate parse "1991-02-01")

    val input = Seq(X3(ts, date, sqlDate))

    val ds: TypedDataset[X3[Timestamp, Date, SqlDate]] =
      TypedDataset.create(input)

    val result1: Seq[(Timestamp, Date, SqlDate)] =
      ds.dataset.toDF
        .collect()
        .map { row =>
          Tuple3(
            row.getTimestamp(0),
            Date.from(row.getTimestamp(1).toInstant),
            row.getDate(2)
          )
        }
        .toSeq

    result1 shouldEqual Seq(Tuple3(ts, date, sqlDate))

    val result2: Seq[X3[Timestamp, Date, SqlDate]] =
      ds.collect.run().toSeq

    result2 shouldEqual input
  }

  test("asCol") {
    def prop[A: TypedEncoder, B: TypedEncoder](a: Seq[X2[A, B]]) = {
      val ds: TypedDataset[X2[A, B]] = TypedDataset.create(a)

      val frameless: Seq[(A, X2[A, B], X2[A, B], X2[A, B], B)] =
        ds.select(ds('a), ds.asCol, ds.asCol, ds.asCol, ds('b)).collect().run()

      val scala: Seq[(A, X2[A, B], X2[A, B], X2[A, B], B)] =
        a.map(x => (x.a, x, x, x, x.b))

      scala ?= frameless
    }

    check(forAll(prop[Int, Option[Long]] _))
    check(forAll(prop[Vector[Char], Option[Boolean]] _))
    check(forAll(prop[Vector[Vector[String]], Vector[Vector[BigDecimal]]] _))
  }

  test("asCol single column TypedDatasets") {
    def prop[A: TypedEncoder](a: Seq[A]) = {
      val ds: TypedDataset[A] = TypedDataset.create(a)

      val frameless: Seq[(A, A, A)] =
        ds.select(ds.asCol, ds.asCol, ds.asCol).collect().run()

      val scala: Seq[(A, A, A)] =
        a.map(x => (x, x, x))

      scala ?= frameless
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
    check(forAll(prop[Date] _))
    check(forAll(prop[Vector[Vector[String]]] _))
  }

  test("asCol with numeric operators") {
    def prop(a: Seq[Long]) = {
      val ds: TypedDataset[Long] = TypedDataset.create(a)
      val (first, second) = (2L, 5L)
      val frameless: Seq[(Long, Long, Long)] =
        ds.select(ds.asCol, ds.asCol + first, ds.asCol * second).collect().run()

      val scala: Seq[(Long, Long, Long)] =
        a.map(x => (x, x + first, x * second))

      scala ?= frameless
    }

    check(forAll(prop _))
  }

  test("reference Value class so can join on") {
    import RecordEncoderTests.{ Name, Person }

    val bar = new Name("bar")

    val ds1: TypedDataset[Person] =
      TypedDataset.create(Seq(Person(bar, 23), Person(new Name("foo"), 11)))

    val ds2: TypedDataset[Name] =
      TypedDataset.create(Seq(new Name("lorem"), bar))

    val joined = ds1.joinLeftSemi(ds2)(ds1.col('name) === ds2.asJoinColValue)

    joined.collect().run() shouldEqual Seq(Person(bar, 23))
  }

  test("unary_!") {
    val ds = TypedDataset.create((true, false) :: Nil)

    val rs = ds.select(!ds('_1), !ds('_2)).collect().run().head

    rs shouldEqual (false -> true)
  }

  test("unary_! with non-boolean columns should not compile") {
    val ds = TypedDataset.create((1, "a", 2.0) :: Nil)

    "ds.select(!ds('_1))" shouldNot typeCheck
    "ds.select(!ds('_2))" shouldNot typeCheck
    "ds.select(!ds('_3))" shouldNot typeCheck
  }

  test("opt") {
    val data = (Option(1L), Option(2L)) :: (None, None) :: Nil
    val ds = TypedDataset.create(data)
    val rs =
      ds.select(ds('_1).opt.map(_ * 2), ds('_1).opt.map(_ + 2)).collect().run()
    val expected = data.map { case (x, y) => (x.map(_ * 2), y.map(_ + 1)) }
    rs shouldEqual expected
  }

  test("opt compiles only for columns of type Option[_]") {
    val ds = TypedDataset.create((1, List(1, 2, 3)) :: Nil)
    "ds.select(ds('_1).opt.map(x => x))" shouldNot typeCheck
    "ds.select(ds('_2).opt.map(x => x))" shouldNot typeCheck
  }

  test("field") {
    val ds = TypedDataset.create((1, (2.3F, "a")) :: Nil)
    val rs = ds.select(ds('_2).field('_2)).collect().run()

    rs shouldEqual Seq("a")
  }

  test("field compiles only for valid field") {
    val ds = TypedDataset.create((1, (2.3F, "a")) :: Nil)

    "ds.select(ds('_2).field('_3))" shouldNot typeCheck
  }

  test("col through lambda") {
    case class MyClass1(a: Int, b: String, c: MyClass2)
    case class MyClass2(d: Long)

    val ds = TypedDataset.create(
      Seq(MyClass1(1, "2", MyClass2(3L)), MyClass1(4, "5", MyClass2(6L)))
    )

    assert(ds.col(_.a).isInstanceOf[TypedColumn[MyClass1, Int]])
    assert(ds.col(_.b).isInstanceOf[TypedColumn[MyClass1, String]])
    assert(ds.col(_.c.d).isInstanceOf[TypedColumn[MyClass1, Long]])

    "ds.col(_.c.toString)" shouldNot typeCheck
    "ds.col(_.c.toInt)" shouldNot typeCheck
    "ds.col(x => java.lang.Math.abs(x.a))" shouldNot typeCheck

    // we should be able to block the following as well...
    "ds.col(_.a.toInt)" shouldNot typeCheck
  }
}


================================================
FILE: dataset/src/test/scala/frameless/ColumnViaLambdaTests.scala
================================================
package frameless

import org.scalatest.matchers.should.Matchers
import shapeless.test.illTyped

case class MyClass1(a: Int, b: String, c: MyClass2, g: Option[MyClass4])
case class MyClass2(d: Long, e: MyClass3)
case class MyClass3(f: Double)
case class MyClass4(h: Boolean)

final class ColumnViaLambdaTests extends TypedDatasetSuite with Matchers {

  def ds = {
    TypedDataset.create(Seq(
      MyClass1(1, "2", MyClass2(3L, MyClass3(7.0D)), Some(MyClass4(true))),
      MyClass1(4, "5", MyClass2(6L, MyClass3(8.0D)), None)))
  }

  test("col(_.a)") {
    val col = TypedColumn[MyClass1, Int](_.a)

    ds.select(col).collect().run() shouldEqual Seq(1, 4)
  }

  test("col(x => x.a") {
    val col = TypedColumn[MyClass1, Int](x => x.a)

    ds.select(col).collect().run() shouldEqual Seq(1, 4)
  }

  test("col((x: MyClass1) => x.a") {
    val col = TypedColumn { (x: MyClass1) => x.a }

    ds.select(col).collect().run() shouldEqual Seq(1, 4)
  }

  test("col((x: MyClass1) => x.c.e.f") {
    val col = TypedColumn { (x: MyClass1) => x.c.e.f }

    ds.select(col).collect().run() shouldEqual Seq(7.0D, 8.0D)
  }

  test("col(_.c.d)") {
    val col = TypedColumn[MyClass1, Long](_.c.d)

    ds.select(col).collect().run() shouldEqual Seq(3L, 6L)
  }

  test("col(_.c.e.f)") {
    val col = TypedColumn[MyClass1, Double](_.c.e.f)

    ds.select(col).collect().run() shouldEqual Seq(7.0D, 8.0D)
  }

  test("col(_.c.d) as int does not compile (is long)") {
    illTyped("TypedColumn[MyClass1, Int](_.c.d)")
  }

  test("col(_.g.h does not compile") {
    val col = ds.col(_.g) // the path "ends" at .g (can't access h)
    illTyped("""ds.col(_.g.h)""")
  }

  test("col(_.a.toString) does not compile") {
    illTyped("""ds.col(_.a.toString)""")
  }

  test("col(_.a.toString.size) does not compile") {
    illTyped("""ds.col(_.a.toString.size)""")
  }

  test("col((x: MyClass1) => x.toString.size) does not compile") {
    illTyped("""ds.col((x: MyClass1) => x.toString.size)""")
  }

  test("col(x => java.lang.Math.abs(x.a)) does not compile") {
    illTyped("""col(x => java.lang.Math.abs(x.a))""")
  }
}


================================================
FILE: dataset/src/test/scala/frameless/CreateTests.scala
================================================
package frameless

import org.scalacheck.{Arbitrary, Prop}
import org.scalacheck.Prop._

import scala.reflect.ClassTag
import shapeless.test.illTyped
import org.scalatest.matchers.should.Matchers

class CreateTests extends TypedDatasetSuite with Matchers {

  import TypedEncoder.usingInjection

  test("creation using X4 derived DataFrames") {
    def prop[
    A: TypedEncoder,
    B: TypedEncoder,
    C: TypedEncoder,
    D: TypedEncoder](data: Vector[X4[A, B, C, D]]): Prop = {
      val ds = TypedDataset.create(data)
      TypedDataset.createUnsafe[X4[A, B, C, D]](ds.toDF()).collect().run() ?= data
    }

    check(forAll(prop[Int, Char, X2[Option[Country], Country], Int] _))
    check(forAll(prop[X2[Int, Int], Int, Boolean, Vector[Food]] _))
    check(forAll(prop[String, Food, X3[Food, Country, Boolean], Int] _))
    check(forAll(prop[String, Food, X3U[Food, Country, Boolean], Int] _))
    check(forAll(prop[
      Option[Vector[Food]],
      Vector[Vector[X2[Vector[(Person, X1[Char])], Country]]],
      X3[Food, Country, String],
      Vector[(Food, Country)]] _))
  }

  test("array fields") {
    def prop[T: Arbitrary: TypedEncoder: ClassTag] = forAll {
      (d1: Array[T], d2: Array[Option[T]], d3: Array[X1[T]], d4: Array[X1[Option[T]]],
        d5: X1[Array[T]]) =>
        TypedDataset.create(Seq(d1)).collect().run().head.sameElements(d1) &&
        TypedDataset.create(Seq(d2)).collect().run().head.sameElements(d2) &&
        TypedDataset.create(Seq(d3)).collect().run().head.sameElements(d3) &&
        TypedDataset.create(Seq(d4)).collect().run().head.sameElements(d4) &&
        TypedDataset.create(Seq(d5)).collect().run().head.a.sameElements(d5.a)
    }

    check(prop[Boolean])
    check(prop[Byte])
    check(prop[Short])
    check(prop[Int])
    check(prop[Long])
    check(prop[Float])
    check(prop[Double])
    check(prop[String])
  }

  test("vector fields") {
    def prop[T: Arbitrary: TypedEncoder] = forAll {
      (d1: Vector[T], d2: Vector[Option[T]], d3: Vector[X1[T]], d4: Vector[X1[Option[T]]],
        d5: X1[Vector[T]]) =>
      (TypedDataset.create(Seq(d1)).collect().run().head ?= d1) &&
      (TypedDataset.create(Seq(d2)).collect().run().head ?= d2) &&
      (TypedDataset.create(Seq(d3)).collect().run().head ?= d3) &&
      (TypedDataset.create(Seq(d4)).collect().run().head ?= d4) &&
      (TypedDataset.create(Seq(d5)).collect().run().head ?= d5)
    }

    check(prop[Boolean])
    check(prop[Byte])
    check(prop[Char])
    check(prop[Short])
    check(prop[Int])
    check(prop[Long])
    check(prop[Float])
    check(prop[Double])
    check(prop[String])
  }

  test("list fields") {
    def prop[T: Arbitrary: TypedEncoder] = forAll {
      (d1: List[T], d2: List[Option[T]], d3: List[X1[T]], d4: List[X1[Option[T]]],
        d5: X1[List[T]]) =>
      (TypedDataset.create(Seq(d1)).collect().run().head ?= d1) &&
        (TypedDataset.create(Seq(d2)).collect().run().head ?= d2) &&
        (TypedDataset.create(Seq(d3)).collect().run().head ?= d3) &&
        (TypedDataset.create(Seq(d4)).collect().run().head ?= d4) &&
        (TypedDataset.create(Seq(d5)).collect().run().head ?= d5)
    }

    check(prop[Boolean])
    check(prop[Byte])
    check(prop[Char])
    check(prop[Short])
    check(prop[Int])
    check(prop[Long])
    check(prop[Float])
    check(prop[Double])
    check(prop[String])
  }

  test("Map fields (scala.Predef.Map / scala.collection.immutable.Map)") {
    def prop[A: Arbitrary: NotCatalystNullable: TypedEncoder, B: Arbitrary: NotCatalystNullable: TypedEncoder] = forAll {
      (d1: Map[A, B], d2: Map[B, A], d3: Map[A, Option[B]],
        d4: Map[A, X1[B]], d5: Map[X1[A], B], d6: Map[X1[A], X1[B]]) =>

      (TypedDataset.create(Seq(d1)).collect().run().head ?= d1) &&
      (TypedDataset.create(Seq(d2)).collect().run().head ?= d2) &&
      (TypedDataset.create(Seq(d3)).collect().run().head ?= d3) &&
      (TypedDataset.create(Seq(d4)).collect().run().head ?= d4) &&
      (TypedDataset.create(Seq(d5)).collect().run().head ?= d5) &&
      (TypedDataset.create(Seq(d6)).collect().run().head ?= d6)
    }

    check(prop[String, String])
    check(prop[String, Boolean])
    check(prop[String, Byte])
    check(prop[String, Char])
    check(prop[String, Short])
    check(prop[String, Int])
    check(prop[String, Long])
    check(prop[String, Float])
    check(prop[String, Double])
  }

  test("maps with Option keys should not resolve the TypedEncoder") {
    val data: Seq[Map[Option[Int], Int]] = Seq(Map(Some(5) -> 5))
    illTyped("TypedDataset.create(data)", ".*could not find implicit value for parameter encoder.*")
  }

  test("not aligned columns should throw an exception") {
    val v = Vector(X2(1,2))
    val df = TypedDataset.create(v).dataset.toDF()

    a [IllegalStateException] should be thrownBy {
      TypedDataset.createUnsafe[X1[Int]](df).show().run()
    }
  }

  test("dataset with different column order") {
    // e.g. when loading data from partitioned dataset
    // the partition columns get appended to the end of the underlying relation
    def prop[A: Arbitrary: TypedEncoder, B: Arbitrary: TypedEncoder] = forAll {
      (a1: A, b1: B) => {
        val ds = TypedDataset.create(
          Vector((b1, a1))
        ).dataset.toDF("b", "a").as[X2[A, B]](TypedExpressionEncoder[X2[A, B]])
        TypedDataset.create(ds).collect().run().head ?= X2(a1, b1)

      }
    }
    check(prop[X1[Double], X1[X1[SQLDate]]])
    check(prop[String, Int])
  }
}


================================================
FILE: dataset/src/test/scala/frameless/DropTest.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import shapeless.test.illTyped

class DropTest extends TypedDatasetSuite {
  import DropTest._

  test("fail to compile on missing value") {
    val f: TypedDataset[X] = TypedDataset.create(X(1, 1, false) :: X(1, 1, false) :: X(1, 10, false) :: Nil)
    illTyped {
      """val fNew: TypedDataset[XMissing] = f.drop[XMissing]('j)"""
    }
  }

  test("fail to compile on different column name") {
    val f: TypedDataset[X] = TypedDataset.create(X(1, 1, false) :: X(1, 1, false) :: X(1, 10, false) :: Nil)
    illTyped {
      """val fNew: TypedDataset[XDifferentColumnName] = f.drop[XDifferentColumnName]('j)"""
    }
  }

  test("fail to compile on added column name") {
    val f: TypedDataset[X] = TypedDataset.create(X(1, 1, false) :: X(1, 1, false) :: X(1, 10, false) :: Nil)
    illTyped {
      """val fNew: TypedDataset[XAdded] = f.drop[XAdded]('j)"""
    }
  }

  test("remove column in the middle") {
    val f: TypedDataset[X] = TypedDataset.create(X(1, 1, false) :: X(1, 1, false) :: X(1, 10, false) :: Nil)
    val fNew: TypedDataset[XGood] = f.drop[XGood]

    fNew.collect().run().foreach(xg => assert(xg === XGood(1, false)))
  }

  test("drop four columns") {
    def prop[A: TypedEncoder](value: A): Prop = {
      val d5 = TypedDataset.create(X5(value, value, value, value, value) :: Nil)
      val d4 = d5.drop[X4[A, A, A, A]]
      val d3 = d4.drop[X3[A, A, A]]
      val d2 = d3.drop[X2[A, A]]
      val d1 = d2.drop[X1[A]]

      X1(value) ?= d1.collect().run().head
    }

    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[String] _)
    check(prop[SQLDate] _)
    check(prop[Option[X1[Boolean]]] _)
  }
}

object DropTest {
  case class X(i: Int, j: Int, k: Boolean)
  case class XMissing(i: Int)
  case class XDifferentColumnName(ij: Int, k: Boolean)
  case class XAdded(i: Int, j: Int, k: Boolean, l: Int)
  case class XGood(i: Int, k: Boolean)
}


================================================
FILE: dataset/src/test/scala/frameless/DropTupledTest.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

class DropTupledTest extends TypedDatasetSuite {
  test("drop five columns") {
    def prop[A: TypedEncoder](value: A): Prop = {
      val d5 = TypedDataset.create(X5(value, value, value, value, value) :: Nil)
      val d4 = d5.dropTupled('a) //drops first column
      val d3 = d4.dropTupled('_4) //drops last column
      val d2 = d3.dropTupled('_2) //drops middle column
      val d1 = d2.dropTupled('_2)

      Tuple1(value) ?= d1.collect().run().head
    }

    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[String] _)
    check(prop[SQLDate] _)
    check(prop[Option[X1[Boolean]]] _)
  }

  test("drop first column") {
    def prop[A: TypedEncoder](value: A): Prop = {
      val d3 = TypedDataset.create(X3(value, value, value) :: Nil)
      val d2 = d3.dropTupled('a)

      (value, value) ?= d2.collect().run().head
    }

    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[String] _)
    check(prop[SQLDate] _)
    check(prop[Option[X1[Boolean]]] _)
  }

  test("drop middle column") {
    def prop[A: TypedEncoder](value: A): Prop = {
      val d3 = TypedDataset.create(X3(value, value, value) :: Nil)
      val d2 = d3.dropTupled('b)

      (value, value) ?= d2.collect().run().head
    }

    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[String] _)
    check(prop[SQLDate] _)
    check(prop[Option[X1[Boolean]]] _)
  }

  test("drop last column") {
    def prop[A: TypedEncoder](value: A): Prop = {
      val d3 = TypedDataset.create(X3(value, value, value) :: Nil)
      val d2 = d3.dropTupled('c)

      (value, value) ?= d2.collect().run().head
    }

    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[String] _)
    check(prop[SQLDate] _)
    check(prop[Option[X1[Boolean]]] _)
  }
}


================================================
FILE: dataset/src/test/scala/frameless/EncoderTests.scala
================================================
package frameless

import scala.collection.immutable.Set

import org.scalatest.matchers.should.Matchers

object EncoderTests {
  case class Foo(s: Seq[(Int, Int)])
  case class Bar(s: Set[(Int, Int)])
  case class InstantRow(i: java.time.Instant)
  case class DurationRow(d: java.time.Duration)
  case class PeriodRow(p: java.time.Period)
}

class EncoderTests extends TypedDatasetSuite with Matchers {
  import EncoderTests._

  test("It should encode deeply nested collections") {
    implicitly[TypedEncoder[Seq[Foo]]]
    implicitly[TypedEncoder[Seq[Bar]]]
    implicitly[TypedEncoder[Set[Foo]]]
  }

  test("It should encode java.time.Instant") {
    implicitly[TypedEncoder[InstantRow]]
  }

  test("It should encode java.time.Duration") {
    implicitly[TypedEncoder[DurationRow]]
  }

  test("It should encode java.time.Period") {
    implicitly[TypedEncoder[PeriodRow]]
  }
}


================================================
FILE: dataset/src/test/scala/frameless/ExplodeTests.scala
================================================
package frameless

import frameless.functions.CatalystExplodableCollection
import org.scalacheck.{Arbitrary, Prop}
import org.scalacheck.Prop.forAll
import org.scalacheck.Prop._

import scala.reflect.ClassTag

class ExplodeTests extends TypedDatasetSuite {
  test("simple explode test") {
    val ds = TypedDataset.create(Seq((1,Array(1,2))))
    ds.explode('_2): TypedDataset[(Int,Int)]
  }

  test("explode on vectors/list/seq") {
    def prop[F[X] <: Traversable[X] : CatalystExplodableCollection, A: TypedEncoder](xs: List[X1[F[A]]])(implicit arb: Arbitrary[F[A]], enc: TypedEncoder[F[A]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.explode('a).collect().run().toVector
      val scalaResults = xs.flatMap(_.a).map(Tuple1(_)).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Vector, Long] _))
    check(forAll(prop[Seq, Int] _))
    check(forAll(prop[Vector, Char] _))
    check(forAll(prop[Vector, String] _))
    check(forAll(prop[List, Long] _))
    check(forAll(prop[List, Int] _))
    check(forAll(prop[List, Char] _))
    check(forAll(prop[List, String] _))
  }

  test("explode on arrays") {
    def prop[A: TypedEncoder: ClassTag](xs: List[X1[Array[A]]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.explode('a).collect().run().toVector
      val scalaResults = xs.flatMap(_.a).map(Tuple1(_)).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }

  test("explode on maps") {
    def prop[A: TypedEncoder: ClassTag, B: TypedEncoder: ClassTag](xs: List[X1[Map[A, B]]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.explodeMap('a).collect().run().toVector
      val scalaResults = xs.flatMap(_.a.toList).map(t => Tuple1(Tuple2(t._1, t._2))).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Long, String] _))
    check(forAll(prop[Int, Long] _))
    check(forAll(prop[String, Int] _))
  }

  test("explode on maps preserving other columns") {
    def prop[K: TypedEncoder: ClassTag, A: TypedEncoder: ClassTag, B: TypedEncoder: ClassTag](xs: List[X2[K, Map[A, B]]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.explodeMap('b).collect().run().toVector
      val scalaResults = xs.flatMap { x2 => x2.b.toList.map((x2.a, _)) }.toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Int, Long, String] _))
    check(forAll(prop[String, Int, Long] _))
    check(forAll(prop[Long, String, Int] _))
  }

  test("explode on maps making sure no key / value naming collision happens") {
    def prop[K: TypedEncoder: ClassTag, V: TypedEncoder: ClassTag, A: TypedEncoder: ClassTag, B: TypedEncoder: ClassTag](xs: List[X3KV[K, V, Map[A, B]]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.explodeMap('c).collect().run().toVector
      val scalaResults = xs.flatMap { x3 => x3.c.toList.map((x3.key, x3.value, _)) }.toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[String, Int, Long, String] _))
    check(forAll(prop[Long, String, Int, Long] _))
    check(forAll(prop[Int, Long, String, Int] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/FilterTests.scala
================================================
package frameless

import org.scalatest.matchers.should.Matchers

import org.scalacheck.Prop
import org.scalacheck.Prop._

final class FilterTests extends TypedDatasetSuite with Matchers {
  test("filter('a == lit(b))") {
    def prop[A: TypedEncoder](elem: A, data: Vector[X1[A]])(implicit ex1: TypedEncoder[X1[A]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col('a)

      val dataset2 = dataset.filter(A === elem).collect().run().toVector
      val data2 = data.filter(_.a == elem)

      dataset2 ?= data2
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }

  test("filter('a =!= lit(b))") {
    def prop[A: TypedEncoder](elem: A, data: Vector[X1[A]])(implicit ex1: TypedEncoder[X1[A]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col('a)

      val dataset2 = dataset.filter(A =!= elem).collect().run().toVector
      val data2 = data.filter(_.a != elem)

      dataset2 ?= data2
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
    check(forAll(prop[Char] _))
    check(forAll(prop[Boolean] _))
    check(forAll(prop[SQLTimestamp] _))
    check(forAll(prop[Vector[SQLTimestamp]] _))
  }

  test("filter('a =!= 'b)") {
    def prop[A: TypedEncoder](data: Vector[X2[A, A]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col('a)
      val B = dataset.col('b)

      val dataset2 = dataset.filter(A =!= B).collect().run().toVector
      val data2 = data.filter(x => x.a != x.b)

      dataset2 ?= data2
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
    check(forAll(prop[Char] _))
    check(forAll(prop[Boolean] _))
    check(forAll(prop[SQLTimestamp] _))
    check(forAll(prop[Vector[SQLTimestamp]] _))
  }

  test("filter('a =!= 'b") {
    def prop[A: TypedEncoder](elem: A, data: Vector[X2[A,A]]): Prop = {
      val dataset = TypedDataset.create(data)
      val cA = dataset.col('a)
      val cB = dataset.col('b)

      val dataset2 = dataset.filter(cA =!= cB).collect().run().toVector
      val data2 = data.filter(x => x.a != x.b )

      (dataset2 ?= data2).&&(dataset.filter(cA =!= cA).count().run() ?= 0)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
    check(forAll(prop[Char] _))
    check(forAll(prop[SQLTimestamp] _))
    check(forAll(prop[Vector[SQLTimestamp]] _))
  }

  test("filter with arithmetic expressions: addition") {
    check(forAll { (data: Vector[X1[Int]]) =>
      val ds = TypedDataset.create(data)
      val res = ds.filter((ds('a) + 1) === (ds('a) + 1)).collect().run().toVector
      res ?= data
    })
  }

  test("filter with values (not columns): addition") {
    check(forAll { (data: Vector[X1[Int]], const: Int) =>
      val ds = TypedDataset.create(data)
      val res = ds.filter(ds('a) > const).collect().run().toVector
      res ?= data.filter(_.a > const)
    })
  }

  test("filter with arithmetic expressions: multiplication") {
    val t = X1(1) :: X1(2) :: X1(3) :: Nil
    val tds: TypedDataset[X1[Int]] = TypedDataset.create(t)

    assert(tds.filter(tds('a) * 2 === 2).collect().run().toVector === Vector(X1(1)))
    assert(tds.filter(tds('a) * 3 === 3).collect().run().toVector === Vector(X1(1)))
  }

  test("Option equality/inequality for columns") {
    def prop[A <: Option[_] : TypedEncoder](a: A, b: A): Prop = {
      val data = X2(a, b) :: X2(a, a) :: Nil
      val dataset = TypedDataset.create(data)
      val A = dataset.col('a)
      val B = dataset.col('b)

      (data.filter(x => x.a == x.b).toSet ?= dataset.filter(A === B).collect().run().toSet).
        &&(data.filter(x => x.a != x.b).toSet ?= dataset.filter(A =!= B).collect().run().toSet).
        &&(data.filter(x => x.a == None).toSet ?= dataset.filter(A.isNone).collect().run().toSet).
        &&(data.filter(x => x.a == None).toSet ?= dataset.filter(A.isNotNone === false).collect().run().toSet)
    }

    check(forAll(prop[Option[Int]] _))
    check(forAll(prop[Option[Boolean]] _))
    check(forAll(prop[Option[SQLDate]] _))
    check(forAll(prop[Option[SQLTimestamp]] _))
    check(forAll(prop[Option[X1[String]]] _))
    check(forAll(prop[Option[X1[X1[String]]]] _))
    check(forAll(prop[Option[X1[X1[Vector[Option[Int]]]]]] _))
  }

  test("Option equality/inequality for lit") {
    def prop[A <: Option[_] : TypedEncoder](a: A, b: A, cLit: A): Prop = {
      val data = X2(a, b) :: X2(a, cLit) :: Nil
      val dataset = TypedDataset.create(data)
      val colA = dataset.col('a)

      (data.filter(x => x.a == cLit).toSet ?= dataset.filter(colA === cLit).collect().run().toSet).
        &&(data.filter(x => x.a != cLit).toSet ?= dataset.filter(colA =!= cLit).collect().run().toSet).
        &&(data.filter(x => x.a == None).toSet ?= dataset.filter(colA.isNone).collect().run().toSet).
        &&(data.filter(x => x.a == None).toSet ?= dataset.filter(colA.isNotNone === false).collect().run().toSet)
    }

    check(forAll(prop[Option[Int]] _))
    check(forAll(prop[Option[Boolean]] _))
    check(forAll(prop[Option[SQLDate]] _))
    check(forAll(prop[Option[SQLTimestamp]] _))
    check(forAll(prop[Option[String]] _))
    check(forAll(prop[Option[X1[String]]] _))
    check(forAll(prop[Option[X1[X1[String]]]] _))
    check(forAll(prop[Option[X1[X1[Vector[Option[Int]]]]]] _))
  }

  test("Option content filter") {
    val data = (Option(1L), Option(2L)) :: (Option(0L), Option(1L)) :: (None, None) :: Nil

    val ds = TypedDataset.create(data)

    val l = functions.lit[Long, (Option[Long], Option[Long])](0L)
    val exists = ds('_1).isSome[Long](_ <= l)
    val forall = ds('_1).isSomeOrNone[Long](_ <= l)

    ds.select(exists).collect().run() shouldEqual Seq(false, true, false)
    ds.select(forall).collect().run() shouldEqual Seq(false, true, true)

    ds.filter(exists).collect().run() shouldEqual Seq(Option(0L) -> Option(1L))

    ds.filter(forall).collect().run() shouldEqual Seq(
      Option(0L) -> Option(1L), (None -> None))
  }

  test("filter with isin values") {
    def prop[A: TypedEncoder](data: Vector[X1[A]], values: Vector[A])(implicit a : CatalystIsin[A]): Prop = {
      val ds = TypedDataset.create(data)
      val res = ds.filter(ds('a).isin(values:_*)).collect().run().toVector
      res ?= data.filter(d => values.contains(d.a))
    }

    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[Float] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/FlattenTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop.forAll
import org.scalacheck.Prop._


class FlattenTests extends TypedDatasetSuite {
  test("simple flatten test") {
    val ds: TypedDataset[(Int,Option[Int])] = TypedDataset.create(Seq((1,Option(1))))
    ds.flattenOption('_2): TypedDataset[(Int,Int)]
  }

  test("different Optional types") {
    def prop[A: TypedEncoder](xs: List[X1[Option[A]]]): Prop = {
      val tds: TypedDataset[X1[Option[A]]] = TypedDataset.create(xs)

      val framelessResults: Seq[Tuple1[A]] = tds.flattenOption('a).collect().run().toVector
      val scalaResults = xs.flatMap(_.a).map(Tuple1(_)).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Char] _))
    check(forAll(prop[String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/GroupByTests.scala
================================================
package frameless

import frameless.functions.aggregate._
import org.scalacheck.Prop
import org.scalacheck.Prop._

class GroupByTests extends TypedDatasetSuite {
  test("groupByMany('a).agg(sum('b))") {
    def prop[
      A: TypedEncoder : Ordering,
      B: TypedEncoder,
      Out: TypedEncoder : Numeric
    ](data: List[X2[A, B]])(
      implicit
      summable: CatalystSummable[B, Out],
      widen: B => Out
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val datasetSumByA = dataset.groupByMany(A).agg(sum(B)).collect().run.toVector.sortBy(_._1)
      val sumByA = data.groupBy(_.a).map { case (k, v) => k -> v.map(_.b).map(widen).sum }.toVector.sortBy(_._1)

      datasetSumByA ?= sumByA
    }

    check(forAll(prop[Int, Long, Long] _))
  }

  test("agg(sum('a))") {
    def prop[A: TypedEncoder : Numeric](data: List[X1[A]])(
      implicit
      summable: CatalystSummable[A, A]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val datasetSum = dataset.agg(sum(A)).collect().run().toVector
      val listSum = data.map(_.a).sum

      datasetSum ?= Vector(listSum)
    }

    check(forAll(prop[Long] _))
  }

  test("agg(sum('a), sum('b))") {
    def prop[
      A: TypedEncoder : Numeric,
      B: TypedEncoder : Numeric
    ](data: List[X2[A, B]])(
      implicit
      as: CatalystSummable[A, A],
      bs: CatalystSummable[B, B]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val datasetSum = dataset.agg(sum(A), sum(B)).collect().run().toVector
      val listSumA = data.map(_.a).sum
      val listSumB = data.map(_.b).sum

      datasetSum ?= Vector((listSumA, listSumB))
    }

    check(forAll(prop[Long, Long] _))
  }

  test("agg(sum('a), sum('b), sum('c))") {
    def prop[
    A: TypedEncoder : Numeric,
    B: TypedEncoder : Numeric,
    C: TypedEncoder : Numeric
    ](data: List[X3[A, B, C]])(
      implicit
      as: CatalystSummable[A, A],
      bs: CatalystSummable[B, B],
      cs: CatalystSummable[C, C]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val datasetSum = dataset.agg(sum(A), sum(B), sum(C)).collect().run().toVector
      val listSumA = data.map(_.a).sum
      val listSumB = data.map(_.b).sum
      val listSumC = data.map(_.c).sum

      datasetSum ?= Vector((listSumA, listSumB, listSumC))
    }

    check(forAll(prop[Long, Long, Long] _))
  }

  test("agg(sum('a), sum('b), min('c), max('d))") {
    def prop[
    A: TypedEncoder : Numeric,
    B: TypedEncoder : Numeric,
    C: TypedEncoder : Numeric,
    D: TypedEncoder : Numeric
    ](data: List[X4[A, B, C, D]])(
      implicit
      as: CatalystSummable[A, A],
      bs: CatalystSummable[B, B],
      co: CatalystOrdered[C],
      fo: CatalystOrdered[D]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)
      val D = dataset.col[D]('d)

      val datasetSum = dataset.agg(sum(A), sum(B), min(C), max(D)).collect().run().toVector
      val listSumA = data.map(_.a).sum
      val listSumB = data.map(_.b).sum
      val listMinC = if(data.isEmpty) implicitly[Numeric[C]].fromInt(0) else data.map(_.c).min
      val listMaxD = if(data.isEmpty) implicitly[Numeric[D]].fromInt(0) else data.map(_.d).max

      datasetSum ?= Vector(if (data.isEmpty) null else (listSumA, listSumB, listMinC, listMaxD))
    }

    check(forAll(prop[Long, Long, Long, Int] _))
    check(forAll(prop[Long, Long, Short, Short] _))
    check(forAll(prop[Long, Long, Double, BigDecimal] _))
  }

  test("groupBy('a).agg(sum('b))") {
    def prop[
      A: TypedEncoder : Ordering,
      B: TypedEncoder,
      Out: TypedEncoder : Numeric
    ](data: List[X2[A, B]])(
      implicit
      summable: CatalystSummable[B, Out],
      widen: B => Out
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val datasetSumByA = dataset.groupBy(A).agg(sum(B)).collect().run.toVector.sortBy(_._1)
      val sumByA = data.groupBy(_.a).mapValues(_.map(_.b).map(widen).sum).toVector.sortBy(_._1)

      datasetSumByA ?= sumByA
    }

    check(forAll(prop[Int, Long, Long] _))
  }

  test("groupBy('a).mapGroups('a, sum('b))") {
    def prop[
      A: TypedEncoder : Ordering,
      B: TypedEncoder : Numeric
    ](data: List[X2[A, B]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val datasetSumByA = dataset.groupBy(A)
        .deserialized.mapGroups { case (a, xs) => (a, xs.map(_.b).sum) }
        .collect().run().toVector.sortBy(_._1)
      val sumByA = data.groupBy(_.a).mapValues(_.map(_.b).sum).toVector.sortBy(_._1)

      datasetSumByA ?= sumByA
    }

    check(forAll(prop[Int, Long] _))
  }

  test("groupBy('a).agg(sum('b), sum('c)) to groupBy('a).agg(sum('a), sum('b), sum('a), sum('b), sum('a))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder,
    C: TypedEncoder,
    OutB: TypedEncoder : Numeric,
    OutC: TypedEncoder : Numeric
    ](data: List[X3[A, B, C]])(
      implicit
      summableB: CatalystSummable[B, OutB],
      summableC: CatalystSummable[C, OutC],
      widenb: B => OutB,
      widenc: C => OutC
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val framelessSumBC = dataset
        .groupBy(A)
        .agg(sum(B), sum(C))
        .collect().run.toVector.sortBy(_._1)

      val scalaSumBC = data.groupBy(_.a).mapValues { xs =>
        (xs.map(_.b).map(widenb).sum, xs.map(_.c).map(widenc).sum)
      }.toVector.map {
        case (a, (b, c)) => (a, b, c)
      }.sortBy(_._1)

      val framelessSumBCB = dataset
        .groupBy(A)
        .agg(sum(B), sum(C), sum(B))
        .collect().run.toVector.sortBy(_._1)

      val scalaSumBCB = data.groupBy(_.a).mapValues { xs =>
        (xs.map(_.b).map(widenb).sum, xs.map(_.c).map(widenc).sum, xs.map(_.b).map(widenb).sum)
      }.toVector.map {
        case (a, (b1, c, b2)) => (a, b1, c, b2)
      }.sortBy(_._1)

      val framelessSumBCBC = dataset
        .groupBy(A)
        .agg(sum(B), sum(C), sum(B), sum(C))
        .collect().run.toVector.sortBy(_._1)

      val scalaSumBCBC = data.groupBy(_.a).mapValues { xs =>
        (xs.map(_.b).map(widenb).sum, xs.map(_.c).map(widenc).sum, xs.map(_.b).map(widenb).sum, xs.map(_.c).map(widenc).sum)
      }.toVector.map {
        case (a, (b1, c1, b2, c2)) => (a, b1, c1, b2, c2)
      }.sortBy(_._1)

      val framelessSumBCBCB = dataset
        .groupBy(A)
        .agg(sum(B), sum(C), sum(B), sum(C), sum(B))
        .collect().run.toVector.sortBy(_._1)

      val scalaSumBCBCB = data.groupBy(_.a).mapValues { xs =>
        (xs.map(_.b).map(widenb).sum, xs.map(_.c).map(widenc).sum, xs.map(_.b).map(widenb).sum, xs.map(_.c).map(widenc).sum, xs.map(_.b).map(widenb).sum)
      }.toVector.map {
        case (a, (b1, c1, b2, c2, b3)) => (a, b1, c1, b2, c2, b3)
      }.sortBy(_._1)

      (framelessSumBC ?= scalaSumBC)
        .&&(framelessSumBCB ?= scalaSumBCB)
        .&&(framelessSumBCBC ?= scalaSumBCBC)
        .&&(framelessSumBCBCB ?= scalaSumBCBCB)
    }

    check(forAll(prop[String, Long, BigDecimal, Long, BigDecimal] _))
  }

  test("groupBy('a, 'b).agg(sum('c)) to groupBy('a, 'b).agg(sum('c),sum('c),sum('c),sum('c),sum('c))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder,
    OutC: TypedEncoder: Numeric
    ](data: List[X3[A, B, C]])(
      implicit
      summableC: CatalystSummable[C, OutC],
      widenc: C => OutC
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val framelessSumC = dataset
        .groupBy(A,B)
        .agg(sum(C))
        .collect().run.toVector.sortBy(x => (x._1,x._2))

      val scalaSumC = data.groupBy(x => (x.a,x.b)).mapValues { xs =>
        xs.map(_.c).map(widenc).sum
      }.toVector.map { case ((a, b), c) => (a, b, c) }.sortBy(x => (x._1,x._2))

      val framelessSumCC = dataset
        .groupBy(A,B)
        .agg(sum(C), sum(C))
        .collect().run.toVector.sortBy(x => (x._1,x._2))

      val scalaSumCC = data.groupBy(x => (x.a,x.b)).mapValues { xs =>
        val s = xs.map(_.c).map(widenc).sum; (s,s)
      }.toVector.map { case ((a, b), (c1, c2)) => (a, b, c1, c2) }.sortBy(x => (x._1,x._2))

      val framelessSumCCC = dataset
        .groupBy(A,B)
        .agg(sum(C), sum(C), sum(C))
        .collect().run.toVector.sortBy(x => (x._1,x._2))

      val scalaSumCCC = data.groupBy(x => (x.a,x.b)).mapValues { xs =>
        val s = xs.map(_.c).map(widenc).sum; (s,s,s)
      }.toVector.map { case ((a, b), (c1, c2, c3)) => (a, b, c1, c2, c3) }.sortBy(x => (x._1,x._2))

      val framelessSumCCCC = dataset
        .groupBy(A,B)
        .agg(sum(C), sum(C), sum(C), sum(C))
        .collect().run.toVector.sortBy(x => (x._1,x._2))

      val scalaSumCCCC = data.groupBy(x => (x.a,x.b)).mapValues { xs =>
        val s = xs.map(_.c).map(widenc).sum; (s,s,s,s)
      }.toVector.map { case ((a, b), (c1, c2, c3, c4)) => (a, b, c1, c2, c3, c4) }.sortBy(x => (x._1,x._2))

      val framelessSumCCCCC = dataset
        .groupBy(A,B)
        .agg(sum(C), sum(C), sum(C), sum(C), sum(C))
        .collect().run.toVector.sortBy(x => (x._1,x._2))

      val scalaSumCCCCC = data.groupBy(x => (x.a,x.b)).mapValues { xs =>
        val s = xs.map(_.c).map(widenc).sum; (s,s,s,s,s)
      }.toVector.map { case ((a, b), (c1, c2, c3, c4, c5)) => (a, b, c1, c2, c3, c4, c5) }.sortBy(x => (x._1,x._2))

      (framelessSumC ?= scalaSumC) &&
        (framelessSumCC ?= scalaSumCC) &&
        (framelessSumCCC ?= scalaSumCCC) &&
        (framelessSumCCCC ?= scalaSumCCCC) &&
        (framelessSumCCCCC ?= scalaSumCCCCC)
    }

    check(forAll(prop[String, Long, BigDecimal, BigDecimal] _))
  }

  test("groupBy('a, 'b).agg(sum('c), sum('d))") {
    def prop[
      A: TypedEncoder : Ordering,
      B: TypedEncoder : Ordering,
      C: TypedEncoder,
      D: TypedEncoder,
      OutC: TypedEncoder : Numeric,
      OutD: TypedEncoder : Numeric
    ](data: List[X4[A, B, C, D]])(
      implicit
      summableC: CatalystSummable[C, OutC],
      summableD: CatalystSummable[D, OutD],
      widenc: C => OutC,
      widend: D => OutD
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)
      val D = dataset.col[D]('d)

      val datasetSumByAB = dataset
        .groupBy(A, B)
        .agg(sum(C), sum(D))
        .collect().run.toVector.sortBy(x => (x._1, x._2))

      val sumByAB = data.groupBy(x => (x.a, x.b)).mapValues { xs =>
        (xs.map(_.c).map(widenc).sum, xs.map(_.d).map(widend).sum)
      }.toVector.map {
        case ((a, b), (c, d)) => (a, b, c, d)
      }.sortBy(x => (x._1, x._2))

      datasetSumByAB ?= sumByAB
    }

    check(forAll(prop[Byte, Int, Long, BigDecimal, Long, BigDecimal] _))
  }

  test("groupBy('a, 'b).mapGroups('a, 'b, sum('c))") {
    def prop[
      A: TypedEncoder : Ordering,
      B: TypedEncoder : Ordering,
      C: TypedEncoder : Numeric
    ](data: List[X3[A, B, C]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val datasetSumByAB = dataset
        .groupBy(A, B)
        .deserialized.mapGroups { case ((a, b), xs) => (a, b, xs.map(_.c).sum) }
        .collect().run().toVector.sortBy(x => (x._1, x._2))

      val sumByAB = data.groupBy(x => (x.a, x.b))
        .mapValues { xs => xs.map(_.c).sum }
        .toVector.map { case ((a, b), c) => (a, b, c) }.sortBy(x => (x._1, x._2))

      datasetSumByAB ?= sumByAB
    }

    check(forAll(prop[Byte, Int, Long] _))
  }

  test("groupBy('a).mapGroups(('a, toVector(('a, 'b))") {
    def prop[
      A: TypedEncoder: Ordering,
      B: TypedEncoder: Ordering
    ](data: Vector[X2[A, B]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val datasetGrouped = dataset
        .groupBy(A)
        .deserialized.mapGroups((a, xs) => (a, xs.toVector.sorted))
        .collect().run.toMap

      val dataGrouped = data.groupBy(_.a).map { case (k, v) => k -> v.sorted }

      datasetGrouped ?= dataGrouped
    }

    check(forAll(prop[Short, Option[Short]] _))
    check(forAll(prop[Option[Short], Short] _))
    check(forAll(prop[X1[Option[Short]], Short] _))
  }

  test("groupBy('a).flatMapGroups(('a, toVector(('a, 'b))") {
    def prop[
      A: TypedEncoder : Ordering,
      B: TypedEncoder : Ordering
    ](data: Vector[X2[A, B]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val datasetGrouped = dataset
        .groupBy(A)
        .deserialized.flatMapGroups((a, xs) => xs.map(x => (a, x)))
        .collect().run
        .sorted

      val dataGrouped = data
        .groupBy(_.a).toSeq
        .flatMap { case (a, xs) => xs.map(x => (a, x)) }
        .sorted

      datasetGrouped ?= dataGrouped
    }

    check(forAll(prop[Short, Option[Short]] _))
    check(forAll(prop[Option[Short], Short] _))
    check(forAll(prop[X1[Option[Short]], Short] _))
  }

  test("groupBy('a, 'b).flatMapGroups((('a,'b) toVector((('a,'b), 'c))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder : Ordering
    ](data: Vector[X3[A, B, C]]): Prop = {
      val dataset = TypedDataset.create(data)
      val cA = dataset.col[A]('a)
      val cB = dataset.col[B]('b)

      val datasetGrouped = dataset
        .groupBy(cA, cB)
        .deserialized.flatMapGroups((a, xs) => xs.map(x => (a, x)))
        .collect().run()
        .sorted

      val dataGrouped = data
        .groupBy(t => (t.a,t.b)).toSeq
        .flatMap { case (a, xs) => xs.map(x => (a, x)) }
        .sorted

      datasetGrouped ?= dataGrouped
    }

    check(forAll(prop[Short, Option[Short], Long] _))
    check(forAll(prop[Option[Short], Short, Int] _))
    check(forAll(prop[X1[Option[Short]], Short, Byte] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/InjectionTests.scala
================================================
package frameless

import frameless.CollectTests.prop
import org.scalacheck._
import org.scalacheck.Prop._
import shapeless.test.illTyped

sealed trait Country
case object France extends Country
case object Russia extends Country

object Country {
  implicit val arbitrary: Arbitrary[Country] =
    Arbitrary(Arbitrary.arbitrary[Boolean].map(injection.invert))

  implicit val injection: Injection[Country, Boolean] =
    Injection(France.==, if (_) France else Russia)
}

sealed trait Food
case object Burger extends Food
case object Pasta extends Food
case object Rice extends Food

object Food {
  implicit val arbitrary: Arbitrary[Food] =
    Arbitrary(Arbitrary.arbitrary[Int].map(i => injection.invert(Math.abs(i % 3))))

  implicit val injection: Injection[Food, Int] =
    Injection(
      {
        case Burger => 0
        case Pasta => 1
        case Rice => 2
      },
      {
        case 0 => Burger
        case 1 => Pasta
        case 2 => Rice
      }
    )
}

// Supposingly coming from a java lib, shapeless can't derive stuff for this one :(
class LocalDateTime {
  var instant: Long = _

  override def equals(o: Any): Boolean =
    o.isInstanceOf[LocalDateTime] && o.asInstanceOf[LocalDateTime].instant == instant
}

object LocalDateTime {
  implicit val arbitrary: Arbitrary[LocalDateTime] =
    Arbitrary(Arbitrary.arbitrary[Long].map(injection.invert))

  implicit val injection: Injection[LocalDateTime, Long] =
    Injection(
      _.instant,
      long => { val ldt = new LocalDateTime; ldt.instant = long; ldt }
    )
}

case class Person(age: Int, name: String)

object Person {
  val tupled = (Person.apply _).tupled

  implicit val arbitrary: Arbitrary[Person] =
    Arbitrary(Arbitrary.arbTuple2[Int, String].arbitrary.map(tupled))

  implicit val injection: Injection[Person, (Int, String)] =
    Injection(p => unapply(p).get, tupled)
}

case class I[A](value: A)

object I {
  implicit def injection[A]: Injection[I[A], A] = Injection(_.value, I(_))
  implicit def typedEncoder[A: TypedEncoder]: TypedEncoder[I[A]] = TypedEncoder.usingInjection[I[A], A]
  implicit def arbitrary[A: Arbitrary]: Arbitrary[I[A]] = Arbitrary(Arbitrary.arbitrary[A].map(I(_)))
}

sealed trait Employee
case object Casual extends Employee
case object PartTime extends Employee
case object FullTime extends Employee

object Employee {
  implicit val arbitrary: Arbitrary[Employee] =
    Arbitrary(Gen.oneOf(Casual, PartTime, FullTime))
}

sealed trait Maybe
case object Nothing extends Maybe
case class Just(get: Int) extends Maybe

sealed trait Switch
object Switch {
  case object Off extends Switch
  case object On extends Switch

  implicit val arbitrary: Arbitrary[Switch] =
    Arbitrary(Gen.oneOf(Off, On))
}

sealed trait Pixel
case class Red() extends Pixel
case class Green() extends Pixel
case class Blue() extends Pixel

object Pixel {
  implicit val arbitrary: Arbitrary[Pixel] =
    Arbitrary(Gen.oneOf(Red(), Green(), Blue()))
}

sealed trait Connection[+A]
case object Closed extends Connection[Nothing]
case object Open extends Connection[Nothing]

object Connection {
  implicit def arbitrary[A]: Arbitrary[Connection[A]] =
    Arbitrary(Gen.oneOf(Closed, Open))
}

sealed abstract class Vehicle(colour: String)
case object Car extends Vehicle("red")
case object Bike extends Vehicle("blue")

object Vehicle {
  implicit val arbitrary: Arbitrary[Vehicle] =
    Arbitrary(Gen.oneOf(Car, Bike))
}

class InjectionTests extends TypedDatasetSuite {
  test("Injection based encoders") {
    check(forAll(prop[Country] _))
    check(forAll(prop[LocalDateTime] _))
    check(forAll(prop[Food] _))
    check(forAll(prop[X1[Country]] _))
    check(forAll(prop[X1[LocalDateTime]] _))
    check(forAll(prop[X1[Food]] _))
    check(forAll(prop[X1[X1[Country]]] _))
    check(forAll(prop[X1[X1[LocalDateTime]]] _))
    check(forAll(prop[X1[X1[Food]]] _))
    check(forAll(prop[X2[Country, X2[LocalDateTime, Food]]] _))
    check(forAll(prop[X3[Country, LocalDateTime, Food]] _))
    check(forAll(prop[X3U[Country, LocalDateTime, Food]] _))

    check(forAll(prop[I[Int]] _))
    check(forAll(prop[I[Option[Int]]] _))
    check(forAll(prop[I[I[Int]]] _))
    check(forAll(prop[I[I[Option[Int]]]] _))

    check(forAll(prop[I[X1[Int]]] _))
    check(forAll(prop[I[I[X1[Int]]]] _))
    check(forAll(prop[I[I[Option[X1[Int]]]]] _))

    check(forAll(prop[Option[I[Int]]] _))
    check(forAll(prop[Option[I[X1[Int]]]] _))

    assert(TypedEncoder[I[Int]].catalystRepr == TypedEncoder[Int].catalystRepr)
    assert(TypedEncoder[I[I[Int]]].catalystRepr == TypedEncoder[Int].catalystRepr)

    assert(TypedEncoder[I[Option[Int]]].nullable)
  }

  test("TypedEncoder[Person] is ambiguous") {
    illTyped("implicitly[TypedEncoder[Person]]", "ambiguous implicit values.*")
  }

  test("Resolve ambiguity by importing usingInjection") {
    import TypedEncoder.usingInjection

    check(forAll(prop[X1[Person]] _))
    check(forAll(prop[X1[X1[Person]]] _))
    check(forAll(prop[X2[Person, Person]] _))
    check(forAll(prop[Person] _))

    assert(TypedEncoder[Person].catalystRepr == TypedEncoder[(Int, String)].catalystRepr)
  }

  test("Resolve ambiguity by importing usingDerivation") {
    import TypedEncoder.usingDerivation
    assert(implicitly[TypedEncoder[Person]].isInstanceOf[RecordEncoder[Person, _, _]])
    check(forAll(prop[Person] _))
  }

  test("TypedEncoder[Employee] implicit is missing") {
    illTyped(
      "implicitly[TypedEncoder[Employee]]",
      "could not find implicit value for parameter e.*"
    )
  }

  test("Resolve missing implicit by deriving Injection instance") {
    import frameless.TypedEncoder.injections._

    check(forAll(prop[X1[Employee]] _))
    check(forAll(prop[X1[X1[Employee]]] _))
    check(forAll(prop[X2[Employee, Employee]] _))
    check(forAll(prop[Employee] _))

    assert(TypedEncoder[Employee].catalystRepr == TypedEncoder[String].catalystRepr)
  }

  test("TypedEncoder[Maybe] cannot be derived") {
    import frameless.TypedEncoder.injections._

    illTyped(
      "implicitly[TypedEncoder[Maybe]]",
      "could not find implicit value for parameter e.*"
    )
  }

  test("Derive encoder for type with data constructors defined in the companion object") {
    import frameless.TypedEncoder.injections._

    check(forAll(prop[X1[Switch]] _))
    check(forAll(prop[X1[X1[Switch]]] _))
    check(forAll(prop[X2[Switch, Switch]] _))
    check(forAll(prop[Switch] _))

    assert(TypedEncoder[Switch].catalystRepr == TypedEncoder[String].catalystRepr)
  }

  test("Derive encoder for type with data constructors defined as parameterless case classes") {
    import frameless.TypedEncoder.injections._

    check(forAll(prop[X1[Pixel]] _))
    check(forAll(prop[X1[X1[Pixel]]] _))
    check(forAll(prop[X2[Pixel, Pixel]] _))
    check(forAll(prop[Pixel] _))

    assert(TypedEncoder[Pixel].catalystRepr == TypedEncoder[String].catalystRepr)
  }

  test("Derive encoder for phantom type") {
    import frameless.TypedEncoder.injections._

    check(forAll(prop[X1[Connection[Int]]] _))
    check(forAll(prop[X1[X1[Connection[Int]]]] _))
    check(forAll(prop[X2[Connection[Int], Connection[Int]]] _))
    check(forAll(prop[Connection[Int]] _))

    assert(TypedEncoder[Connection[Int]].catalystRepr == TypedEncoder[String].catalystRepr)
  }

  test("Derive encoder for ADT with abstract class as the base type") {
    import frameless.TypedEncoder.injections._

    check(forAll(prop[X1[Vehicle]] _))
    check(forAll(prop[X1[X1[Vehicle]]] _))
    check(forAll(prop[X2[Vehicle, Vehicle]] _))
    check(forAll(prop[Vehicle] _))

    assert(TypedEncoder[Vehicle].catalystRepr == TypedEncoder[String].catalystRepr)
  }

  test("apply method of derived Injection instance produces the correct string") {
    import frameless.TypedEncoder.injections._

    assert(implicitly[Injection[Employee, String]].apply(Casual) === "Casual")
    assert(implicitly[Injection[Switch, String]].apply(Switch.On) === "On")
    assert(implicitly[Injection[Pixel, String]].apply(Blue()) === "Blue")
    assert(implicitly[Injection[Connection[Int], String]].apply(Open) === "Open")
    assert(implicitly[Injection[Vehicle, String]].apply(Bike) === "Bike")
  }

  test("invert method of derived Injection instance produces the correct value") {
    import frameless.TypedEncoder.injections._

    assert(implicitly[Injection[Employee, String]].invert("Casual") === Casual)
    assert(implicitly[Injection[Switch, String]].invert("On") === Switch.On)
    assert(implicitly[Injection[Pixel, String]].invert("Blue") === Blue())
    assert(implicitly[Injection[Connection[Int], String]].invert("Open") === Open)
    assert(implicitly[Injection[Vehicle, String]].invert("Bike") === Bike)
  }

  test(
    "invert method of derived Injection instance should throw exception if string does not match data constructor names"
  ) {
    import frameless.TypedEncoder.injections._

    val caught = intercept[IllegalArgumentException] {
      implicitly[Injection[Employee, String]].invert("cassual")
    }

    assert(
      caught.getMessage ===
        "Cannot construct a value of type CNil: cassual did not match data constructor names"
    )
  }
}


================================================
FILE: dataset/src/test/scala/frameless/IsValueClassTests.scala
================================================
package frameless

import shapeless.Refute
import shapeless.test.illTyped

import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

final class IsValueClassTests extends AnyFunSuite with Matchers {
  test("Case class is not Value class") {
    illTyped("IsValueClass[P]")
    illTyped("IsValueClass[Q]")
  }

  test("Scala value type is not Value class (excluded)") {
    illTyped("implicitly[IsValueClass[Double]]")
    illTyped("implicitly[IsValueClass[Float]]")
    illTyped("implicitly[IsValueClass[Long]]")
    illTyped("implicitly[IsValueClass[Int]]")
    illTyped("implicitly[IsValueClass[Char]]")
    illTyped("implicitly[IsValueClass[Short]]")
    illTyped("implicitly[IsValueClass[Byte]]")
    illTyped("implicitly[IsValueClass[Unit]]")
    illTyped("implicitly[IsValueClass[Boolean]]")
    illTyped("implicitly[IsValueClass[BigDecimal]]")
  }

  test("Value class evidence") {
    implicitly[IsValueClass[RecordEncoderTests.Name]]
    illTyped("implicitly[Refute[IsValueClass[RecordEncoderTests.Name]]]")
  }
}


================================================
FILE: dataset/src/test/scala/frameless/JobTests.scala
================================================
package frameless

import org.scalacheck.Arbitrary
import org.scalatest.BeforeAndAfterAll
import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks
import org.scalatest.freespec.AnyFreeSpec
import org.scalatest.matchers.should.Matchers


class JobTests extends AnyFreeSpec with BeforeAndAfterAll with SparkTesting with ScalaCheckDrivenPropertyChecks with Matchers {

  "map" - {
    "identity" in {
      def check[T](implicit arb: Arbitrary[T]) = forAll {
        t: T => Job(t).map(identity).run() shouldEqual Job(t).run()
      }

      check[Int]
    }

    val f1: Int => Int = _ + 1
    val f2: Int => Int = (i: Int) => i * i

    "composition" in forAll {
      i: Int => Job(i).map(f1).map(f2).run() shouldEqual Job(i).map(f1 andThen f2).run()
    }
  }

  "flatMap" - {
    val f1: Int => Job[Int] = (i: Int) => Job(i + 1)
    val f2: Int => Job[Int] = (i: Int) => Job(i * i)

    "left identity" in forAll {
      i: Int => Job(i).flatMap(f1).run() shouldEqual f1(i).run()
    }

    "right identity" in forAll {
      i: Int => Job(i).flatMap(i => Job.apply(i)).run() shouldEqual Job(i).run()
    }

    "associativity" in forAll {
      i: Int => Job(i).flatMap(f1).flatMap(f2).run() shouldEqual Job(i).flatMap(ii => f1(ii).flatMap(f2)).run()
    }
  }

  "properties" - {
    "read back" in forAll {
      (k:String, v: String) =>
        val scopedKey = "frameless.tests." + k
        Job(1).withLocalProperty(scopedKey,v).run()
        sc.getLocalProperty(scopedKey) shouldBe v
    }
  }
}

================================================
FILE: dataset/src/test/scala/frameless/JoinTests.scala
================================================
package frameless

import org.apache.spark.sql.types.{StructField, StructType}
import org.scalacheck.Prop
import org.scalacheck.Prop._

class JoinTests extends TypedDatasetSuite {
  test("ab.joinCross(ac)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val joinedDs = leftDs
        .joinCross(rightDs)

      val joinedData = joinedDs.collect().run().toVector.sorted

      val joined = {
        for {
          ab <- left
          ac <- right
        } yield (ab, ac)
      }.toVector

      val equalSchemas = joinedDs.schema ?= StructType(Seq(
        StructField("_1", leftDs.schema, nullable = false),
        StructField("_2", rightDs.schema, nullable = false)))

      (joined.sorted ?= joinedData) && equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }

  test("ab.joinFull(ac)(ab.a == ac.a)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val joinedDs = leftDs
        .joinFull(rightDs)(leftDs.col('a) === rightDs.col('a))

      val joinedData = joinedDs.collect().run().toVector.sorted

      val rightKeys = right.map(_.a).toSet
      val leftKeys  = left.map(_.a).toSet
      val joined = {
        for {
          ab <- left
          ac <- right if ac.a == ab.a
        } yield (Some(ab), Some(ac))
      }.toVector ++ {
        for {
          ab <- left if !rightKeys.contains(ab.a)
        } yield (Some(ab), None)
      }.toVector ++ {
        for {
          ac <- right if !leftKeys.contains(ac.a)
        } yield (None, Some(ac))
      }.toVector

      val equalSchemas = joinedDs.schema ?= StructType(Seq(
        StructField("_1", leftDs.schema, nullable = true),
        StructField("_2", rightDs.schema, nullable = true)))

      (joined.sorted ?= joinedData) && equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }

  test("ab.joinInner(ac)(ab.a == ac.a)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val joinedDs = leftDs
        .joinInner(rightDs)(leftDs.col('a) === rightDs.col('a))

      val joinedData = joinedDs.collect().run().toVector.sorted

      val joined = {
        for {
          ab <- left
          ac <- right if ac.a == ab.a
        } yield (ab, ac)
      }.toVector

      val equalSchemas = joinedDs.schema ?= StructType(Seq(
        StructField("_1", leftDs.schema, nullable = false),
        StructField("_2", rightDs.schema, nullable = false)))

      (joined.sorted ?= joinedData) && equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }

  test("ab.joinLeft(ac)(ab.a == ac.a)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val joinedDs = leftDs
        .joinLeft(rightDs)(leftDs.col('a) === rightDs.col('a))

      val joinedData = joinedDs.collect().run().toVector.sorted

      val rightKeys = right.map(_.a).toSet
      val joined = {
        for {
          ab <- left
          ac <- right if ac.a == ab.a
        } yield (ab, Some(ac))
      }.toVector ++ {
        for {
          ab <- left if !rightKeys.contains(ab.a)
        } yield (ab, None)
      }.toVector

      val equalSchemas = joinedDs.schema ?= StructType(Seq(
        StructField("_1", leftDs.schema, nullable = false),
        StructField("_2", rightDs.schema, nullable = true)))

      (joined.sorted ?= joinedData) && (joinedData.map(_._1).toSet ?= left.toSet) && equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }

  test("ab.joinLeftAnti(ac)(ab.a == ac.a)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val rightKeys = right.map(_.a).toSet
      val joinedDs = leftDs
        .joinLeftAnti(rightDs)(leftDs.col('a) === rightDs.col('a))

      val joinedData = joinedDs.collect().run().toVector.sorted

      val joined = {
        for {
          ab <- left if !rightKeys.contains(ab.a)
        } yield ab
      }.toVector

      val equalSchemas = joinedDs.schema ?= leftDs.schema

      (joined.sorted ?= joinedData) && equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }

  test("ab.joinLeftSemi(ac)(ab.a == ac.a)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val rightKeys = right.map(_.a).toSet
      val joinedDs = leftDs
        .joinLeftSemi(rightDs)(leftDs.col('a) === rightDs.col('a))

      val joinedData = joinedDs.collect().run().toVector.sorted

      val joined = {
        for {
          ab <- left if rightKeys.contains(ab.a)
        } yield ab
      }.toVector

      val equalSchemas = joinedDs.schema ?= leftDs.schema

      (joined.sorted ?= joinedData) && equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }

  test("ab.joinRight(ac)(ab.a == ac.a)") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering,
      C : TypedEncoder : Ordering
    ](left: List[X2[A, B]], right: List[X2[A, C]]): Prop = {
      val leftDs = TypedDataset.create(left)
      val rightDs = TypedDataset.create(right)
      val joinedDs = leftDs
        .joinRight(rightDs)(leftDs.col('a) === rightDs.col('a))

      val joinedData = joinedDs.collect().run().toVector.sorted

      val leftKeys = left.map(_.a).toSet
      val joined = {
        for {
          ab <- left
          ac <- right if ac.a == ab.a
        } yield (Some(ab), ac)
      }.toVector ++ {
        for {
          ac <- right if !leftKeys.contains(ac.a)
        } yield (None, ac)
      }.toVector

      val equalSchemas = joinedDs.schema ?= StructType(Seq(
        StructField("_1", leftDs.schema, nullable = true),
        StructField("_2", rightDs.schema, nullable = false)))

      (joined.sorted ?= joinedData) && (joinedData.map(_._2).toSet ?= right.toSet) && equalSchemas
    }

    check(forAll(prop[Int, Long, String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/LitTests.scala
================================================
package frameless

import frameless.functions.lit

import org.scalatest.matchers.should.Matchers

import org.scalacheck.Prop, Prop._

import RecordEncoderTests.Name

class LitTests extends TypedDatasetSuite with Matchers {
  def prop[A: TypedEncoder](value: A)(implicit i0: shapeless.Refute[IsValueClass[A]]): Prop = {
    val df: TypedDataset[Int] = TypedDataset.create(1 :: Nil)

    val l: TypedColumn[Int, A] = lit(value)

    // filter forces whole codegen
    val elems = df.deserialized.filter((_:Int) => true).select(l)
      .collect()
      .run()
      .toVector

    // otherwise it uses local relation
    val localElems = df.select(l)
      .collect()
      .run()
      .toVector

    val expected = Vector(value)

    (localElems ?= expected) && (elems ?= expected)
  }

  test("select(lit(...))") {
    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[String] _)
    check(prop[SQLDate] _)

    check(prop[Option[Int]] _)
    check(prop[Option[String]] _)

    check(prop[Vector[Long]] _)
    check(prop[Vector[X1[Long]]] _)

    check(prop[Vector[String]] _)
    check(prop[Vector[X1[String]]] _)

    check(prop[X1[Int]] _)
    check(prop[X1[X1[Int]]] _)

    check(prop[Food] _)

    // doesn't work, object has to be serializable
    // check(prop[frameless.LocalDateTime] _)
  }

  test("support value class") {
    val initial = Seq(
      Q(name = new Name("Foo"), id = 1),
      Q(name = new Name("Bar"), id = 2))
    val ds = TypedDataset.create(initial)

    ds.collect.run() shouldBe initial

    val lorem = new Name("Lorem")

    ds.withColumnReplaced('name, functions.litValue(lorem)).
      collect.run() shouldBe initial.map(_.copy(name = lorem))
  }

  test("support optional value class") {
    val initial = Seq(
      R(name = "Foo", id = 1, alias = None),
      R(name = "Bar", id = 2, alias = Some(new Name("Lorem"))))
    val ds = TypedDataset.create(initial)

    ds.collect.run() shouldBe initial

    val someIpsum: Option[Name] = Some(new Name("Ipsum"))

    val lit = functions.litValue(someIpsum)
    val tds = ds.withColumnReplaced('alias, functions.litValue(someIpsum))

    tds.queryExecution.toString() should include (lit.toString)

    tds.
      collect.run() shouldBe initial.map(_.copy(alias = someIpsum))

    ds.withColumnReplaced('alias, functions.litValue(Option.empty[Name])).
      collect.run() shouldBe initial.map(_.copy(alias = None))
  }

  test("#205: comparing literals encoded using Injection") {
    import org.apache.spark.sql.catalyst.util.DateTimeUtils
    implicit val dateAsInt: Injection[java.sql.Date, Int] =
      Injection(DateTimeUtils.fromJavaDate, DateTimeUtils.toJavaDate)

    val today = new java.sql.Date(System.currentTimeMillis)
    val data = Vector(P(42, today))
    val tds = TypedDataset.create(data)

    tds.filter(tds('d) === today).collect.run().map(_.i) shouldBe Seq(42)
  }
}

final case class P(i: Int, d: java.sql.Date)

final case class Q(id: Int, name: Name)

final case class R(id: Int, name: String, alias: Option[Name])


================================================
FILE: dataset/src/test/scala/frameless/NumericTests.scala
================================================
package frameless

import org.apache.spark.sql.Encoder
import org.scalacheck.{Arbitrary, Gen, Prop}
import org.scalacheck.Prop._
import org.scalatest.matchers.should.Matchers

import scala.reflect.ClassTag

class NumericTests extends TypedDatasetSuite with Matchers {
  test("plus") {
    def prop[A: TypedEncoder: CatalystNumeric: Numeric](a: A, b: A): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      val result = implicitly[Numeric[A]].plus(a, b)
      val got = df.select(df.col('a) + df.col('b)).collect().run()

      got ?= (result :: Nil)
    }

    check(prop[BigDecimal] _)
    check(prop[Byte] _)
    check(prop[Double] _)
    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[Short] _)
  }

  test("minus") {
    def prop[A: TypedEncoder: CatalystNumeric: Numeric](a: A, b: A): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      val result = implicitly[Numeric[A]].minus(a, b)
      val got = df.select(df.col('a) - df.col('b)).collect().run()

      got ?= (result :: Nil)
    }

    check(prop[BigDecimal] _)
    check(prop[Byte] _)
    check(prop[Double] _)
    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[Short] _)
  }

  test("multiply") {
    def prop[A: TypedEncoder : CatalystNumeric : Numeric : ClassTag](a: A, b: A): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      val result = implicitly[Numeric[A]].times(a, b)
      val got = df.select(df.col('a) * df.col('b)).collect().run()

      got ?= (result :: Nil)
    }

    check(prop[Byte] _)
    check(prop[Double] _)
    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[Short] _)
  }

  test("divide") {
    def prop[A: TypedEncoder: CatalystNumeric: Numeric](a: A, b: A)(implicit cd: CatalystDivisible[A, Double]): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      if (b == 0) proved else {
        val div: Double = implicitly[Numeric[A]].toDouble(a) / implicitly[Numeric[A]].toDouble(b)
        val got: Seq[Double] = df.select(df.col('a) / df.col('b)).collect().run()

        got ?= (div :: Nil)
      }
    }

    check(prop[Byte  ] _)
    check(prop[Double] _)
    check(prop[Int   ] _)
    check(prop[Long  ] _)
    check(prop[Short ] _)
  }

  test("divide BigDecimals") {
    def prop(a: BigDecimal, b: BigDecimal): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      if (b.doubleValue == 0) proved else {
        // Spark performs something in between Double division and BigDecimal division,
        // we approximate it using double vision and `approximatelyEqual`:
        val div = BigDecimal(a.doubleValue / b.doubleValue)
        val got = df.select(df.col('a) / df.col('b)).collect().run()
        approximatelyEqual(got.head, div)
      }
    }

    check(prop _)
  }

  test("multiply BigDecimal") {
    def prop(a: BigDecimal, b: BigDecimal): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      val result = BigDecimal(a.doubleValue * b.doubleValue)
      val got = df.select(df.col('a) * df.col('b)).collect().run()
      approximatelyEqual(got.head, result)
    }

    check(prop _)
  }

  trait NumericMod[T] {
    def mod(a: T, b: T): T
  }

  object NumericMod {
    implicit val byteInstance = new NumericMod[Byte] {
      def mod(a: Byte, b: Byte) = (a % b).toByte
    }
    implicit val doubleInstance = new NumericMod[Double] {
      def mod(a: Double, b: Double) = a % b
    }
    implicit val floatInstance = new NumericMod[Float] {
      def mod(a: Float, b: Float) = a % b
    }
    implicit val intInstance = new NumericMod[Int] {
      def mod(a: Int, b: Int) = a % b
    }
    implicit val longInstance = new NumericMod[Long] {
      def mod(a: Long, b: Long) = a % b
    }
    implicit val shortInstance = new NumericMod[Short] {
      def mod(a: Short, b: Short) = (a % b).toShort
    }
    implicit val bigDecimalInstance = new NumericMod[BigDecimal] {
      def mod(a: BigDecimal, b: BigDecimal) = a % b
    }
  }

  test("mod") {
    import NumericMod._

    def prop[A: TypedEncoder : CatalystNumeric : NumericMod](a: A, b: A): Prop = {
      val df = TypedDataset.create(X2(a, b) :: Nil)
      if (b == 0) proved else {
        val mod: A = implicitly[NumericMod[A]].mod(a, b)
        val got: Seq[A] = df.select(df.col('a) % df.col('b)).collect().run()

        got ?= (mod :: Nil)
      }
    }

    check(prop[Byte] _)
    check(prop[Double] _)
    check(prop[Int   ] _)
    check(prop[Long  ] _)
    check(prop[Short ] _)
    check(prop[BigDecimal] _)
  }

  test("a mod lit(b)"){
    import NumericMod._

    def prop[A: TypedEncoder : CatalystNumeric : NumericMod](elem: A, data: X1[A]): Prop = {
      val dataset = TypedDataset.create(Seq(data))
      val a = dataset.col('a)
      if (elem == 0) proved else {
        val mod: A = implicitly[NumericMod[A]].mod(data.a, elem)
        val got: Seq[A] = dataset.select(a % elem).collect().run()

        got ?= (mod :: Nil)
      }
    }

    check(prop[Byte] _)
    check(prop[Double] _)
    check(prop[Int   ] _)
    check(prop[Long  ] _)
    check(prop[Short ] _)
    check(prop[BigDecimal] _)
  }

  test("isNaN") {
    val spark = session
    import spark.implicits._

    implicit val doubleWithNaN = Arbitrary {
      implicitly[Arbitrary[Double]].arbitrary.flatMap(Gen.oneOf(_, Double.NaN))
    }
    implicit val x1 = Arbitrary{ doubleWithNaN.arbitrary.map(X1(_)) }

    def prop[A : TypedEncoder : Encoder : CatalystNaN](data: List[X1[A]]): Prop = {
      val ds = TypedDataset.create(data)

      val expected = ds.toDF().filter(!$"a".isNaN).map(_.getAs[A](0)).collect().toSeq
      val rs = ds.filter(!ds('a).isNaN).collect().run().map(_.a)

      rs ?= expected
    }

    check(forAll(prop[Float] _))
    check(forAll(prop[Double] _))
  }

  test("isNaN with non-nan types should not compile") {
    val ds = TypedDataset.create((1, false, 'a, "b") :: Nil)

    "ds.filter(ds('_1).isNaN)" shouldNot typeCheck
    "ds.filter(ds('_2).isNaN)" shouldNot typeCheck
    "ds.filter(ds('_3).isNaN)" shouldNot typeCheck
    "ds.filter(ds('_4).isNaN)" shouldNot typeCheck
  }
}


================================================
FILE: dataset/src/test/scala/frameless/OrderByTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import shapeless.test.illTyped
import org.apache.spark.sql.Column
import org.scalatest.matchers.should.Matchers

class OrderByTests extends TypedDatasetSuite with Matchers {
  def sortings[A : CatalystOrdered, T]: Seq[(TypedColumn[T, A] => SortedTypedColumn[T, A], Column => Column)] = Seq(
    (_.desc, _.desc),
    (_.asc, _.asc),
    (t => t, t => t) //default ascending
  )

  test("single column non nullable orderBy") {
    def prop[A: TypedEncoder : CatalystOrdered](data: Vector[X1[A]]): Prop = {
      val ds = TypedDataset.create(data)

      sortings[A, X1[A]].map { case (typ, untyp) =>
        ds.dataset.orderBy(untyp(ds.dataset.col("a"))).collect().toVector.?=(
          ds.orderBy(typ(ds('a))).collect().run().toVector)
      }.reduce(_ && _)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Boolean] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Float] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[SQLDate] _))
    check(forAll(prop[SQLTimestamp] _))
    check(forAll(prop[String] _))
  }

  test("single column non nullable partition sorting") {
    def prop[A: TypedEncoder : CatalystOrdered](data: Vector[X1[A]]): Prop = {
      val ds = TypedDataset.create(data)

      sortings[A, X1[A]].map { case (typ, untyp) =>
        ds.dataset.sortWithinPartitions(untyp(ds.dataset.col("a"))).collect().toVector.?=(
          ds.sortWithinPartitions(typ(ds('a))).collect().run().toVector)
      }.reduce(_ && _)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Boolean] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Float] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[SQLDate] _))
    check(forAll(prop[SQLTimestamp] _))
    check(forAll(prop[String] _))
  }

  test("two columns non nullable orderBy") {
    def prop[A: TypedEncoder : CatalystOrdered, B: TypedEncoder : CatalystOrdered](data: Vector[X2[A,B]]): Prop = {
      val ds = TypedDataset.create(data)

      sortings[A, X2[A, B]].reverse.zip(sortings[B, X2[A, B]]).map { case ((typA, untypA), (typB, untypB)) =>
        val vanillaSpark = ds.dataset.orderBy(untypA(ds.dataset.col("a")), untypB(ds.dataset.col("b"))).collect().toVector
        vanillaSpark.?=(ds.orderBy(typA(ds('a)), typB(ds('b))).collect().run().toVector).&&(
          vanillaSpark ?= ds.orderByMany(typA(ds('a)), typB(ds('b))).collect().run().toVector
        )
      }.reduce(_ && _)
    }

    check(forAll(prop[SQLDate, Long] _))
    check(forAll(prop[String, Boolean] _))
    check(forAll(prop[SQLTimestamp, Long] _))
  }

  test("two columns non nullable partition sorting") {
    def prop[A: TypedEncoder : CatalystOrdered, B: TypedEncoder : CatalystOrdered](data: Vector[X2[A,B]]): Prop = {
      val ds = TypedDataset.create(data)

      sortings[A, X2[A, B]].reverse.zip(sortings[B, X2[A, B]]).map { case ((typA, untypA), (typB, untypB)) =>
        val vanillaSpark = ds.dataset.sortWithinPartitions(untypA(ds.dataset.col("a")), untypB(ds.dataset.col("b"))).collect().toVector
        vanillaSpark.?=(ds.sortWithinPartitions(typA(ds('a)), typB(ds('b))).collect().run().toVector).&&(
          vanillaSpark ?= ds.sortWithinPartitionsMany(typA(ds('a)), typB(ds('b))).collect().run().toVector
        )
      }.reduce(_ && _)
    }

    check(forAll(prop[SQLDate, Long] _))
    check(forAll(prop[String, Boolean] _))
    check(forAll(prop[SQLTimestamp, Long] _))
  }

  test("three columns non nullable orderBy") {
    def prop[A: TypedEncoder : CatalystOrdered, B: TypedEncoder : CatalystOrdered](data: Vector[X3[A,B,A]]): Prop = {
      val ds = TypedDataset.create(data)

      sortings[A, X3[A, B, A]].reverse
        .zip(sortings[B, X3[A, B, A]])
        .zip(sortings[A, X3[A, B, A]])
        .map { case (((typA, untypA), (typB, untypB)), (typA2, untypA2)) =>
          val vanillaSpark = ds.dataset
            .orderBy(untypA(ds.dataset.col("a")), untypB(ds.dataset.col("b")), untypA2(ds.dataset.col("c")))
            .collect().toVector

          vanillaSpark.?=(ds.orderBy(typA(ds('a)), typB(ds('b)), typA2(ds('c))).collect().run().toVector).&&(
            vanillaSpark ?= ds.orderByMany(typA(ds('a)), typB(ds('b)), typA2(ds('c))).collect().run().toVector
          )
        }.reduce(_ && _)
    }

    check(forAll(prop[SQLDate, Long] _))
    check(forAll(prop[String, Boolean] _))
    check(forAll(prop[SQLTimestamp, Long] _))
  }

  test("three columns non nullable partition sorting") {
    def prop[A: TypedEncoder : CatalystOrdered, B: TypedEncoder : CatalystOrdered](data: Vector[X3[A,B,A]]): Prop = {
      val ds = TypedDataset.create(data)

      sortings[A, X3[A, B, A]].reverse
        .zip(sortings[B, X3[A, B, A]])
        .zip(sortings[A, X3[A, B, A]])
        .map { case (((typA, untypA), (typB, untypB)), (typA2, untypA2)) =>
          val vanillaSpark = ds.dataset
            .sortWithinPartitions(untypA(ds.dataset.col("a")), untypB(ds.dataset.col("b")), untypA2(ds.dataset.col("c")))
            .collect().toVector

          vanillaSpark.?=(ds.sortWithinPartitions(typA(ds('a)), typB(ds('b)), typA2(ds('c))).collect().run().toVector).&&(
            vanillaSpark ?= ds.sortWithinPartitionsMany(typA(ds('a)), typB(ds('b)), typA2(ds('c))).collect().run().toVector
          )
        }.reduce(_ && _)
    }

    check(forAll(prop[SQLDate, Long] _))
    check(forAll(prop[String, Boolean] _))
    check(forAll(prop[SQLTimestamp, Long] _))
  }

  test("sort support for mixed default and explicit ordering") {
    def prop[A: TypedEncoder : CatalystOrdered, B: TypedEncoder : CatalystOrdered](data: Vector[X2[A, B]]): Prop = {
      val ds = TypedDataset.create(data)

      ds.dataset.orderBy(ds.dataset.col("a"), ds.dataset.col("b").desc).collect().toVector.?=(
        ds.orderByMany(ds('a), ds('b).desc).collect().run().toVector) &&
      ds.dataset.sortWithinPartitions(ds.dataset.col("a"), ds.dataset.col("b").desc).collect().toVector.?=(
        ds.sortWithinPartitionsMany(ds('a), ds('b).desc).collect().run().toVector)
    }

    check(forAll(prop[SQLDate, Long] _))
    check(forAll(prop[String, Boolean] _))
    check(forAll(prop[SQLTimestamp, Long] _))
  }

  test("fail when selected column is not sortable") {
    val d = TypedDataset.create(X2(1, Map(1 -> 2)) :: X2(2, Map(2 -> 2)) :: Nil)
    d.orderBy(d('a).desc)
    illTyped("""d.orderBy(d('b).desc)""")
    illTyped("""d.sortWithinPartitions(d('b).desc)""")
  }

  test("derives a CatalystOrdered for case classes when all fields are comparable") {
    type T[A, B] = X3[Int, Boolean, X2[A, B]]
    def prop[
      A: TypedEncoder : CatalystOrdered,
      B: TypedEncoder : CatalystOrdered
    ](data: Vector[T[A, B]]): Prop = {
      val ds = TypedDataset.create(data)

      sortings[X2[A, B], T[A, B]].map { case (typX2, untypX2) =>
        val vanilla   = ds.dataset.orderBy(untypX2(ds.dataset.col("c"))).collect().toVector
        val frameless = ds.orderBy(typX2(ds('c))).collect().run.toVector
        vanilla ?= frameless
      }.reduce(_ && _)
    }

    check(forAll(prop[Int, Long] _))
    check(forAll(prop[(String, SQLDate), Float] _))
    // Check that nested case classes are properly derived too
    check(forAll(prop[X2[Boolean, Float], X4[SQLTimestamp, Double, Short, Byte]] _))
  }

  test("derives a CatalystOrdered for tuples when all fields are comparable") {
    type T[A, B] = X2[Int, (A, B)]
    def prop[
      A: TypedEncoder : CatalystOrdered,
      B: TypedEncoder : CatalystOrdered
    ](data: Vector[T[A, B]]): Prop = {
      val ds = TypedDataset.create(data)

      sortings[(A, B), T[A, B]].map { case (typX2, untypX2) =>
        val vanilla   = ds.dataset.orderBy(untypX2(ds.dataset.col("b"))).collect().toVector
        val frameless = ds.orderBy(typX2(ds('b))).collect().run.toVector
        vanilla ?= frameless
      }.reduce(_ && _)
    }

    check(forAll(prop[Int, Long] _))
    check(forAll(prop[(String, SQLDate), Float] _))
    check(forAll(prop[X2[Boolean, Float], X1[(SQLTimestamp, Double, Short, Byte)]] _))
  }

  test("fails to compile when one of the field isn't comparable") {
    type T = X2[Int, X2[Int, Map[String, String]]]
    val d = TypedDataset.create(X2(1, X2(2, Map("not" -> "comparable"))) :: Nil)
    illTyped("d.orderBy(d('b).desc)", """Cannot compare columns of type frameless.X2\[Int,scala.collection.immutable.Map\[String,String]].""")
  }
}


================================================
FILE: dataset/src/test/scala/frameless/RecordEncoderTests.scala
================================================
package frameless

import org.apache.spark.sql.{Row, functions => F}
import org.apache.spark.sql.types.{
  ArrayType,
  BinaryType,
  DecimalType,
  IntegerType,
  LongType,
  MapType,
  ObjectType,
  StringType,
  StructField,
  StructType
}

import shapeless.{HList, LabelledGeneric}
import shapeless.test.illTyped

import org.scalatest.matchers.should.Matchers

final class RecordEncoderTests extends TypedDatasetSuite with Matchers {
  test("Unable to encode products made from units only") {
    illTyped("TypedEncoder[UnitsOnly]")
  }

  test("Dropping fields") {
    def dropUnitValues[L <: HList](l: L)(implicit d: DropUnitValues[L]): d.Out = d(l)
    val fields = LabelledGeneric[TupleWithUnits].to(TupleWithUnits(42, "something"))
    dropUnitValues(fields) shouldEqual LabelledGeneric[(Int, String)].to((42, "something"))
  }

  test("Representation skips units") {
    assert(TypedEncoder[(Int, String)].catalystRepr == TypedEncoder[TupleWithUnits].catalystRepr)
  }

  test("Serialization skips units") {
    val df = session.createDataFrame(Seq((1, "one"), (2, "two")))
    val ds = df.as[TupleWithUnits](TypedExpressionEncoder[TupleWithUnits])
    val tds = TypedDataset.create(Seq(TupleWithUnits(1, "one"), TupleWithUnits(2, "two")))

    df.collect shouldEqual tds.toDF.collect
    ds.collect.toSeq shouldEqual tds.collect.run
  }

  test("Empty nested record value becomes null on serialization") {
    val ds = TypedDataset.create(Seq(OptionalNesting(Option.empty)))
    val df = ds.toDF
    df.na.drop.count shouldBe 0
  }

  test("Empty nested record value becomes none on deserialization") {
    val rdd = sc.parallelize(Seq(Row(null)))
    val schema = TypedEncoder[OptionalNesting].catalystRepr.asInstanceOf[StructType]
    val df = session.createDataFrame(rdd, schema)
    val ds = TypedDataset.createUnsafe(df)(TypedEncoder[OptionalNesting])

    ds.firstOption.run.get.o.isEmpty shouldBe true
  }

  test("Deeply nested optional values have correct deserialization") {
    val rdd = sc.parallelize(Seq(Row(true, Row(null, null))))
    type NestedOptionPair = X2[Boolean, Option[X2[Option[Int], Option[String]]]]
    val schema = TypedEncoder[NestedOptionPair].catalystRepr.asInstanceOf[StructType]
    val df = session.createDataFrame(rdd, schema)
    val ds = TypedDataset.createUnsafe(df)(TypedEncoder[NestedOptionPair])
    ds.firstOption.run.get shouldBe X2(true, Some(X2(None, None)))
  }

  test("Nesting with Seq") {
    import RecordEncoderTests._

    val obj = C(B(Seq(A(1))))
    val rdd = sc.parallelize(Seq(obj))
    val ds = session.createDataset(rdd)(TypedExpressionEncoder[C])

    ds.collect.head shouldBe obj
  }

  test("Nesting with Set") {
    import RecordEncoderTests._

    val obj = E(Set(B(Seq(A(1)))))
    val rdd = sc.parallelize(Seq(obj))
    val ds = session.createDataset(rdd)(TypedExpressionEncoder[E])

    ds.collect.head shouldBe obj
  }

  test("Scalar value class") {
    import RecordEncoderTests._

    val encoder = TypedEncoder[Name]

    encoder.jvmRepr shouldBe ObjectType(classOf[Name])

    encoder.catalystRepr shouldBe StructType(
      Seq(StructField("value", StringType, false)))

    val sqlContext = session.sqlContext
    import sqlContext.implicits._

    TypedDataset
      .createUnsafe[Name](Seq("Foo", "Bar").toDF)(encoder)
      .collect().run() shouldBe Seq(new Name("Foo"), new Name("Bar"))

  }

  test("Case class with value class field") {
    import RecordEncoderTests._

    illTyped(
      // As `Person` is not a Value class
      "val _: RecordFieldEncoder[Person] = RecordFieldEncoder.valueClass")

    val fieldEncoder: RecordFieldEncoder[Name] = RecordFieldEncoder.valueClass

    fieldEncoder.encoder.catalystRepr shouldBe StringType
    fieldEncoder.encoder.jvmRepr shouldBe ObjectType(classOf[String])

    // Encode as a Person field
    val encoder = TypedEncoder[Person]

    encoder.jvmRepr shouldBe ObjectType(classOf[Person])

    val expectedPersonStructType = StructType(Seq(
      StructField("name", StringType, false),
      StructField("age", IntegerType, false)))

    encoder.catalystRepr shouldBe expectedPersonStructType

    val unsafeDs: TypedDataset[Person] = {
      val rdd = sc.parallelize(Seq(
        Row.fromTuple("Foo" -> 2),
        Row.fromTuple("Bar" -> 3)
      ))
      val df = session.createDataFrame(rdd, expectedPersonStructType)

      TypedDataset.createUnsafe(df)(encoder)
    }

    val expected = Seq(
      Person(new Name("Foo"), 2), Person(new Name("Bar"), 3))

    unsafeDs.collect.run() shouldBe expected

    // Safely created DS
    val safeDs = TypedDataset.create(expected)

    safeDs.collect.run() shouldBe expected

    val lorem = new Name("Lorem")

    safeDs.withColumnReplaced('name, functions.litValue(lorem)).
      collect.run() shouldBe expected.map(_.copy(name = lorem))
  }

  test("Case class with value class as optional field") {
    import RecordEncoderTests._

    illTyped( // As `Person` is not a Value class
      """val _: RecordFieldEncoder[Option[Person]] =
           RecordFieldEncoder.optionValueClass""")

    val fieldEncoder: RecordFieldEncoder[Option[Name]] =
      RecordFieldEncoder.optionValueClass

    fieldEncoder.encoder.catalystRepr shouldBe StringType

    fieldEncoder.encoder. // !StringType
      jvmRepr shouldBe ObjectType(classOf[Option[_]])

    // Encode as a Person field
    val encoder = TypedEncoder[User]

    encoder.jvmRepr shouldBe ObjectType(classOf[User])

    val expectedPersonStructType = StructType(Seq(
      StructField("id", LongType, false),
      StructField("name", StringType, true)))

    encoder.catalystRepr shouldBe expectedPersonStructType

    val ds1: TypedDataset[User] = {
      val rdd = sc.parallelize(Seq(
        Row(1L, null),
        Row(2L, "Foo")
      ))

      val df = session.createDataFrame(rdd, expectedPersonStructType)

      TypedDataset.createUnsafe(df)(encoder)
    }

    ds1.collect.run() shouldBe Seq(
      User(1L, None),
      User(2L, Some(new Name("Foo"))))

    val ds2: TypedDataset[User] = {
      val sqlContext = session.sqlContext
      import sqlContext.implicits._

      val df1 = Seq(
        """{"id":3,"label":"unused"}""",
        """{"id":4,"name":"Lorem"}""",
        """{"id":5,"name":null}"""
      ).toDF

      val df2 = df1.withColumn(
        "jsonValue",
        F.from_json(df1.col("value"), expectedPersonStructType)).
        select("jsonValue.id", "jsonValue.name")

      TypedDataset.createUnsafe[User](df2)
    }

    val expected = Seq(
      User(3L, None),
      User(4L, Some(new Name("Lorem"))),
      User(5L, None))

    ds2.collect.run() shouldBe expected

    // Safely created ds
    TypedDataset.create(expected).collect.run() shouldBe expected
  }

  test("Case class with simple Map") {
    import RecordEncoderTests._

    val encoder = TypedEncoder[D]

    encoder.jvmRepr shouldBe ObjectType(classOf[D])

    val expectedStructType = StructType(Seq(
      StructField("m", MapType(
        keyType = StringType,
        valueType = IntegerType,
        valueContainsNull = false), false)))

    encoder.catalystRepr shouldBe expectedStructType

    val sqlContext = session.sqlContext
    import sqlContext.implicits._

    val ds1 = TypedDataset.createUnsafe[D] {
      val df = Seq(
        """{"m":{"pizza":1,"sushi":2}}""",
        """{"m":{"red":3,"blue":4}}""",
      ).toDF

      df.withColumn(
        "jsonValue",
        F.from_json(df.col("value"), expectedStructType)).
        select("jsonValue.*")
    }

    val expected = Seq(
      D(m = Map("pizza" -> 1, "sushi" -> 2)),
      D(m = Map("red" -> 3, "blue" -> 4)))

    ds1.collect.run() shouldBe expected

    val m2 = Map("updated" -> 5)

    val ds2 = ds1.withColumnReplaced('m, functions.lit(m2))

    ds2.collect.run() shouldBe expected.map(_.copy(m = m2))
  }

  test("Case class with Map & Value class") {
    import RecordEncoderTests._

    val encoder = TypedEncoder[Student]

    encoder.jvmRepr shouldBe ObjectType(classOf[Student])

    val expectedStudentStructType = StructType(Seq(
      StructField("name", StringType, false),
      StructField("grades", MapType(
        keyType = StringType,
        valueType = DecimalType.SYSTEM_DEFAULT,
        valueContainsNull = false), false)))

    encoder.catalystRepr shouldBe expectedStudentStructType

    val sqlContext = session.sqlContext
    import sqlContext.implicits._

    val ds1 = TypedDataset.createUnsafe[Student] {
      val df = Seq(
        """{"name":"Foo","grades":{"math":1,"physics":"23.4"}}""",
        """{"name":"Bar","grades":{"biology":18.5,"geography":4}}""",
      ).toDF

      df.withColumn(
        "jsonValue",
        F.from_json(df.col("value"), expectedStudentStructType)).
        select("jsonValue.*")
    }

    val expected = Seq(
      Student(name = "Foo", grades = Map(
        new Subject("math") -> new Grade(BigDecimal(1)),
        new Subject("physics") -> new Grade(BigDecimal(23.4D)))),
      Student(name = "Bar", grades = Map(
        new Subject("biology") -> new Grade(BigDecimal(18.5)),
        new Subject("geography") -> new Grade(BigDecimal(4L)))))

    ds1.collect.run() shouldBe expected

    val grades = Map[Subject, Grade](
      new Subject("any") -> new Grade(BigDecimal(Long.MaxValue) + 1L))

    val ds2 = ds1.withColumnReplaced('grades, functions.lit(grades))

    ds2.collect.run() shouldBe Seq(
      Student("Foo", grades), Student("Bar", grades))
  }

  test("Encode binary array") {
    val encoder = TypedEncoder[Tuple2[String, Array[Byte]]]

    encoder.jvmRepr shouldBe ObjectType(
      classOf[Tuple2[String, Array[Byte]]])

    val expectedStructType = StructType(Seq(
      StructField("_1", StringType, false),
      StructField("_2", BinaryType, false)))

    encoder.catalystRepr shouldBe expectedStructType

    val ds1: TypedDataset[(String, Array[Byte])] = {
      val rdd = sc.parallelize(Seq(
        Row.fromTuple("Foo" -> Array[Byte](3, 4)),
        Row.fromTuple("Bar" -> Array[Byte](5))
      ))
      val df = session.createDataFrame(rdd, expectedStructType)

      TypedDataset.createUnsafe(df)(encoder)
    }

    val expected = Seq("Foo" -> Seq[Byte](3, 4), "Bar" -> Seq[Byte](5))

    ds1.collect.run().map {
      case (_1, _2) => _1 -> _2.toSeq
    } shouldBe expected

    val subjects = "lorem".getBytes("UTF-8").toSeq

    val ds2 = ds1.withColumnReplaced('_2, functions.lit(subjects.toArray))

    ds2.collect.run().map {
      case (_1, _2) => _1 -> _2.toSeq
    } shouldBe expected.map(_.copy(_2 = subjects))
  }

  test("Encode simple array") {
    val encoder = TypedEncoder[Tuple2[String, Array[Int]]]

    encoder.jvmRepr shouldBe ObjectType(
      classOf[Tuple2[String, Array[Int]]])

    val expectedStructType = StructType(Seq(
      StructField("_1", StringType, false),
      StructField("_2", ArrayType(IntegerType, false), false)))

    encoder.catalystRepr shouldBe expectedStructType

    val sqlContext = session.sqlContext
    import sqlContext.implicits._

    val ds1 = TypedDataset.createUnsafe[(String, Array[Int])] {
      val df = Seq(
        """{"_1":"Foo", "_2":[3, 4]}""",
        """{"_1":"Bar", "_2":[5]}""",
      ).toDF

      df.withColumn(
        "jsonValue",
        F.from_json(df.col("value"), expectedStructType)).
        select("jsonValue.*")
    }

    val expected = Seq("Foo" -> Seq(3, 4), "Bar" -> Seq(5))

    ds1.collect.run().map {
      case (_1, _2) => _1 -> _2.toSeq
    } shouldBe expected

    val subjects = Seq(6, 6, 7)

    val ds2 = ds1.withColumnReplaced('_2, functions.lit(subjects.toArray))

    ds2.collect.run().map {
      case (_1, _2) => _1 -> _2.toSeq
    } shouldBe expected.map(_.copy(_2 = subjects))
  }

  test("Encode array of Value class") {
    import RecordEncoderTests._

    val encoder = TypedEncoder[Tuple2[String, Array[Subject]]]

    encoder.jvmRepr shouldBe ObjectType(
      classOf[Tuple2[String, Array[Subject]]])

    val expectedStructType = StructType(Seq(
      StructField("_1", StringType, false),
      StructField("_2", ArrayType(StringType, false), false)))

    encoder.catalystRepr shouldBe expectedStructType

    val sqlContext = session.sqlContext
    import sqlContext.implicits._

    val ds1 = TypedDataset.createUnsafe[(String, Array[Subject])] {
      val df = Seq(
        """{"_1":"Foo", "_2":["math","physics"]}""",
        """{"_1":"Bar", "_2":["biology","geography"]}""",
      ).toDF

      df.withColumn(
        "jsonValue",
        F.from_json(df.col("value"), expectedStructType)).
        select("jsonValue.*")
    }

    val expected = Seq(
      "Foo" -> Seq(new Subject("math"), new Subject("physics")),
      "Bar" -> Seq(new Subject("biology"), new Subject("geography")))

    ds1.collect.run().map {
      case (_1, _2) => _1 -> _2.toSeq
    } shouldBe expected

    val subjects = Seq(new Subject("lorem"), new Subject("ipsum"))

    val ds2 = ds1.withColumnReplaced('_2, functions.lit(subjects.toArray))

    ds2.collect.run().map {
      case (_1, _2) => _1 -> _2.toSeq
    } shouldBe expected.map(_.copy(_2 = subjects))
  }

  test("Encode case class with simple Seq") {
    import RecordEncoderTests._

    val encoder = TypedEncoder[B]

    encoder.jvmRepr shouldBe ObjectType(classOf[B])

    val expectedStructType = StructType(Seq(
      StructField("a", ArrayType(StructType(Seq(
        StructField("x", IntegerType, false))), false), false)))

    encoder.catalystRepr shouldBe expectedStructType

    val ds1: TypedDataset[B] = {
      val rdd = sc.parallelize(Seq(
        Row.fromTuple(Tuple1(Seq(
          Row.fromTuple(Tuple1[Int](1)),
          Row.fromTuple(Tuple1[Int](3))
        ))),
        Row.fromTuple(Tuple1(Seq(
          Row.fromTuple(Tuple1[Int](2))
        )))
      ))
      val df = session.createDataFrame(rdd, expectedStructType)

      TypedDataset.createUnsafe(df)(encoder)
    }

    val expected = Seq(B(Seq(A(1), A(3))), B(Seq(A(2))))

    ds1.collect.run() shouldBe expected

    val as = Seq(A(5), A(6))

    val ds2 = ds1.withColumnReplaced('a, functions.lit(as))

    ds2.collect.run() shouldBe expected.map(_.copy(a = as))
  }

  test("Encode case class with Value class") {
    import RecordEncoderTests._

    val encoder = TypedEncoder[Tuple2[Int, Seq[Name]]]

    encoder.jvmRepr shouldBe ObjectType(classOf[Tuple2[Int, Seq[Name]]])

    val expectedStructType = StructType(Seq(
      StructField("_1", IntegerType, false),
      StructField("_2", ArrayType(StringType, false), false)))

    encoder.catalystRepr shouldBe expectedStructType

    val ds1 = TypedDataset.createUnsafe[(Int, Seq[Name])] {
      val sqlContext = session.sqlContext
      import sqlContext.implicits._

      val df = Seq(
        """{"_1":1, "_2":["foo", "bar"]}""",
        """{"_1":2, "_2":["lorem"]}""",
      ).toDF

      df.withColumn(
        "jsonValue",
        F.from_json(df.col("value"), expectedStructType)).
        select("jsonValue.*")
    }

    val expected = Seq(
      1 -> Seq(new Name("foo"), new Name("bar")),
      2 -> Seq(new Name("lorem")))

    ds1.collect.run() shouldBe expected
  }
}

// ---

case class UnitsOnly(a: Unit, b: Unit)

case class TupleWithUnits(
  u0: Unit, _1: Int, u1: Unit, u2: Unit, _2: String, u3: Unit)

object TupleWithUnits {
  def apply(_1: Int, _2: String): TupleWithUnits =
    TupleWithUnits((), _1, (), (), _2, ())
}

case class OptionalNesting(o: Option[TupleWithUnits])

object RecordEncoderTests {
  case class A(x: Int)
  case class B(a: Seq[A])
  case class C(b: B)

  class Name(val value: String) extends AnyVal with Serializable {
    override def toString = s"Name($value)"
  }

  case class Person(name: Name, age: Int)

  case class User(id: Long, name: Option[Name])

  case class D(m: Map[String, Int])
  case class E(b: Set[B])

  final class Subject(val name: String) extends AnyVal with Serializable

  final class Grade(val value: BigDecimal) extends AnyVal with Serializable

  case class Student(name: String, grades: Map[Subject, Grade])
}


================================================
FILE: dataset/src/test/scala/frameless/SchemaTests.scala
================================================
package frameless

import frameless.functions.aggregate._
import frameless.functions._
import org.apache.spark.sql.types.StructType
import org.scalacheck.Prop
import org.scalacheck.Prop._
import org.scalatest.matchers.should.Matchers

class SchemaTests extends TypedDatasetSuite with Matchers {

  def structToNonNullable(struct: StructType): StructType = {
    StructType(struct.fields.map( f => f.copy(nullable = false)))
  }

  def prop[A](dataset: TypedDataset[A], ignoreNullable: Boolean = false): Prop = {
    val schema = dataset.dataset.schema

    Prop.all(
      if (!ignoreNullable)
        dataset.schema ?= schema
      else
        structToNonNullable(dataset.schema) ?= structToNonNullable(schema),
      if (!ignoreNullable)
        TypedExpressionEncoder.targetStructType(dataset.encoder) ?= schema
      else
        structToNonNullable(TypedExpressionEncoder.targetStructType(dataset.encoder))  ?= structToNonNullable(schema)
    )
  }

  test("schema of groupBy('a).agg(sum('b))") {
    val df0 = TypedDataset.create(X2(1L, 1L) :: Nil)
    val _a = df0.col('a)
    val _b = df0.col('b)

    val df = df0.groupBy(_a).agg(sum(_b))

    check(prop(df, true))
  }

  test("schema of select(lit(1L))") {
    val df0 = TypedDataset.create("test" :: Nil)
    val df = df0.select(lit(1L))

    check(prop(df))
  }

  test("schema of select(lit(1L), lit(2L)).as[X2[Long, Long]]") {
    val df0 = TypedDataset.create("test" :: Nil)
    val df = df0.select(lit(1L), lit(2L)).as[X2[Long, Long]]

    check(prop(df))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/SelectTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import shapeless.test.illTyped
import scala.reflect.ClassTag

class SelectTests extends TypedDatasetSuite {
  test("select('a) FROM abcd") {
    def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
      implicit
      ea: TypedEncoder[A],
      ex4: TypedEncoder[X4[A, B, C, D]],
      ca: ClassTag[A]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val dataset2 = dataset.select(A).collect().run().toVector
      val data2 = data.map { case X4(a, _, _, _) => a }

      dataset2 ?= data2
    }

    check(forAll(prop[Int, Int, Int, Int] _))
    check(forAll(prop[X2[Int, Int], Int, Int, Int] _))
    check(forAll(prop[String, Int, Int, Int] _))
    check(forAll(prop[UdtEncodedClass, Int, Int, Int] _))
  }

  test("select('a, 'b) FROM abcd") {
    def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
      implicit
      ea: TypedEncoder[A],
      eb: TypedEncoder[B],
      eab: TypedEncoder[(A, B)],
      ex4: TypedEncoder[X4[A, B, C, D]],
      ca: ClassTag[A]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val dataset2 = dataset.select(A, B).collect().run().toVector
      val data2 = data.map { case X4(a, b, _, _) => (a, b) }

      dataset2 ?= data2
    }

    check(forAll(prop[Int, Int, Int, Int] _))
    check(forAll(prop[String, Int, Int, Int] _))
    check(forAll(prop[String, String, Int, Int] _))
  }

  test("select('a, 'b, 'c) FROM abcd") {
    def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
      implicit
      ea: TypedEncoder[A],
      eb: TypedEncoder[B],
      ec: TypedEncoder[C],
      eab: TypedEncoder[(A, B, C)],
      ex4: TypedEncoder[X4[A, B, C, D]],
      ca: ClassTag[A]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val dataset2 = dataset.select(A, B, C).collect().run().toVector
      val data2 = data.map { case X4(a, b, c, _) => (a, b, c) }

      dataset2 ?= data2
    }

    check(forAll(prop[Int, Int, Int, Int] _))
    check(forAll(prop[String, Int, Int, Int] _))
    check(forAll(prop[String, String, Int, Int] _))
  }

  test("select('a,'b,'c,'d) FROM abcd") {
    def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
      implicit
      ea: TypedEncoder[A],
      eb: TypedEncoder[B],
      ec: TypedEncoder[C],
      ed: TypedEncoder[D],
      ex4: TypedEncoder[X4[A, B, C, D]],
      ca: ClassTag[A]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val a1 = dataset.col[A]('a)
      val a2 = dataset.col[B]('b)
      val a3 = dataset.col[C]('c)
      val a4 = dataset.col[D]('d)

      val dataset2 = dataset.select(a1, a2, a3, a4).collect().run().toVector
      val data2 = data.map { case X4(a, b, c, d) => (a, b, c, d) }

      dataset2 ?= data2
    }

    check(forAll(prop[Int, Int, Int, Int] _))
    check(forAll(prop[String, Int, Int, Int] _))
    check(forAll(prop[String, Boolean, Int, Float] _))
  }

  test("select('a,'b,'c,'d,'a) FROM abcd") {
    def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
      implicit
      ea: TypedEncoder[A],
      eb: TypedEncoder[B],
      ec: TypedEncoder[C],
      ed: TypedEncoder[D],
      ex4: TypedEncoder[X4[A, B, C, D]],
      ca: ClassTag[A]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val a1 = dataset.col[A]('a)
      val a2 = dataset.col[B]('b)
      val a3 = dataset.col[C]('c)
      val a4 = dataset.col[D]('d)

      val dataset2 = dataset.select(a1, a2, a3, a4, a1).collect().run().toVector
      val data2 = data.map { case X4(a, b, c, d) => (a, b, c, d, a) }

      dataset2 ?= data2
    }

    check(forAll(prop[Int, Int, Int, Int] _))
    check(forAll(prop[String, Int, Int, Int] _))
    check(forAll(prop[String, Boolean, Int, Float] _))
  }

  test("select('a,'b,'c,'d,'a, 'c) FROM abcd") {
    def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
      implicit
      ea: TypedEncoder[A],
      eb: TypedEncoder[B],
      ec: TypedEncoder[C],
      ed: TypedEncoder[D],
      ex4: TypedEncoder[X4[A, B, C, D]],
      ca: ClassTag[A]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val a1 = dataset.col[A]('a)
      val a2 = dataset.col[B]('b)
      val a3 = dataset.col[C]('c)
      val a4 = dataset.col[D]('d)

      val dataset2 = dataset.select(a1, a2, a3, a4, a1, a3).collect().run().toVector
      val data2 = data.map { case X4(a, b, c, d) => (a, b, c, d, a, c) }

      dataset2 ?= data2
    }

    check(forAll(prop[Int, Int, Int, Int] _))
    check(forAll(prop[String, Int, Int, Int] _))
    check(forAll(prop[String, Boolean, Int, Float] _))
  }

  test("select('a,'b,'c,'d,'a,'c,'b) FROM abcd") {
    def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
      implicit
      ea: TypedEncoder[A],
      eb: TypedEncoder[B],
      ec: TypedEncoder[C],
      ed: TypedEncoder[D],
      ex4: TypedEncoder[X4[A, B, C, D]],
      ca: ClassTag[A]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val a1 = dataset.col[A]('a)
      val a2 = dataset.col[B]('b)
      val a3 = dataset.col[C]('c)
      val a4 = dataset.col[D]('d)

      val dataset2 = dataset.select(a1, a2, a3, a4, a1, a3, a2).collect().run().toVector
      val data2 = data.map { case X4(a, b, c, d) => (a, b, c, d, a, c, b) }

      dataset2 ?= data2
    }

    check(forAll(prop[Int, Int, Int, Int] _))
    check(forAll(prop[String, Int, Int, Int] _))
    check(forAll(prop[String, Boolean, Int, Float] _))
  }

  test("select('a,'b,'c,'d,'a,'c,'b, 'a) FROM abcd") {
    def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
      implicit
      ea: TypedEncoder[A],
      eb: TypedEncoder[B],
      ec: TypedEncoder[C],
      ed: TypedEncoder[D],
      ex4: TypedEncoder[X4[A, B, C, D]],
      ca: ClassTag[A]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val a1 = dataset.col[A]('a)
      val a2 = dataset.col[B]('b)
      val a3 = dataset.col[C]('c)
      val a4 = dataset.col[D]('d)

      val dataset2 = dataset.select(a1, a2, a3, a4, a1, a3, a2, a1).collect().run().toVector
      val data2 = data.map { case X4(a, b, c, d) => (a, b, c, d, a, c, b, a) }

      dataset2 ?= data2
    }

    check(forAll(prop[Int, Int, Int, Int] _))
    check(forAll(prop[String, Int, Int, Int] _))
    check(forAll(prop[String, Boolean, Int, Float] _))
  }

  test("select('a,'b,'c,'d,'a,'c,'b,'a,'c) FROM abcd") {
    def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
      implicit
      ea: TypedEncoder[A],
      eb: TypedEncoder[B],
      ec: TypedEncoder[C],
      ed: TypedEncoder[D],
      ex4: TypedEncoder[X4[A, B, C, D]],
      ca: ClassTag[A]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val a1 = dataset.col[A]('a)
      val a2 = dataset.col[B]('b)
      val a3 = dataset.col[C]('c)
      val a4 = dataset.col[D]('d)

      val dataset2 = dataset.select(a1, a2, a3, a4, a1, a3, a2, a1, a3).collect().run().toVector
      val data2 = data.map { case X4(a, b, c, d) => (a, b, c, d, a, c, b, a, c) }

      dataset2 ?= data2
    }

    check(forAll(prop[Int, Int, Int, Int] _))
    check(forAll(prop[String, Int, Int, Int] _))
    check(forAll(prop[String, Boolean, Int, Float] _))
  }

  test("select('a,'b,'c,'d,'a,'c,'b,'a,'c, 'd) FROM abcd") {
    def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
      implicit
      ea: TypedEncoder[A],
      eb: TypedEncoder[B],
      ec: TypedEncoder[C],
      ed: TypedEncoder[D],
      ex4: TypedEncoder[X4[A, B, C, D]],
      ca: ClassTag[A]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val a1 = dataset.col[A]('a)
      val a2 = dataset.col[B]('b)
      val a3 = dataset.col[C]('c)
      val a4 = dataset.col[D]('d)

      val dataset2 = dataset.select(a1, a2, a3, a4, a1, a3, a2, a1, a3, a4).collect().run().toVector
      val data2 = data.map { case X4(a, b, c, d) => (a, b, c, d, a, c, b, a, c, d) }

      dataset2 ?= data2
    }

    check(forAll(prop[Int, Int, Int, Int] _))
    check(forAll(prop[String, Int, Int, Int] _))
    check(forAll(prop[String, Boolean, Int, Float] _))
  }

  test("select('a.b)") {
    def prop[A, B, C](data: Vector[X2[X2[A, B], C]])(
      implicit
      eabc: TypedEncoder[X2[X2[A, B], C]],
      eb: TypedEncoder[B],
      cb: ClassTag[B]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val AB = dataset.colMany('a, 'b)

      val dataset2 = dataset.select(AB).collect().run().toVector
      val data2 = data.map { case X2(X2(_, b), _) => b }

      dataset2 ?= data2
    }

    check(forAll(prop[Int, String, Double] _))
  }

  test("select with column expression addition") {
    def prop[A](data: Vector[X1[A]], const: A)(
      implicit
      eabc: TypedEncoder[X1[A]],
      anum: CatalystNumeric[A],
      num: Numeric[A],
      eb: TypedEncoder[A]
    ): Prop = {
      val ds = TypedDataset.create(data)

      val dataset2 = ds.select(ds('a) + const).collect().run().toVector
      val data2 = data.map { case X1(a) => num.plus(a, const) }

      dataset2 ?= data2
    }

    check(forAll(prop[Short] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Double] _))
  }

  test("select with column expression multiplication") {
    def prop[A](data: Vector[X1[A]], const: A)(
      implicit
      eabc: TypedEncoder[X1[A]],
      anum: CatalystNumeric[A],
      num: Numeric[A],
      eb: TypedEncoder[A]
    ): Prop = {
      val ds = TypedDataset.create(data)

      val dataset2 = ds.select(ds('a) * const).collect().run().toVector
      val data2 = data.map { case X1(a) => num.times(a, const) }

      dataset2 ?= data2
    }

    check(forAll(prop[Short] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Double] _))
  }

  test("select with column expression subtraction") {
    def prop[A](data: Vector[X1[A]], const: A)(
      implicit
      eabc: TypedEncoder[X1[A]],
      cnum: CatalystNumeric[A],
      num: Numeric[A],
      eb: TypedEncoder[A]
    ): Prop = {
      val ds = TypedDataset.create(data)

      val dataset2 = ds.select(ds('a) - const).collect().run().toVector
      val data2 = data.map { case X1(a) => num.minus(a, const) }

      dataset2 ?= data2
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Double] _))
  }

  test("select with column expression division") {
    def prop[A](data: Vector[X1[A]], const: A)(
      implicit
      eabc: TypedEncoder[X1[A]],
      anum: CatalystNumeric[A],
      frac: Fractional[A],
      eb: TypedEncoder[A]
    ): Prop = {
      val ds = TypedDataset.create(data)

      if (const != 0) {
        val dataset2 = ds.select(ds('a) / const).collect().run().toVector.asInstanceOf[Vector[A]]
        val data2 = data.map { case X1(a) => frac.div(a, const) }
        dataset2 ?= data2
      } else 0 ?= 0
    }

    check(forAll(prop[Double] _))
  }

  test("tests to cover problematic dataframe column names during projections") {
    case class Foo(i: Int)
    val e = TypedDataset.create[Foo](Foo(1) :: Nil)
    val t: TypedDataset[(Int, Int)] = e.select(e.col('i) * 2, e.col('i))
    assert(t.select(t.col('_1)).collect().run().toList === List(2))
    // Issue #54
    val fooT = t.select(t.col('_1)).deserialized.map(x => Tuple1.apply(x)).as[Foo]
    assert(fooT.select(fooT('i)).collect().run().toList === List(2))
  }

  test("unary - on arithmetic") {
    val e = TypedDataset.create[(Int, String, Int)]((1, "a", 2) :: (2, "b", 4) :: (2, "b", 1) :: Nil)
    assert(e.select(-e('_1)).collect().run().toVector === Vector(-1, -2, -2))
    assert(e.select(-(e('_1) + e('_3))).collect().run().toVector === Vector(-3, -6, -3))
  }

  test("unary - on strings should not type check") {
    val e = TypedDataset.create[(Int, String, Long)]((1, "a", 2L) :: (2, "b", 4L) :: (2, "b", 1L) :: Nil)
    illTyped("""e.select( -e('_2) )""")
  }

  test("select with aggregation operations is not supported") {
    val e = TypedDataset.create[(Int, String, Long)]((1, "a", 2L) :: (2, "b", 4L) :: (2, "b", 1L) :: Nil)
    illTyped("""e.select(frameless.functions.aggregate.sum(e('_1)))""")
  }
}


================================================
FILE: dataset/src/test/scala/frameless/SelfJoinTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import org.apache.spark.sql.{SparkSession, functions => sparkFunctions}

class SelfJoinTests extends TypedDatasetSuite {
  // Without crossJoin.enabled=true Spark doesn't like trivial join conditions:
  // [error] Join condition is missing or trivial.
  // [error] Use the CROSS JOIN syntax to allow cartesian products between these relations.
  def allowTrivialJoin[T](body: => T)(implicit session: SparkSession): T = {
    val crossJoin = "spark.sql.crossJoin.enabled"
    val oldSetting = session.conf.get(crossJoin)
    session.conf.set(crossJoin, "true")
    val result = body
    session.conf.set(crossJoin, oldSetting)
    result
  }

  def allowAmbiguousJoin[T](body: => T)(implicit session: SparkSession): T = {
    val crossJoin = "spark.sql.analyzer.failAmbiguousSelfJoin"
    val oldSetting = session.conf.get(crossJoin)
    session.conf.set(crossJoin, "false")
    val result = body
    session.conf.set(crossJoin, oldSetting)
    result
  }

  test("self join with colLeft/colRight disambiguation") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering
    ](dx: List[X2[A, B]], d: X2[A, B]): Prop = allowAmbiguousJoin {
      val data = d :: dx
      val ds = TypedDataset.create(data)

      // This is the way to write unambiguous self-join in vanilla, see https://goo.gl/XnkSUD
      val df1 = ds.dataset.as("df1")
      val df2 = ds.dataset.as("df2")
      val vanilla = df1.join(df2,
        sparkFunctions.col("df1.a") === sparkFunctions.col("df2.a")).count()

      val typed = ds.joinInner(ds)(
        ds.colLeft('a) === ds.colRight('a)
      ).count().run()

      vanilla ?= typed
    }

    check(prop[Int, Int] _)
  }

  test("trivial self join") {
    def prop[
      A : TypedEncoder : Ordering,
      B : TypedEncoder : Ordering
    ](dx: List[X2[A, B]], d: X2[A, B]): Prop =
      allowTrivialJoin { allowAmbiguousJoin {

        val data = d :: dx
        val ds = TypedDataset.create(data)
        val untyped = ds.dataset
        // Interestingly, even with aliasing it seems that it's impossible to
        // obtain a trivial join condition of shape df1.a == df1.a, Spark we
        // always interpret that as df1.a == df2.a. For the purpose of this
        // test we fall-back to lit(true) instead.
        // val trivial = sparkFunctions.col("df1.a") === sparkFunctions.col("df1.a")
        val trivial = sparkFunctions.lit(true)
        val vanilla = untyped.as("df1").join(untyped.as("df2"), trivial).count()

        val typed = ds.joinInner(ds)(ds.colLeft('a) === ds.colLeft('a)).count().run
        vanilla ?= typed
      } }

    check(prop[Int, Int] _)
  }

  test("self join with unambiguous expression") {
    def prop[
      A : TypedEncoder : CatalystNumeric : Ordering,
      B : TypedEncoder : Ordering
    ](data: List[X3[A, A, B]]): Prop = allowAmbiguousJoin {
      val ds = TypedDataset.create(data)

      val df1 = ds.dataset.alias("df1")
      val df2 = ds.dataset.alias("df2")

      val vanilla = df1.join(df2,
        (sparkFunctions.col("df1.a") + sparkFunctions.col("df1.b")) ===
        (sparkFunctions.col("df2.a") + sparkFunctions.col("df2.b"))).count()

      val typed = ds.joinInner(ds)(
        (ds.colLeft('a) + ds.colLeft('b)) === (ds.colRight('a) + ds.colRight('b))
      ).count().run()

      vanilla ?= typed
    }

    check(prop[Int, Int] _)
  }

  test("Do you want ambiguous self join? This is how you get ambiguous self join.") {
    def prop[
      A : TypedEncoder : CatalystNumeric : Ordering,
      B : TypedEncoder : Ordering
    ](data: List[X3[A, A, B]]): Prop =
      allowTrivialJoin { allowAmbiguousJoin {
        val ds = TypedDataset.create(data)

        // The point I'm making here is that it "behaves just like Spark". I
        // don't know (or really care about how) how Spark disambiguates that
        // internally...
        val vanilla = ds.dataset.join(ds.dataset,
          (ds.dataset("a") + ds.dataset("b")) ===
          (ds.dataset("a") + ds.dataset("b"))).count()

        val typed = ds.joinInner(ds)(
          (ds.col('a) + ds.col('b)) === (ds.col('a) + ds.col('b))
        ).count().run()

        vanilla ?= typed
      } }

      check(prop[Int, Int] _)
    }

  test("colLeft and colRight are equivalent to col outside of joins") {
    def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
      implicit
      ea: TypedEncoder[A],
      ex4: TypedEncoder[X4[A, B, C, D]]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val selectedCol      = dataset.select(dataset.col     [A]('a)).collect().run().toVector
      val selectedColLeft  = dataset.select(dataset.colLeft [A]('a)).collect().run().toVector
      val selectedColRight = dataset.select(dataset.colRight[A]('a)).collect().run().toVector

      (selectedCol ?= selectedColLeft) && (selectedCol ?= selectedColRight)
    }

    check(forAll(prop[Int, Int, Int, Int] _))
    check(forAll(prop[X2[Int, Int], Int, Int, Int] _))
    check(forAll(prop[String, Int, Int, Int] _))
    check(forAll(prop[UdtEncodedClass, Int, Int, Int] _))
  }

  test("colLeft and colRight are equivalent to col outside of joins - via files (codegen)") {
    def prop[A, B, C, D](data: Vector[X4[A, B, C, D]])(
      implicit
      ea: TypedEncoder[A],
      ex4: TypedEncoder[X4[A, B, C, D]]
    ): Prop = {
      TypedDataset.create(data).write.mode("overwrite").parquet("./target/testData")
      val dataset = TypedDataset.createUnsafe[X4[A, B, C, D]](session.read.parquet("./target/testData"))
      val selectedCol      = dataset.select(dataset.col     [A]('a)).collect().run().toVector
      val selectedColLeft  = dataset.select(dataset.colLeft [A]('a)).collect().run().toVector
      val selectedColRight = dataset.select(dataset.colRight[A]('a)).collect().run().toVector

      (selectedCol ?= selectedColLeft) && (selectedCol ?= selectedColRight)
    }

    check(forAll(prop[Int, Int, Int, Int] _))
    check(forAll(prop[X2[Int, Int], Int, Int, Int] _))
    check(forAll(prop[String, Int, Int, Int] _))
    check(forAll(prop[UdtEncodedClass, Int, Int, Int] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/TypedDatasetSuite.scala
================================================
package frameless

import com.globalmentor.apache.hadoop.fs.BareLocalFileSystem
import org.apache.hadoop.fs.local.StreamingFS
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.scalactic.anyvals.PosZInt
import org.scalatest.BeforeAndAfterAll
import org.scalatestplus.scalacheck.Checkers
import org.scalacheck.Prop
import org.scalacheck.Prop._

import scala.util.{Properties, Try}
import org.scalatest.funsuite.AnyFunSuite

trait SparkTesting { self: BeforeAndAfterAll =>

  val appID: String = new java.util.Date().toString + math.floor(math.random * 10E4).toLong.toString

  /**
   * Allows bare naked to be used instead of winutils for testing / dev
   */
  def registerFS(sparkConf: SparkConf): SparkConf = {
    if (System.getProperty("os.name").startsWith("Windows"))
      sparkConf.set("spark.hadoop.fs.file.impl", classOf[BareLocalFileSystem].getName).
        set("spark.hadoop.fs.AbstractFileSystem.file.impl", classOf[StreamingFS].getName)
    else
      sparkConf
  }

  val conf: SparkConf = registerFS(new SparkConf())
    .setMaster("local[*]")
    .setAppName("test")
    .set("spark.ui.enabled", "false")
    .set("spark.app.id", appID)

  private var s: SparkSession = _

  implicit def session: SparkSession = s
  implicit def sc: SparkContext = session.sparkContext
  implicit def sqlContext: SQLContext = session.sqlContext

  def registerOptimizations(sqlContext: SQLContext): Unit = { }

  def addSparkConfigProperties(config: SparkConf): Unit = { }

  override def beforeAll(): Unit = {
    assert(s == null)
    addSparkConfigProperties(conf)
    s = SparkSession.builder().config(conf).getOrCreate()
    registerOptimizations(sqlContext)
  }

  override def afterAll(): Unit = {
    if (s != null) {
      s.stop()
      s = null
    }
  }
}


class TypedDatasetSuite extends AnyFunSuite with Checkers with BeforeAndAfterAll with SparkTesting {
  // Limit size of generated collections and number of checks to avoid OutOfMemoryError
  implicit override val generatorDrivenConfig: PropertyCheckConfiguration = {
    def getPosZInt(name: String, default: PosZInt) = Properties.envOrNone(s"FRAMELESS_GEN_${name}")
      .flatMap(s => Try(s.toInt).toOption)
      .flatMap(PosZInt.from)
      .getOrElse(default)
    PropertyCheckConfiguration(
      sizeRange = getPosZInt("SIZE_RANGE", PosZInt(20)),
      minSize = getPosZInt("MIN_SIZE", PosZInt(0))
    )
  }

  implicit val sparkDelay: SparkDelay[Job] = Job.framelessSparkDelayForJob

  def approximatelyEqual[A](a: A, b: A)(implicit numeric: Numeric[A]): Prop = {
    val da = numeric.toDouble(a)
    val db = numeric.toDouble(b)
    val epsilon = 1E-6
    // Spark has a weird behaviour concerning expressions that should return Inf
    // Most of the time they return NaN instead, for instance stddev of Seq(-7.827553978923477E227, -5.009124275715786E153)
    if((da.isNaN || da.isInfinity) && (db.isNaN || db.isInfinity)) proved
    else if (
      (da - db).abs < epsilon ||
      (da - db).abs < da.abs / 100)
        proved
    else falsified :| s"Expected $a but got $b, which is more than 1% off and greater than epsilon = $epsilon."
  }
}


================================================
FILE: dataset/src/test/scala/frameless/UdtEncodedClass.scala
================================================
package frameless

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData}
import org.apache.spark.sql.types._
import org.apache.spark.sql.FramelessInternals.UserDefinedType

@SQLUserDefinedType(udt = classOf[UdtEncodedClassUdt])
class UdtEncodedClass(val a: Int, val b: Array[Double]) {
  override def equals(other: Any): Boolean = other match {
    case that: UdtEncodedClass => a == that.a && java.util.Arrays.equals(b, that.b)
    case _ => false
  }

  override def hashCode(): Int = {
    val state = Seq[Any](a, b)
    state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
  }

  override def toString = s"UdtEncodedClass($a, $b)"
}

object UdtEncodedClass {
  implicit val udtForUdtEncodedClass = new UdtEncodedClassUdt
}

class UdtEncodedClassUdt extends UserDefinedType[UdtEncodedClass] {
  def sqlType: DataType = {
    StructType(Seq(
      StructField("a", IntegerType, nullable = false),
      StructField("b", ArrayType(DoubleType, containsNull = false), nullable = false)
    ))
  }

  def serialize(obj: UdtEncodedClass): InternalRow = {
    val row = new GenericInternalRow(3)
    row.setInt(0, obj.a)
    row.update(1, UnsafeArrayData.fromPrimitiveArray(obj.b))
    row
  }

  def deserialize(datum: Any): UdtEncodedClass = datum match {
    case row: InternalRow => new UdtEncodedClass(row.getInt(0), row.getArray(1).toDoubleArray())
  }

  def userClass: Class[UdtEncodedClass] = classOf[UdtEncodedClass]
}


================================================
FILE: dataset/src/test/scala/frameless/WithColumnTest.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import shapeless.test.illTyped

class WithColumnTest extends TypedDatasetSuite {
  import WithColumnTest._

  test("fail to compile on missing value") {
    val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil)
    illTyped {
      """val fNew: TypedDataset[XMissing] = f.withColumn[XMissing](f('j) === 10)"""
    }
  }

  test("fail to compile on different column name") {
    val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil)
    illTyped {
      """val fNew: TypedDataset[XDifferentColumnName] = f.withColumn[XDifferentColumnName](f('j) === 10)"""
    }
  }

  test("fail to compile on added column name") {
    val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil)
    illTyped {
      """val fNew: TypedDataset[XAdded] = f.withColumn[XAdded](f('j) === 10)"""
    }
  }

  test("fail to compile on wrong typed column") {
    val f: TypedDataset[X] = TypedDataset.create(X(1,1) :: X(1,1) :: X(1,10) :: Nil)
    illTyped {
      """val fNew: TypedDataset[XWrongType] = f.withColumn[XWrongType](f('j) === 10)"""
    }
  }

  test("append four columns") {
    def prop[A: TypedEncoder](value: A): Prop = {
      val d = TypedDataset.create(X1(value) :: Nil)
      val d1 = d.withColumn[X2[A, A]](d('a))
      val d2 = d1.withColumn[X3[A, A, A]](d1('b))
      val d3 = d2.withColumn[X4[A, A, A, A]](d2('c))
      val d4 = d3.withColumn[X5[A, A, A, A, A]](d3('d))

      X5(value, value, value, value, value) ?= d4.collect().run().head
    }

    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[String] _)
    check(prop[SQLDate] _)
    check(prop[Option[X1[Boolean]]] _)
  }

  test("update in place") {
    def prop[A : TypedEncoder](startValue: A, replaceValue: A): Prop = {
      val d = TypedDataset.create(X2(startValue, replaceValue) :: Nil)

      val X2(a, b) = d.withColumnReplaced('a, d('b))
        .collect()
        .run()
        .head

      a ?= b
    }
    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[String] _)
    check(prop[SQLDate] _)
    check(prop[Option[X1[Boolean]]] _)
  }
}

object WithColumnTest {
  case class X(i: Int, j: Int)
  case class XMissing(i: Int, k: Boolean)
  case class XDifferentColumnName(i: Int, ji: Int, k: Boolean)
  case class XAdded(i: Int, j: Int, k: Boolean, l: Int)
  case class XWrongType(i: Int, j: Int, k: Int)
  case class XGood(i: Int, j: Int, k: Boolean)
}


================================================
FILE: dataset/src/test/scala/frameless/WithColumnTupledTest.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

class WithColumnTupledTest extends TypedDatasetSuite {
  test("append five columns") {
    def prop[A: TypedEncoder](value: A): Prop = {
      val d = TypedDataset.create(X1(value) :: Nil)
      val d1 = d.withColumnTupled(d('a))
      val d2 = d1.withColumnTupled(d1('_1))
      val d3 = d2.withColumnTupled(d2('_2))
      val d4 = d3.withColumnTupled(d3('_3))
      val d5 = d4.withColumnTupled(d4('_4))

      (value, value, value, value, value, value) ?= d5.collect().run().head
    }

    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[String] _)
    check(prop[SQLDate] _)
    check(prop[Option[X1[Boolean]]] _)
  }
}


================================================
FILE: dataset/src/test/scala/frameless/XN.scala
================================================
package frameless

import org.scalacheck.{Arbitrary, Cogen}

case class X1[A](a: A)

object X1 {
  implicit def arbitrary[A: Arbitrary]: Arbitrary[X1[A]] =
    Arbitrary(implicitly[Arbitrary[A]].arbitrary.map(X1(_)))

  implicit def cogen[A](implicit A: Cogen[A]): Cogen[X1[A]] =
    A.contramap(_.a)

  implicit def ordering[A: Ordering]: Ordering[X1[A]] = Ordering[A].on(_.a)
}

case class X2[A, B](a: A, b: B)

object X2 {
  implicit def arbitrary[A: Arbitrary, B: Arbitrary]: Arbitrary[X2[A, B]] =
    Arbitrary(Arbitrary.arbTuple2[A, B].arbitrary.map((X2.apply[A, B] _).tupled))

  implicit def cogen[A, B](implicit A: Cogen[A], B: Cogen[B]): Cogen[X2[A, B]] =
    Cogen.tuple2(A, B).contramap(x => (x.a, x.b))

  implicit def ordering[A: Ordering, B: Ordering]: Ordering[X2[A, B]] = Ordering.Tuple2[A, B].on(x => (x.a, x.b))
}

case class X3[A, B, C](a: A, b: B, c: C)

object X3 {
  implicit def arbitrary[A: Arbitrary, B: Arbitrary, C: Arbitrary]: Arbitrary[X3[A, B, C]] =
    Arbitrary(Arbitrary.arbTuple3[A, B, C].arbitrary.map((X3.apply[A, B, C] _).tupled))

  implicit def cogen[A, B, C](implicit A: Cogen[A], B: Cogen[B], C: Cogen[C]): Cogen[X3[A, B, C]] =
    Cogen.tuple3(A, B, C).contramap(x => (x.a, x.b, x.c))

  implicit def ordering[A: Ordering, B: Ordering, C: Ordering]: Ordering[X3[A, B, C]] =
    Ordering.Tuple3[A, B, C].on(x => (x.a, x.b, x.c))
}

case class X3U[A, B, C](a: A, b: B, u: Unit, c: C)

object X3U {
  implicit def arbitrary[A: Arbitrary, B: Arbitrary, C: Arbitrary]: Arbitrary[X3U[A, B, C]] =
    Arbitrary(Arbitrary.arbTuple3[A, B, C].arbitrary.map(x => X3U[A, B, C](x._1, x._2, (), x._3)))

  implicit def cogen[A, B, C](implicit A: Cogen[A], B: Cogen[B], C: Cogen[C]): Cogen[X3U[A, B, C]] =
    Cogen.tuple3(A, B, C).contramap(x => (x.a, x.b, x.c))

  implicit def ordering[A: Ordering, B: Ordering, C: Ordering]: Ordering[X3U[A, B, C]] =
    Ordering.Tuple3[A, B, C].on(x => (x.a, x.b, x.c))
}

case class X3KV[A, B, C](key: A, value: B, c: C)

object X3KV {
  implicit def arbitrary[A: Arbitrary, B: Arbitrary, C: Arbitrary]: Arbitrary[X3KV[A, B, C]] =
    Arbitrary(Arbitrary.arbTuple3[A, B, C].arbitrary.map((X3KV.apply[A, B, C] _).tupled))

  implicit def cogen[A, B, C](implicit A: Cogen[A], B: Cogen[B], C: Cogen[C]): Cogen[X3KV[A, B, C]] =
    Cogen.tuple3(A, B, C).contramap(x => (x.key, x.value, x.c))

  implicit def ordering[A: Ordering, B: Ordering, C: Ordering]: Ordering[X3KV[A, B, C]] =
    Ordering.Tuple3[A, B, C].on(x => (x.key, x.value, x.c))
}

case class X4[A, B, C, D](a: A, b: B, c: C, d: D)

object X4 {
  implicit def arbitrary[A: Arbitrary, B: Arbitrary, C: Arbitrary, D: Arbitrary]: Arbitrary[X4[A, B, C, D]] =
    Arbitrary(Arbitrary.arbTuple4[A, B, C, D].arbitrary.map((X4.apply[A, B, C, D] _).tupled))

  implicit def cogen[A, B, C, D](implicit A: Cogen[A], B: Cogen[B], C: Cogen[C], D: Cogen[D]): Cogen[X4[A, B, C, D]] =
    Cogen.tuple4(A, B, C, D).contramap(x => (x.a, x.b, x.c, x.d))

  implicit def ordering[A: Ordering, B: Ordering, C: Ordering, D: Ordering]: Ordering[X4[A, B, C, D]] =
    Ordering.Tuple4[A, B, C, D].on(x => (x.a, x.b, x.c, x.d))
}

case class X5[A, B, C, D, E](a: A, b: B, c: C, d: D, e: E)

object X5 {
  implicit def arbitrary[A: Arbitrary, B: Arbitrary, C: Arbitrary, D: Arbitrary, E: Arbitrary]: Arbitrary[X5[A, B, C, D, E]] =
    Arbitrary(Arbitrary.arbTuple5[A, B, C, D, E].arbitrary.map((X5.apply[A, B, C, D, E] _).tupled))

  implicit def cogen[A, B, C, D, E](implicit A: Cogen[A], B: Cogen[B], C: Cogen[C], D: Cogen[D], E: Cogen[E]): Cogen[X5[A, B, C, D, E]] =
    Cogen.tuple5(A, B, C, D, E).contramap(x => (x.a, x.b, x.c, x.d, x.e))

  implicit def ordering[A: Ordering, B: Ordering, C: Ordering, D: Ordering, E: Ordering]: Ordering[X5[A, B, C, D, E]] =
    Ordering.Tuple5[A, B, C, D, E].on(x => (x.a, x.b, x.c, x.d, x.e))
}

case class X6[A, B, C, D, E, F](a: A, b: B, c: C, d: D, e: E, f: F)

object X6 {
  implicit def arbitrary[A: Arbitrary, B: Arbitrary, C: Arbitrary, D: Arbitrary, E: Arbitrary, F: Arbitrary]: Arbitrary[X6[A, B, C, D, E, F]] =
    Arbitrary(Arbitrary.arbTuple6[A, B, C, D, E, F].arbitrary.map((X6.apply[A, B, C, D, E, F] _).tupled))

  implicit def cogen[A, B, C, D, E, F](implicit A: Cogen[A], B: Cogen[B], C: Cogen[C], D: Cogen[D], E: Cogen[E], F: Cogen[F]): Cogen[X6[A, B, C, D, E, F]] =
    Cogen.tuple6(A, B, C, D, E, F).contramap(x => (x.a, x.b, x.c, x.d, x.e, x.f))

  implicit def ordering[A: Ordering, B: Ordering, C: Ordering, D: Ordering, E: Ordering, F: Ordering]: Ordering[X6[A, B, C, D, E, F]] =
    Ordering.Tuple6[A, B, C, D, E, F].on(x => (x.a, x.b, x.c, x.d, x.e, x.f))
}

================================================
FILE: dataset/src/test/scala/frameless/forward/CheckpointTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop.{forAll, _}


class CheckpointTests extends TypedDatasetSuite {
  test("checkpoint") {
    def prop[A: TypedEncoder](data: Vector[A], isEager: Boolean): Prop = {
      val dataset = TypedDataset.create(data)

      dataset.sparkSession.sparkContext.setCheckpointDir(TEST_OUTPUT_DIR)

      dataset.checkpoint(isEager).run().queryExecution.toString() =?
        dataset.dataset.checkpoint(isEager).queryExecution.toString()
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/ColumnsTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop.forAll

class ColumnsTests extends TypedDatasetSuite {
  test("columns") {
    def prop(i: Int, s: String, b: Boolean, l: Long, d: Double, by: Byte): Prop = {
      val x1 = X1(i) :: Nil
      val x2 = X2(i, s) :: Nil
      val x3 = X3(i, s, b) :: Nil
      val x4 = X4(i, s, b, l) :: Nil
      val x5 = X5(i, s, b, l, d) :: Nil
      val x6 = X6(i, s, b, l, d, by) :: Nil

      val datasets = Seq(TypedDataset.create(x1), TypedDataset.create(x2),
        TypedDataset.create(x3), TypedDataset.create(x4),
        TypedDataset.create(x5), TypedDataset.create(x6))

      Prop.all(datasets.flatMap { dataset =>
        val columns = dataset.dataset.columns
        dataset.columns.map(col =>
          Prop.propBoolean(columns contains col)
        )
      }: _*)
    }

    check(forAll(prop _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/CountTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

class CountTests extends TypedDatasetSuite {
  test("count") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop =
      TypedDataset.create(data).count().run() ?= data.size.toLong

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/forward/DistinctTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import math.Ordering

class DistinctTests extends TypedDatasetSuite {
  test("distinct") {
    // Comparison done with `.sorted` because order is not preserved by Spark for this operation.
    def prop[A: TypedEncoder : Ordering](data: Vector[A]): Prop =
      TypedDataset.create(data).distinct.collect().run().toVector.sorted ?= data.distinct.sorted

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/forward/ExceptTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

class ExceptTests extends TypedDatasetSuite {
  test("except") {
    def prop[A: TypedEncoder](data1: Set[A], data2: Set[A]): Prop = {
      val dataset1 = TypedDataset.create(data1.toSeq)
      val dataset2 = TypedDataset.create(data2.toSeq)
      val datasetSubtract = dataset1.except(dataset2).collect().run().toVector
      val dataSubtract = data1.diff(data2)

      Prop.all(
        datasetSubtract.size ?= dataSubtract.size,
        datasetSubtract.toSet ?= dataSubtract
      )
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/forward/FirstTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import org.scalatest.matchers.should.Matchers

class FirstTests extends TypedDatasetSuite with Matchers {
  test("first") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop =
      TypedDataset.create(data).firstOption().run() =? data.headOption

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }

  test("first on empty dataset should return None") {
    TypedDataset.create(Vector[Int]()).firstOption().run() shouldBe None
  }
}


================================================
FILE: dataset/src/test/scala/frameless/forward/ForeachTests.scala
================================================
package frameless
package forward

import org.apache.spark.util.CollectionAccumulator

import org.scalacheck.Prop
import org.scalacheck.Prop._

import scala.collection.JavaConverters._

class ForeachTests extends TypedDatasetSuite {
  test("foreach") {
    def prop[A: Ordering: TypedEncoder](data: Vector[A]): Prop = {
      val accu = new CollectionAccumulator[A]()
      sc.register(accu)

      TypedDataset.create(data).foreach(accu.add).run()

      accu.value.asScala.toVector.sorted ?= data.sorted
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }

  test("foreachPartition") {
    def prop[A: Ordering: TypedEncoder](data: Vector[A]): Prop = {
      val accu = new CollectionAccumulator[A]()
      sc.register(accu)

      TypedDataset.create(data).foreachPartition(_.foreach(accu.add)).run()

      accu.value.asScala.toVector.sorted ?= data.sorted
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/forward/HeadTests.scala
================================================
package frameless.forward

import frameless.{TypedDataset, TypedDatasetSuite, TypedEncoder, TypedExpressionEncoder, X1}
import org.apache.spark.sql.SparkSession
import org.scalacheck.Prop
import org.scalacheck.Prop._

import scala.reflect.ClassTag
import org.scalatest.matchers.should.Matchers

class HeadTests extends TypedDatasetSuite with Matchers {
  def propArray[A: TypedEncoder : ClassTag : Ordering](data: Vector[X1[A]])(implicit c: SparkSession): Prop = {
    import c.implicits._
    if(data.nonEmpty) {
      val tds = TypedDataset.
        create(c.createDataset(data)(
          TypedExpressionEncoder.apply[X1[A]]
        ).orderBy($"a".desc))
        (tds.headOption().run().get ?= data.max).
        &&(tds.head(1).run().head ?= data.max).
        &&(tds.head(4).run().toVector ?=
          data.sortBy(_.a)(implicitly[Ordering[A]].reverse).take(4))
    } else Prop.passed
  }

  test("headOption(), head(1), and head(4)") {
    check(propArray[Int] _)
    check(propArray[Char] _)
    check(propArray[String] _)
  }
}


================================================
FILE: dataset/src/test/scala/frameless/forward/InputFilesTests.scala
================================================
package frameless

import java.util.UUID

import org.apache.spark.sql.SparkSession
import org.scalacheck.Prop
import org.scalacheck.Prop._
import org.scalatest.matchers.should.Matchers

class InputFilesTests extends TypedDatasetSuite with Matchers {
  test("inputFiles") {

    def propText[A: TypedEncoder](data: Vector[A]): Prop = {
      val filePath = s"$TEST_OUTPUT_DIR/${UUID.randomUUID()}.txt"

      TypedDataset.create(data).dataset.write.text(filePath)
      val dataset = TypedDataset.create(implicitly[SparkSession].sparkContext.textFile(filePath))

      dataset.inputFiles sameElements dataset.dataset.inputFiles
    }

    def propCsv[A: TypedEncoder](data: Vector[A]): Prop = {
      val filePath = s"$TEST_OUTPUT_DIR/${UUID.randomUUID()}.csv"
      val inputDataset = TypedDataset.create(data)
      inputDataset.dataset.write.csv(filePath)

      val dataset = TypedDataset.createUnsafe(
        implicitly[SparkSession].sqlContext.read.schema(inputDataset.schema).csv(filePath))

      dataset.inputFiles sameElements dataset.dataset.inputFiles
    }

    def propJson[A: TypedEncoder](data: Vector[A]): Prop = {
      val filePath = s"$TEST_OUTPUT_DIR/${UUID.randomUUID()}.json"
      val inputDataset = TypedDataset.create(data)
      inputDataset.dataset.write.json(filePath)

      val dataset = TypedDataset.createUnsafe(
        implicitly[SparkSession].sqlContext.read.schema(inputDataset.schema).json(filePath))

      dataset.inputFiles sameElements dataset.dataset.inputFiles
    }

    check(forAll(propText[String] _))
    check(forAll(propCsv[String] _))
    check(forAll(propJson[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/IntersectTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import math.Ordering

class IntersectTests extends TypedDatasetSuite {
  test("intersect") {
    def prop[A: TypedEncoder : Ordering](data1: Vector[A], data2: Vector[A]): Prop = {
      val dataset1 = TypedDataset.create(data1)
      val dataset2 = TypedDataset.create(data2)
      val datasetIntersect = dataset1.intersect(dataset2).collect().run().toVector

      // Vector `intersect` is the multiset intersection, while Spark throws away duplicates.
      val dataIntersect = data1.intersect(data2).distinct

      // Comparison done with `.sorted` because order is not preserved by Spark for this operation.
      datasetIntersect.sorted ?= dataIntersect.distinct.sorted
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/forward/IsLocalTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

class IsLocalTests extends TypedDatasetSuite {
  test("isLocal") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val dataset = TypedDataset.create(data)

      dataset.isLocal ?= dataset.dataset.isLocal
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/IsStreamingTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

class IsStreamingTests extends TypedDatasetSuite {
  test("isStreaming") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val dataset = TypedDataset.create(data)

      dataset.isStreaming ?= dataset.dataset.isStreaming
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/LimitTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

class LimitTests extends TypedDatasetSuite {
  test("limit") {
    def prop[A: TypedEncoder](data: Vector[A], n: Int): Prop = (n >= 0) ==> {
      val dataset = TypedDataset.create(data).limit(n).collect().run()

      Prop.all(
        dataset.length ?= Math.min(data.length, n),
        dataset.forall(data.contains)
      )
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/forward/QueryExecutionTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop.{forAll, _}

class QueryExecutionTests extends TypedDatasetSuite {
  test("queryExecution") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val dataset = TypedDataset.create[A](data)

      dataset.queryExecution =? dataset.dataset.queryExecution
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/RandomSplitTests.scala
================================================
package frameless

import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Prop._
import org.scalacheck.{Arbitrary, Gen}

import scala.collection.JavaConverters._
import org.scalatest.matchers.should.Matchers

class RandomSplitTests extends TypedDatasetSuite with Matchers {

  val nonEmptyPositiveArray: Gen[Array[Double]] = Gen.nonEmptyListOf(Gen.posNum[Double]).map(_.toArray)

  test("randomSplit(weight, seed)") {
    def prop[A: TypedEncoder : Arbitrary] = forAll(vectorGen[A], nonEmptyPositiveArray, arbitrary[Long]) {
      (data: Vector[A], weights: Array[Double], seed: Long) =>
        val dataset = TypedDataset.create(data)

        dataset.randomSplit(weights, seed).map(_.count().run()) sameElements
          dataset.dataset.randomSplit(weights, seed).map(_.count())
    }

    check(prop[Int])
    check(prop[String])
  }

  test("randomSplitAsList(weight, seed)") {
    def prop[A: TypedEncoder : Arbitrary] = forAll(vectorGen[A], nonEmptyPositiveArray, arbitrary[Long]) {
      (data: Vector[A], weights: Array[Double], seed: Long) =>
        val dataset = TypedDataset.create(data)

        dataset.randomSplitAsList(weights, seed).asScala.map(_.count().run()) sameElements
          dataset.dataset.randomSplitAsList(weights, seed).asScala.map(_.count())
    }

    check(prop[Int])
    check(prop[String])
  }
}


================================================
FILE: dataset/src/test/scala/frameless/forward/SQLContextTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop.{forAll, _}

class SQLContextTests extends TypedDatasetSuite {
  test("sqlContext") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val dataset = TypedDataset.create[A](data)

      dataset.sqlContext =? dataset.dataset.sqlContext
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/forward/SparkSessionTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

class SparkSessionTests extends TypedDatasetSuite {
  test("sparkSession") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val dataset = TypedDataset.create[A](data)

      dataset.sparkSession =? dataset.dataset.sparkSession
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/StorageLevelTests.scala
================================================
package frameless

import org.apache.spark.storage.StorageLevel
import org.apache.spark.storage.StorageLevel._
import org.scalacheck.Prop._
import org.scalacheck.{Arbitrary, Gen}

class StorageLevelTests extends TypedDatasetSuite {

  val storageLevelGen: Gen[StorageLevel] = Gen.oneOf(Seq(NONE, DISK_ONLY, DISK_ONLY_2, MEMORY_ONLY,
    MEMORY_ONLY_2, MEMORY_ONLY_SER, MEMORY_ONLY_SER_2, MEMORY_AND_DISK,
    MEMORY_AND_DISK_2, MEMORY_AND_DISK_SER, MEMORY_AND_DISK_SER_2, OFF_HEAP))

  test("storageLevel") {
    def prop[A: TypedEncoder : Arbitrary] = forAll(vectorGen[A], storageLevelGen) {
      (data: Vector[A], storageLevel: StorageLevel) =>
        val dataset = TypedDataset.create(data)
        if (storageLevel != StorageLevel.NONE)
          dataset.persist(storageLevel)

        dataset.count().run()

        dataset.storageLevel() ?= dataset.dataset.storageLevel
    }

    check(prop[Int])
    check(prop[String])
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/TakeTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import scala.reflect.ClassTag

class TakeTests extends TypedDatasetSuite {
  test("take") {
    def prop[A: TypedEncoder](n: Int, data: Vector[A]): Prop =
      (n >= 0) ==> (TypedDataset.create(data).take(n).run().toVector =? data.take(n))

    def propArray[A: TypedEncoder: ClassTag](n: Int, data: Vector[X1[Array[A]]]): Prop =
      (n >= 0) ==> {
        Prop {
          TypedDataset.create(data).take(n).run().toVector.zip(data.take(n)).forall {
            case (X1(l), X1(r)) => l sameElements r
          }
        }
      }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
    check(forAll(propArray[Int] _))
    check(forAll(propArray[String] _))
    check(forAll(propArray[Byte] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/forward/ToJSONTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._

class ToJSONTests extends TypedDatasetSuite {
  test("toJSON") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val dataset = TypedDataset.create(data)

      dataset.toJSON.collect().run() ?= dataset.dataset.toJSON.collect()
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/forward/ToLocalIteratorTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import scala.collection.JavaConverters._
import org.scalatest.matchers.should.Matchers

class ToLocalIteratorTests extends TypedDatasetSuite with Matchers {
  test("toLocalIterator") {
    def prop[A: TypedEncoder](data: Vector[A]): Prop = {
      val dataset = TypedDataset.create(data)

      dataset.toLocalIterator().run().asScala.toIterator sameElements dataset.dataset.toLocalIterator().asScala.toIterator
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/forward/UnionTests.scala
================================================
package frameless

import org.scalacheck.Prop
import org.scalacheck.Prop._
import shapeless.test.illTyped

class UnionTests extends TypedDatasetSuite {

  test("fail to compile on not aligned schema") {
    val dataset1 = TypedDataset.create(Foo(1, 1) :: Nil)
    val dataset2 = TypedDataset.create(Wrong(1, 1, 1) :: Nil)

    illTyped {
      """val fNew = dataset1 union dataset2 """
    }
  }

  test("Union for simple data types") {
    def prop[A: TypedEncoder](data1: Vector[A], data2: Vector[A]): Prop = {
      val dataset1 = TypedDataset.create(data1)
      val dataset2 = TypedDataset.create(data2)
      val datasetUnion = dataset1.union(dataset2).collect().run().toVector
      val dataUnion = data1.union(data2)

      datasetUnion ?= dataUnion
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }

  test("Align fields for case classes") {
    def prop[A: TypedEncoder, B: TypedEncoder](data1: Vector[(A, B)], data2: Vector[(A, B)]): Prop = {

      val dataset1 = TypedDataset.create(data1.map((Foo.apply[A, B] _).tupled))
      val dataset2 = TypedDataset.create(data2.map { case (a, b) => Bar[A, B](b, a) })
      val datasetUnion = dataset1.union(dataset2).collect().run().map(foo => (foo.x, foo.y)).toVector
      val dataUnion = data1 union data2

      datasetUnion ?= dataUnion
    }

    check(forAll(prop[Int, String] _))
    check(forAll(prop[String, X1[Option[Long]]] _))
  }

  test("Align fields for different number of columns") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder](data1: Vector[(A, B, C)], data2: Vector[(A, B)]): Prop = {

      val dataset1 = TypedDataset.create(data2.map((Foo.apply[A, B] _).tupled))
      val dataset2 = TypedDataset.create(data1.map { case (a, b, c) => Baz[A, B, C](c, b, a) })
      val datasetUnion: Seq[(A, B)] = dataset1.union(dataset2).collect().run().map(foo => (foo.x, foo.y)).toVector
      val dataUnion = data2 union data1.map { case (a, b, _) => (a, b) }

      datasetUnion ?= dataUnion
    }

    check(forAll(prop[Option[Int], String, Array[Long]] _))
    check(forAll(prop[String, X1[Option[Int]], X2[String, Array[Int]]] _))
  }
}

final case class Foo[A, B](x: A, y: B)
final case class Bar[A, B](y: B, x: A)
final case class Baz[A, B, C](z: C, y: B, x: A)
final case class Wrong[A, B, C](a: A, b: B, c: C)

================================================
FILE: dataset/src/test/scala/frameless/forward/WriteStreamTests.scala
================================================
package frameless

import java.util.UUID

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.scalacheck.Prop._
import org.scalacheck.{Arbitrary, Gen, Prop}

class WriteStreamTests extends TypedDatasetSuite {

  val genNested = for {
    d <- Arbitrary.arbitrary[Double]
    as <- Arbitrary.arbitrary[String]
  } yield Nested(d, as)

  val genOptionFieldsOnly = for {
    o1 <- Gen.option(Arbitrary.arbitrary[Int])
    o2 <- Gen.option(genNested)
  } yield OptionFieldsOnly(o1, o2)

  val genWriteExample = for {
    i <- Arbitrary.arbitrary[Int]
    s <- Arbitrary.arbitrary[String]
    on <- Gen.option(genNested)
    ooo <- Gen.option(genOptionFieldsOnly)
  } yield WriteExample(i, s, on, ooo)

  test("write csv") {
    val spark = session
    import spark.implicits._
    def prop[A: TypedEncoder: Encoder](data: List[A]): Prop = {
      val uid = UUID.randomUUID()
      val uidNoHyphens = uid.toString.replace("-", "")
      val filePath = s"$TEST_OUTPUT_DIR/$uid}"
      val checkpointPath = s"$TEST_OUTPUT_DIR/checkpoint/$uid"
      val inputStream = MemoryStream[A]
      val input = TypedDataset.create(inputStream.toDS())
      val inputter = input.writeStream.format("csv").option("checkpointLocation", s"$checkpointPath/input").start(filePath)
      inputStream.addData(data)
      inputter.processAllAvailable()
      val dataset = TypedDataset.createUnsafe(sqlContext.readStream.schema(input.schema).csv(filePath))

      val tester = dataset
        .writeStream
        .option("checkpointLocation", s"$checkpointPath/tester")
        .format("memory")
        .queryName(s"testCsv_$uidNoHyphens")
        .start()
      tester.processAllAvailable()
      val output = spark.table(s"testCsv_$uidNoHyphens").as[A]
      TypedDataset.create(data).collect().run().groupBy(identity) ?= output.collect().groupBy(identity).map { case  (k, arr) => (k, arr.toSeq) }
    }

    check(forAll(Gen.nonEmptyListOf(Gen.alphaNumStr.suchThat(_.nonEmpty)))(prop[String]))
    check(forAll(Gen.nonEmptyListOf(Arbitrary.arbitrary[Int]))(prop[Int]))
  }

  test("write parquet") {
    val spark = session
    import spark.implicits._
    def prop[A: TypedEncoder: Encoder](data: List[A]): Prop = {
      val uid = UUID.randomUUID()
      val uidNoHyphens = uid.toString.replace("-", "")
      val filePath = s"$TEST_OUTPUT_DIR/$uid}"
      val checkpointPath = s"$TEST_OUTPUT_DIR/checkpoint/$uid"
      val inputStream = MemoryStream[A]
      val input = TypedDataset.create(inputStream.toDS())
      val inputter = input.writeStream.format("parquet").option("checkpointLocation", s"$checkpointPath/input").start(filePath)
      inputStream.addData(data)
      inputter.processAllAvailable()
      val dataset = TypedDataset.createUnsafe(sqlContext.readStream.schema(input.schema).parquet(filePath))

      val tester = dataset
        .writeStream
        .option("checkpointLocation", s"$checkpointPath/tester")
        .format("memory")
        .queryName(s"testParquet_$uidNoHyphens")
        .start()
      tester.processAllAvailable()
      val output = spark.table(s"testParquet_$uidNoHyphens").as[A]
      TypedDataset.create(data).collect().run().groupBy(identity) ?= output.collect().groupBy(identity).map { case  (k, arr) => (k, arr.toSeq) }
    }

    check(forAll(Gen.nonEmptyListOf(genWriteExample))(prop[WriteExample]))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/forward/WriteTests.scala
================================================
package frameless

import java.util.UUID

import org.scalacheck.Prop._
import org.scalacheck.{Arbitrary, Gen, Prop}

class WriteTests extends TypedDatasetSuite {

  val genNested = for {
    d <- Arbitrary.arbitrary[Double]
    as <- Arbitrary.arbitrary[String]
  } yield Nested(d, as)

  val genOptionFieldsOnly = for {
    o1 <- Gen.option(Arbitrary.arbitrary[Int])
    o2 <- Gen.option(genNested)
  } yield OptionFieldsOnly(o1, o2)

  val genWriteExample = for {
    i <- Arbitrary.arbitrary[Int]
    s <- Arbitrary.arbitrary[String]
    on <- Gen.option(genNested)
    ooo <- Gen.option(genOptionFieldsOnly)
  } yield WriteExample(i, s, on, ooo)

  test("write csv") {
    def prop[A: TypedEncoder](data: List[A]): Prop = {
      val filePath = s"$TEST_OUTPUT_DIR/${UUID.randomUUID()}"
      val input = TypedDataset.create(data)
      input.write.csv(filePath)

      val dataset = TypedDataset.createUnsafe(sqlContext.read.schema(input.schema).csv(filePath))

      dataset.collect().run().groupBy(identity) ?= input.collect().run().groupBy(identity)
    }

    check(forAll(Gen.listOf(Gen.alphaNumStr.suchThat(_.nonEmpty)))(prop[String]))
    check(forAll(prop[Int] _))
  }

  test("write parquet") {
    def prop[A: TypedEncoder](data: List[A]): Prop = {
      val filePath = s"$TEST_OUTPUT_DIR/${UUID.randomUUID()}"
      val input = TypedDataset.create(data)
      input.write.parquet(filePath)

      val dataset = TypedDataset.createUnsafe(sqlContext.read.schema(input.schema).parquet(filePath))

      dataset.collect().run().groupBy(identity) ?= input.collect().run().groupBy(identity)
    }

    check(forAll(Gen.listOf(genWriteExample))(prop[WriteExample]))
  }
}

case class Nested(i: Double, v: String)
case class OptionFieldsOnly(o1: Option[Int], o2: Option[Nested])
case class WriteExample(i: Int, s: String, on: Option[Nested], ooo: Option[OptionFieldsOnly])


================================================
FILE: dataset/src/test/scala/frameless/functions/AggregateFunctionsTests.scala
================================================
package frameless
package functions

import frameless.{TypedAggregate, TypedColumn}
import frameless.functions.aggregate._
import org.apache.spark.sql.{Column, Encoder}
import org.scalacheck.{Gen, Prop}
import org.scalacheck.Prop._
import org.scalatest.exceptions.GeneratorDrivenPropertyCheckFailedException

class AggregateFunctionsTests extends TypedDatasetSuite {
  def sparkSchema[A: TypedEncoder, U](f: TypedColumn[X1[A], A] => TypedAggregate[X1[A], U]): Prop = {
    val df = TypedDataset.create[X1[A]](Nil)
    val col = f(df.col('a))

    val sumDf = df.agg(col)

    TypedExpressionEncoder.targetStructType(sumDf.encoder) ?= sumDf.dataset.schema
  }

  test("sum") {
    case class Sum4Tests[A, B](sum: Seq[A] => B)

    def prop[A: TypedEncoder, Out: TypedEncoder : Numeric](xs: List[A])(
      implicit
      summable: CatalystSummable[A, Out],
      summer: Sum4Tests[A, Out]
    ): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)

      val datasetSum: List[Out] = dataset.agg(sum(A)).collect().run().toList

      datasetSum match {
        case x :: Nil => approximatelyEqual(summer.sum(xs), x)
        case other => falsified
      }
    }

    // Replicate Spark's behaviour : Ints and Shorts are cast to Long
    // https://github.com/apache/spark/blob/7eb2ca8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L37
    implicit def summerDecimal = Sum4Tests[BigDecimal, BigDecimal](_.sum)
    implicit def summerDouble = Sum4Tests[Double, Double](_.sum)
    implicit def summerLong = Sum4Tests[Long, Long](_.sum)
    implicit def summerInt = Sum4Tests[Int, Long](_.map(_.toLong).sum)
    implicit def summerShort = Sum4Tests[Short, Long](_.map(_.toLong).sum)

    check(forAll(prop[BigDecimal, BigDecimal] _))
    check(forAll(prop[Long, Long] _))
    check(forAll(prop[Double, Double] _))
    check(forAll(prop[Int, Long] _))
    check(forAll(prop[Short, Long] _))

    check(sparkSchema[BigDecimal, BigDecimal](sum))
    check(sparkSchema[Long, Long](sum))
    check(sparkSchema[Int, Long](sum))
    check(sparkSchema[Double, Double](sum))
    check(sparkSchema[Short, Long](sum))
  }

  test("sumDistinct") {
    case class Sum4Tests[A, B](sum: Seq[A] => B)

    def prop[A: TypedEncoder, Out: TypedEncoder : Numeric](xs: List[A])(
      implicit
      summable: CatalystSummable[A, Out],
      summer: Sum4Tests[A, Out]
    ): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)

      val datasetSum: List[Out] = dataset.agg(sumDistinct(A)).collect().run().toList

      datasetSum match {
        case x :: Nil => approximatelyEqual(summer.sum(xs), x)
        case other => falsified
      }
    }

    // Replicate Spark's behaviour : Ints and Shorts are cast to Long
    // https://github.com/apache/spark/blob/7eb2ca8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L37
    implicit def summerLong = Sum4Tests[Long, Long](_.toSet.sum)
    implicit def summerInt = Sum4Tests[Int, Long]( x => x.toSet.map((_:Int).toLong).sum)
    implicit def summerShort = Sum4Tests[Short, Long](x => x.toSet.map((_:Short).toLong).sum)

    check(forAll(prop[Long, Long] _))
    check(forAll(prop[Int, Long] _))
    check(forAll(prop[Short, Long] _))

    check(sparkSchema[Long, Long](sum))
    check(sparkSchema[Int, Long](sum))
    check(sparkSchema[Short, Long](sum))
  }

  test("avg") {
    case class Averager4Tests[A, B](avg: Seq[A] => B)

    def prop[A: TypedEncoder, Out: TypedEncoder : Numeric](xs: List[A])(
      implicit
      averageable: CatalystAverageable[A, Out],
      averager: Averager4Tests[A, Out]
    ): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)

      val datasetAvg: Vector[Out] = dataset.agg(avg(A)).collect().run().toVector

      if (datasetAvg.size > 2) falsified
      else xs match {
        case Nil => datasetAvg ?= Vector()
        case _ :: _ => datasetAvg.headOption match {
          case Some(x) => approximatelyEqual(averager.avg(xs), x)
          case None => falsified
        }
      }
    }

    // Replicate Spark's behaviour : If the datatype isn't BigDecimal cast type to Double
    // https://github.com/apache/spark/blob/7eb2ca8/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala#L50
    implicit def averageDecimal = Averager4Tests[BigDecimal, BigDecimal](as => as.sum/as.size)
    implicit def averageDouble = Averager4Tests[Double, Double](as => as.sum/as.size)
    implicit def averageLong = Averager4Tests[Long, Double](as => as.map(_.toDouble).sum/as.size)
    implicit def averageInt = Averager4Tests[Int, Double](as => as.map(_.toDouble).sum/as.size)
    implicit def averageShort = Averager4Tests[Short, Double](as => as.map(_.toDouble).sum/as.size)

    /* under 3.4 an oddity was detected:
    Falsified after 2 successful property evaluations.
    Location: (AggregateFunctionsTests.scala:127)
    [info]     Occurred when passed generated values (
    [info]       arg0 = List("-1", "9223372036854775807", "-9223372036854775808")
    [info]     )
    which is odd given it's strings and not the Long's that should have been there, but also not seemingly reproducible with just longs
     */
    tolerantRun(_.isInstanceOf[GeneratorDrivenPropertyCheckFailedException]) {
      check(forAll(prop[BigDecimal, BigDecimal] _))
      check(forAll(prop[Double, Double] _))
      check(forAll(prop[Long, Double] _))
      check(forAll(prop[Int, Double] _))
      check(forAll(prop[Short, Double] _))
    }
  }

  test("stddev and variance") {
    def prop[A: TypedEncoder : CatalystVariance : Numeric](xs: List[A]): Prop = {
      val numeric = implicitly[Numeric[A]]
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)

      val datasetStdOpt = dataset.agg(stddev(A)).collect().run().toVector.headOption
      val datasetVarOpt = dataset.agg(variance(A)).collect().run().toVector.headOption

      val std = sc.parallelize(xs.map(implicitly[Numeric[A]].toDouble)).sampleStdev()
      val `var` = sc.parallelize(xs.map(implicitly[Numeric[A]].toDouble)).sampleVariance()

      (datasetStdOpt, datasetVarOpt) match {
        case (Some(datasetStd), Some(datasetVar)) =>
          approximatelyEqual(datasetStd, std) && approximatelyEqual(datasetVar, `var`)
        case _ => proved
      }
    }

    check(forAll(prop[Short] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Double] _))
  }

  test("litAggr") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder](xs: List[A], b: B, c: C): Prop = {
      val dataset = TypedDataset.create(xs)
      val (r1, rb, rc, rcount) = dataset.agg(count().lit(1), litAggr(b), litAggr(c), count()).collect().run().head
      (rcount ?= xs.size.toLong) && (r1 ?= 1) && (rb ?= b) && (rc ?= c)
    }

    check(forAll(prop[Boolean, Int, String] _))
    check(forAll(prop[Option[Boolean], Vector[Option[Vector[Char]]], Long] _))
  }

  test("count") {
    def prop[A: TypedEncoder](xs: List[A]): Prop = {
      val dataset = TypedDataset.create(xs)
      val Vector(datasetCount) = dataset.agg(count()).collect().run().toVector

      datasetCount ?= xs.size.toLong
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Byte] _))
  }

  test("count('a)") {
    def prop[A: TypedEncoder](xs: List[A]): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)
      val datasetCount = dataset.agg(count(A)).collect().run()

      datasetCount ?= List(xs.size.toLong)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Byte] _))
  }

  test("max") {
    def prop[A: TypedEncoder: CatalystOrdered](xs: List[A])(implicit o: Ordering[A]): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)
      val datasetMax = dataset.agg(max(A)).collect().run().toList

      datasetMax ?= xs.reduceOption[A](o.max).toList
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[String] _))
  }

  test("max with follow up multiplication") {
    def prop(xs: List[Long]): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[Long]('a)
      val datasetMax = dataset.agg(max(A) * 2).collect().run().headOption

      datasetMax ?= (if(xs.isEmpty) None else Some(xs.max * 2))
    }

    check(forAll(prop _))
  }

  test("min") {
    def prop[A: TypedEncoder: CatalystOrdered](xs: List[A])(implicit o: Ordering[A]): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)

      val datasetMin = dataset.agg(min(A)).collect().run().toList

      datasetMin ?= xs.reduceOption[A](o.min).toList
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[String] _))
  }

  test("first") {
    def prop[A: TypedEncoder](xs: List[A]): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)

      val datasetFirst = dataset.agg(first(A)).collect().run().toList

      datasetFirst ?= xs.headOption.toList
    }

    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[String] _))
  }

  test("last") {
    def prop[A: TypedEncoder](xs: List[A]): Prop = {
      val dataset = TypedDataset.create(xs.map(X1(_)))
      val A = dataset.col[A]('a)

      val datasetLast = dataset.agg(last(A)).collect().run().toList

      datasetLast ?= xs.lastOption.toList
    }

    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[String] _))
  }

  // Generator for simplified and focused aggregation examples
  def getLowCardinalityKVPairs: Gen[Vector[(Int, Int)]] = {
    val kvPairGen: Gen[(Int, Int)] = for {
      k <- Gen.const(1) // key
      v <- Gen.choose(10, 100) // values
    } yield (k, v)

    Gen.listOfN(200, kvPairGen).map(_.toVector)
  }

  test("countDistinct") {
    check {
      forAll(getLowCardinalityKVPairs) { xs: Vector[(Int, Int)] =>
        val tds = TypedDataset.create(xs)
        val tdsRes: Seq[(Int, Long)] = tds.groupBy(tds('_1)).agg(countDistinct(tds('_2))).collect().run()
        tdsRes.toMap ?= xs.groupBy(_._1).mapValues(_.map(_._2).distinct.size.toLong).toSeq.toMap
      }
    }
  }

  test("approxCountDistinct") {
    // Simple version of #approximatelyEqual()
    // Default maximum estimation error of HyperLogLog in Spark is 5%
    def approxEqual(actual: Long, estimated: Long, allowedDeviationPercentile: Double = 0.05): Boolean = {
      val delta: Long = Math.abs(actual - estimated)
      delta / actual.toDouble < allowedDeviationPercentile * 2
    }

    check {
      forAll(getLowCardinalityKVPairs) { xs: Vector[(Int, Int)] =>
        val tds = TypedDataset.create(xs)
        val tdsRes: Seq[(Int, Long, Long)] =
          tds.groupBy(tds('_1)).agg(countDistinct(tds('_2)), approxCountDistinct(tds('_2))).collect().run()
        tdsRes.forall { case (_, v1, v2) => approxEqual(v1, v2) }
      }
    }

    check {
      forAll(getLowCardinalityKVPairs) { xs: Vector[(Int, Int)] =>
        val tds = TypedDataset.create(xs)
        val allowedError = 0.1 // 10%
        val tdsRes: Seq[(Int, Long, Long)] =
          tds.groupBy(tds('_1)).agg(countDistinct(tds('_2)), approxCountDistinct(tds('_2), allowedError)).collect().run()
        tdsRes.forall { case (_, v1, v2) => approxEqual(v1, v2, allowedError) }
      }
    }
  }

  test("collectList") {
    def prop[A: TypedEncoder : Ordering](xs: List[X2[A, A]]): Prop = {
      val tds = TypedDataset.create(xs)
      val tdsRes: Seq[(A, Vector[A])] = tds.groupBy(tds('a)).agg(collectList(tds('b))).collect().run()

      tdsRes.toMap.map { case (k, v) => k -> v.sorted } ?= xs.groupBy(_.a).map { case (k, v) => k -> v.map(_.b).toVector.sorted }
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[String] _))
  }

  test("collectSet") {
    def prop[A: TypedEncoder : Ordering](xs: List[X2[A, A]]): Prop = {
      val tds = TypedDataset.create(xs)
      val tdsRes: Seq[(A, Vector[A])] = tds.groupBy(tds('a)).agg(collectSet(tds('b))).collect().run()

      tdsRes.toMap.map { case (k, v) => k -> v.toSet } ?= xs.groupBy(_.a).map { case (k, v) => k -> v.map(_.b).toSet }
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[String] _))
  }

  test("lit") {
    def prop[A: TypedEncoder](xs: List[X1[A]], l: A): Prop = {
      val tds = TypedDataset.create(xs)
      tds.select(tds('a), lit(l)).collect().run() ?= xs.map(x => (x.a, l))
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Vector[Vector[Int]]] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Vector[Byte]] _))
    check(forAll(prop[String] _))
    check(forAll(prop[Vector[Long]] _))
    check(forAll(prop[BigDecimal] _))
  }


  def bivariatePropTemplate[A: TypedEncoder, B: TypedEncoder]
  (
    xs: List[X3[Int, A, B]]
  )
  (
    framelessFun: (TypedColumn[X3[Int, A, B], A], TypedColumn[X3[Int, A, B], B]) => TypedAggregate[X3[Int, A, B], Option[Double]],
    sparkFun: (Column, Column) => Column
  )
  (
    implicit
    encEv: Encoder[(Int, A, B)],
    encEv2: Encoder[(Int,Option[Double])],
    evCanBeDoubleA: CatalystCast[A, Double],
    evCanBeDoubleB: CatalystCast[B, Double]
  ): Prop = {

    val tds = TypedDataset.create(xs)
    // Typed implementation of bivar stats function
    val tdBivar = tds.groupBy(tds('a)).agg(framelessFun(tds('b), tds('c))).deserialized.map(kv =>
      (kv._1, kv._2.flatMap(DoubleBehaviourUtils.nanNullHandler))
    ).collect().run()

    val cDF = session.createDataset(xs.map(x => (x.a, x.b, x.c)))
    // Comparison implementation of bivar stats functions
    val compBivar = cDF
      .groupBy(cDF("_1"))
      .agg(sparkFun(cDF("_2"), cDF("_3")))
      .map(
        row => {
          val grp = row.getInt(0)
          (grp, DoubleBehaviourUtils.nanNullHandler(row.get(1)))
        }
      )

    // Should be the same
    tdBivar.toMap ?= compBivar.collect().toMap
  }

  def univariatePropTemplate[A: TypedEncoder]
  (
    xs: List[X2[Int, A]]
  )
  (
    framelessFun: (TypedColumn[X2[Int, A], A]) => TypedAggregate[X2[Int, A], Option[Double]],
    sparkFun: (Column) => Column
  )
  (
    implicit
    encEv: Encoder[(Int, A)],
    encEv2: Encoder[(Int,Option[Double])],
    evCanBeDoubleA: CatalystCast[A, Double]
  ): Prop = {

    val tds = TypedDataset.create(xs)
    //typed implementation of univariate stats function
    val tdUnivar = tds.groupBy(tds('a)).agg(framelessFun(tds('b))).deserialized.map(kv =>
      (kv._1, kv._2.flatMap(DoubleBehaviourUtils.nanNullHandler))
    ).collect().run()

    val cDF = session.createDataset(xs.map(x => (x.a, x.b)))
    // Comparison implementation of bivar stats functions
    val compUnivar = cDF
      .groupBy(cDF("_1"))
      .agg(sparkFun(cDF("_2")))
      .map(
        row => {
          val grp = row.getInt(0)
          (grp, DoubleBehaviourUtils.nanNullHandler(row.get(1)))
        }
      )

    // Should be the same
    tdUnivar.toMap ?= compUnivar.collect().toMap
  }

  test("corr") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder, B: TypedEncoder](xs: List[X3[Int, A, B]])(
      implicit
      encEv: Encoder[(Int, A, B)],
      evCanBeDoubleA: CatalystCast[A, Double],
      evCanBeDoubleB: CatalystCast[B, Double]
    ): Prop = bivariatePropTemplate(xs)(corr[A,B,X3[Int, A, B]],org.apache.spark.sql.functions.corr)

    check(forAll(prop[Double, Double] _))
    check(forAll(prop[Double, Int] _))
    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Short, Int] _))
    check(forAll(prop[BigDecimal, Byte] _))
  }

  test("covar_pop") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder, B: TypedEncoder](xs: List[X3[Int, A, B]])(
      implicit
      encEv: Encoder[(Int, A, B)],
      evCanBeDoubleA: CatalystCast[A, Double],
      evCanBeDoubleB: CatalystCast[B, Double]
    ): Prop = bivariatePropTemplate(xs)(
      covarPop[A, B, X3[Int, A, B]],
      org.apache.spark.sql.functions.covar_pop
    )

    check(forAll(prop[Double, Double] _))
    check(forAll(prop[Double, Int] _))
    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Short, Int] _))
    check(forAll(prop[BigDecimal, Byte] _))
  }

  test("covar_samp") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder, B: TypedEncoder](xs: List[X3[Int, A, B]])(
      implicit
      encEv: Encoder[(Int, A, B)],
      evCanBeDoubleA: CatalystCast[A, Double],
      evCanBeDoubleB: CatalystCast[B, Double]
    ): Prop = bivariatePropTemplate(xs)(
      covarSamp[A, B, X3[Int, A, B]],
      org.apache.spark.sql.functions.covar_samp
    )

    check(forAll(prop[Double, Double] _))
    check(forAll(prop[Double, Int] _))
    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Short, Int] _))
    check(forAll(prop[BigDecimal, Byte] _))
  }

  test("kurtosis") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder](xs: List[X2[Int, A]])(
      implicit
      encEv: Encoder[(Int, A)],
      evCanBeDoubleA: CatalystCast[A, Double]
    ): Prop = univariatePropTemplate(xs)(
      kurtosis[A, X2[Int, A]],
      org.apache.spark.sql.functions.kurtosis
    )

    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
  }

  test("skewness") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder](xs: List[X2[Int, A]])(
      implicit
      encEv: Encoder[(Int, A)],
      evCanBeDoubleA: CatalystCast[A, Double]
    ): Prop = univariatePropTemplate(xs)(
      skewness[A, X2[Int, A]],
      org.apache.spark.sql.functions.skewness
    )

    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
  }

  test("stddev_pop") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder](xs: List[X2[Int, A]])(
      implicit
      encEv: Encoder[(Int, A)],
      evCanBeDoubleA: CatalystCast[A, Double]
    ): Prop = univariatePropTemplate(xs)(
      stddevPop[A, X2[Int, A]],
      org.apache.spark.sql.functions.stddev_pop
    )

    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
  }

  test("stddev_samp") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder](xs: List[X2[Int, A]])(
      implicit
      encEv: Encoder[(Int, A)],
      evCanBeDoubleA: CatalystCast[A, Double]
    ): Prop = univariatePropTemplate(xs)(
      stddevSamp[A, X2[Int, A]],
      org.apache.spark.sql.functions.stddev_samp
    )
    check(forAll(prop[Double] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/functions/DateTimeStringBehaviourUtils.scala
================================================
package frameless.functions

import org.apache.spark.sql.Row

object DateTimeStringBehaviourUtils {
  val nullHandler: Row => Option[Int] = _.get(0) match {
    case i: Int => Some(i)
    case _ => None
  }
}


================================================
FILE: dataset/src/test/scala/frameless/functions/DoubleBehaviourUtils.scala
================================================
package frameless
package functions

/**
  * Some statistical functions in Spark can result in Double, Double.NaN or Null.
  * This tends to break ?= of the property based testing. Use the nanNullHandler function
  * here to alleviate this by mapping this NaN and Null to None. This will result in
  * functioning comparison again.
  */
object DoubleBehaviourUtils {
  // Mapping with this function is needed because spark uses Double.NaN for some semantics in the
  // correlation function. ?= for prop testing will use == underlying and will break because Double.NaN != Double.NaN
  private val nanHandler: Double => Option[Double] = value => if (!value.equals(Double.NaN)) Option(value) else None
  // Making sure that null => None and does not result in 0.0d because of row.getAs[Double]'s use of .asInstanceOf
  val nanNullHandler: Any => Option[Double] = {
    case null => None
    case d: Double => nanHandler(d)
    case _ => ???
  }
}


================================================
FILE: dataset/src/test/scala/frameless/functions/NonAggregateFunctionsTests.scala
================================================
package frameless
package functions

import java.io.File
import java.util.Base64
import java.nio.charset.StandardCharsets

import frameless.functions.nonAggregate._
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.{Column, Encoder, SaveMode, functions => sparkFunctions}
import org.scalacheck.Prop._
import org.scalacheck.{Arbitrary, Gen, Prop}

import scala.annotation.nowarn

class NonAggregateFunctionsTests extends TypedDatasetSuite {
  val testTempFiles = "target/testoutput"

  object NonNegativeGenerators {
    val doubleGen = for {
      s <-  Gen.chooseNum(1, Int.MaxValue)
      e <-  Gen.chooseNum(1, Int.MaxValue)
      res: Double = s.toDouble / e.toDouble
    } yield res

    val intGen:   Gen[Int]   = Gen.chooseNum(1, Int.MaxValue)
    val shortGen: Gen[Short] = Gen.chooseNum(1, Short.MaxValue)
    val longGen:  Gen[Long]  = Gen.chooseNum(1, Long.MaxValue)
    val byteGen:  Gen[Byte]  = Gen.chooseNum(1, Byte.MaxValue)
  }

  object NonNegativeArbitraryNumericValues {
    import NonNegativeGenerators._
    implicit val arbInt:        Arbitrary[Int]        = Arbitrary(intGen)
    implicit val arbDouble:     Arbitrary[Double]     = Arbitrary(doubleGen)
    implicit val arbLong:       Arbitrary[Long]       = Arbitrary(longGen)
    implicit val arbShort:      Arbitrary[Short]      = Arbitrary(shortGen)
    implicit val arbByte:       Arbitrary[Byte]       = Arbitrary(byteGen)
  }

  private val base64Encoder = Base64.getEncoder
  private def base64X1String(x1: X1[String]): X1[String] = {
    def base64(str: String): String = base64Encoder.encodeToString(str.getBytes(StandardCharsets.UTF_8))
    x1.copy(a = base64(x1.a))
  }

  override def afterAll(): Unit = {
    FileUtils.deleteDirectory(new File(testTempFiles))
    super.afterAll()
  }

  test("negate") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder](values: List[X1[A]])(
      implicit encX1:Encoder[X1[A]],
      catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, B]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.negate(cDS("a")))
        .map(_.getAs[B](0))
        .collect()
        .toList

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(negate(col))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Byte, Byte] _))
    check(forAll(prop[Short, Short] _))
    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Long, Long]  _))
    check(forAll(prop[BigDecimal, java.math.BigDecimal] _))
  }

  test("not") {
    val spark = session
    import spark.implicits._

    def prop(values: List[X1[Boolean]], fromBase: Int, toBase: Int)(implicit encX1:Encoder[X1[Boolean]]) = {
      val cDS = session.createDataset(values)

      val resCompare = cDS
        .select(sparkFunctions.not(cDS("a")))
        .map(_.getAs[Boolean](0))
        .collect()
        .toList

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(not(col))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop _))
  }

  test("conv") {
    val spark = session
    import spark.implicits._

    def prop(values: List[X1[String]], fromBase: Int, toBase: Int)(implicit encX1:Encoder[X1[String]]) = {
      val cDS = session.createDataset(values)

      val resCompare = cDS
        .select(sparkFunctions.conv(cDS("a"), fromBase, toBase))
        .map(_.getAs[String](0))
        .collect()
        .toList

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(conv(col, fromBase, toBase))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop _))
  }

  test("degrees") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.degrees(cDS("a")))
        .map(_.getAs[Double](0))
        .collect()
        .toList

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(degrees(col))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Byte] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Long]  _))
    check(forAll(prop[BigDecimal] _))
  }

  def propBitShift[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder](typedDS: TypedDataset[X1[A]])
    (typedCol: TypedColumn[X1[A], B], sparkFunc: (Column,Int) => Column, numBits: Int): Prop = {
    val spark = session
    import spark.implicits._

    val resCompare = typedDS.dataset
      .select(sparkFunc($"a", numBits))
      .map(_.getAs[B](0))
      .collect()
      .toList

    val res = typedDS
      .select(typedCol)
      .collect()
      .run()
      .toList

    res ?= resCompare
  }

  test("shiftRightUnsigned") {
    val spark = session
    import spark.implicits._

    @nowarn // supress sparkFunctions.shiftRightUnsigned call which is used to maintain Spark 3.1.x backwards compat
    def prop[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder]
    (values: List[X1[A]], numBits: Int)
    (implicit catalystBitShift: CatalystBitShift[A, B], encX1: Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      propBitShift(typedDS)(shiftRightUnsigned(typedDS('a), numBits), sparkFunctions.shiftRightUnsigned, numBits)
    }

    check(forAll(prop[Byte, Int] _))
    check(forAll(prop[Short, Int] _))
    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Long, Long] _))
    check(forAll(prop[BigDecimal, Int] _))
  }

  test("shiftRight") {
    val spark = session
    import spark.implicits._

    @nowarn // supress sparkFunctions.shiftRight call which is used to maintain Spark 3.1.x backwards compat
    def prop[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder]
    (values: List[X1[A]], numBits: Int)
    (implicit catalystBitShift: CatalystBitShift[A, B], encX1: Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      propBitShift(typedDS)(shiftRight(typedDS('a), numBits), sparkFunctions.shiftRight, numBits)
    }

    check(forAll(prop[Byte, Int] _))
    check(forAll(prop[Short, Int] _))
    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Long, Long] _))
    check(forAll(prop[BigDecimal, Int] _))
  }

  test("shiftLeft") {
    val spark = session
    import spark.implicits._

    @nowarn // supress sparkFunctions.shiftLeft call which is used to maintain Spark 3.1.x backwards compat
    def prop[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder]
    (values: List[X1[A]], numBits: Int)
    (implicit catalystBitShift: CatalystBitShift[A, B], encX1: Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      propBitShift(typedDS)(shiftLeft(typedDS('a), numBits), sparkFunctions.shiftLeft, numBits)
    }

    check(forAll(prop[Byte, Int] _))
    check(forAll(prop[Short, Int] _))
    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Long, Long] _))
    check(forAll(prop[BigDecimal, Int] _))
  }

  test("ceil") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder]
    (values: List[X1[A]])(
      implicit catalystAbsolute: CatalystRound[A, B], encX1: Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.ceil(cDS("a")))
        .map(_.getAs[B](0))
        .collect()
        .toList.map{
          case bigDecimal : java.math.BigDecimal => bigDecimal.setScale(0)
          case other => other
        }.asInstanceOf[List[B]]


      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(ceil(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int, Long] _))
    check(forAll(prop[Long, Long] _))
    check(forAll(prop[Short, Long] _))
    check(forAll(prop[Double, Long] _))
    check(forAll(prop[BigDecimal, java.math.BigDecimal] _))
  }

  test("sha2") {
    val spark = session
    import spark.implicits._

    def prop(values: List[X1[Array[Byte]]])(implicit encX1: Encoder[X1[Array[Byte]]]) = {
      Seq(224, 256, 384, 512).map { numBits =>
        val cDS = session.createDataset(values)
        val resCompare = cDS
          .select(sparkFunctions.sha2(cDS("a"), numBits))
          .map(_.getAs[String](0))
          .collect().toList

        val typedDS = TypedDataset.create(values)
        val res = typedDS
          .select(sha2(typedDS('a), numBits))
          .collect()
          .run()
          .toList
        res ?= resCompare
      }.reduce(_ && _)
    }

    check(forAll(prop _))
  }

  test("sha1") {
    val spark = session
    import spark.implicits._

    def prop(values: List[X1[Array[Byte]]])(implicit encX1: Encoder[X1[Array[Byte]]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.sha1(cDS("a")))
        .map(_.getAs[String](0))
        .collect().toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(sha1(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop _))
  }

  test("crc32") {
    val spark = session
    import spark.implicits._

    def prop(values: List[X1[Array[Byte]]])(implicit encX1: Encoder[X1[Array[Byte]]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.crc32(cDS("a")))
        .map(_.getAs[Long](0))
        .collect()
        .toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(crc32(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop _))
  }

  test("floor") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder, B: TypedEncoder : Encoder]
    (values: List[X1[A]])(
      implicit catalystAbsolute: CatalystRound[A, B], encX1: Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.floor(cDS("a")))
        .map(_.getAs[B](0))
        .collect()
        .toList.map{
        case bigDecimal : java.math.BigDecimal => bigDecimal.setScale(0)
        case other => other
      }.asInstanceOf[List[B]]


      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(floor(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }
    check(forAll(prop[Int, Long] _))
    check(forAll(prop[Long, Long] _))
    check(forAll(prop[Short, Long] _))
    check(forAll(prop[Double, Long] _))
    check(forAll(prop[BigDecimal, java.math.BigDecimal] _))
  }


  test("abs big decimal") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder: Encoder, B: TypedEncoder: Encoder]
      (values: List[X1[A]])
      (
        implicit catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, B],
        encX1:Encoder[X1[A]]
      )= {
        val cDS = session.createDataset(values)
        val resCompare = cDS
          .select(sparkFunctions.abs(cDS("a")))
          .map(_.getAs[B](0))
          .collect().toList

        val typedDS = TypedDataset.create(values)
        val col = typedDS('a)
        val res = typedDS
          .select(
            abs(col)
          )
          .collect()
          .run()
          .toList

        res ?= resCompare
      }

    check(forAll(prop[BigDecimal, java.math.BigDecimal] _))
  }

  test("abs") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder]
    (values: List[X1[A]])
    (
      implicit catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, A],
      encX1: Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.abs(cDS("a")))
        .map(_.getAs[A](0))
        .collect().toList


      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(abs(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Double] _))
  }

  def propTrigonometric[A: CatalystNumeric: TypedEncoder : Encoder](typedDS: TypedDataset[X1[A]])
    (typedCol: TypedColumn[X1[A], Double], sparkFunc: Column => Column): Prop = {
      val spark = session
      import spark.implicits._

      val resCompare = typedDS.dataset
        .select(sparkFunc($"a"))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res = typedDS
        .select(typedCol)
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      res ?= resCompare
  }

  test("cos") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(cos(typedDS('a)), sparkFunctions.cos)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("cosh") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(cosh(typedDS('a)), sparkFunctions.cosh)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("acos") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(acos(typedDS('a)), sparkFunctions.acos)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }


  test("signum") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(signum(typedDS('a)), sparkFunctions.signum)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("sin") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(sin(typedDS('a)), sparkFunctions.sin)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("sinh") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(sinh(typedDS('a)), sparkFunctions.sinh)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("asin") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(asin(typedDS('a)), sparkFunctions.asin)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("tan") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(tan(typedDS('a)), sparkFunctions.tan)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("tanh") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])
      (implicit encX1:Encoder[X1[A]]) = {
        val typedDS = TypedDataset.create(values)
        propTrigonometric(typedDS)(tanh(typedDS('a)), sparkFunctions.tanh)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

   /*
    * Currently not all Collection types play nice with the Encoders.
    * This test needs to be readressed and Set readded to the Collection Typeclass once these issues are resolved.
    *
    * [[https://issues.apache.org/jira/browse/SPARK-18891]]
    * [[https://issues.apache.org/jira/browse/SPARK-21204]]
    */
  test("arrayContains"){
    val spark = session
    import spark.implicits._

    val listLength = 10
    val idxs = Stream.continually(Range(0, listLength)).flatten.toIterator

    abstract class Nth[A, C[A]:CatalystCollection] {

      def nth(c:C[A], idx:Int):A
    }

    implicit def deriveListNth[A] : Nth[A, List] = new Nth[A, List] {
      override def nth(c: List[A], idx: Int): A = c(idx)
    }

    implicit def deriveSeqNth[A] : Nth[A, Seq] = new Nth[A, Seq] {
      override def nth(c: Seq[A], idx: Int): A = c(idx)
    }

    implicit def deriveVectorNth[A] : Nth[A, Vector] = new Nth[A, Vector] {
      override def nth(c: Vector[A], idx: Int): A = c(idx)
    }

    implicit def deriveArrayNth[A] : Nth[A, Array] = new Nth[A, Array] {
      override def nth(c: Array[A], idx: Int): A = c(idx)
    }


    def prop[C[_] : CatalystCollection]
      (
        values: C[Int],
        shouldBeIn:Boolean)
      (
        implicit nth:Nth[Int, C],
        encEv: Encoder[C[Int]],
        tEncEv: TypedEncoder[C[Int]]
      ) = {

      val contained = if (shouldBeIn) nth.nth(values, idxs.next) else -1

      val cDS = session.createDataset(List(values))
      val resCompare = cDS
        .select(sparkFunctions.array_contains(cDS("value"), contained))
        .map(_.getAs[Boolean](0))
        .collect().toList

      val typedDS = TypedDataset.create(List(X1(values)))
      val res = typedDS
        .select(arrayContains(typedDS('a), contained))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(
      forAll(
        Gen.listOfN(listLength, Gen.choose(0,100)),
        Gen.oneOf(true,false)
      )
      (prop[List])
    )

    /*check( Looks like there is no Typed Encoder for Seq type yet
      forAll(
        Gen.listOfN(listLength, Gen.choose(0,100)),
        Gen.oneOf(true,false)
      )
      (prop[Seq])
    )*/

    check(
      forAll(
        Gen.listOfN(listLength, Gen.choose(0,100)).map(_.toVector),
        Gen.oneOf(true,false)
      )
      (prop[Vector])
    )

    check(
      forAll(
        Gen.listOfN(listLength, Gen.choose(0,100)).map(_.toArray),
        Gen.oneOf(true,false)
      )
      (prop[Array])
    )
  }

  test("atan") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder]
    (na: A, values: List[X1[A]])(implicit encX1: Encoder[X1[A]]) = {
      val cDS = session.createDataset(X1(na) :: values)
      val resCompare = cDS
        .select(sparkFunctions.atan(cDS("a")))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val typedDS = TypedDataset.create(cDS)
      val res = typedDS
        .select(atan(typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      val aggrTyped = typedDS.agg(atan(
        frameless.functions.aggregate.first(typedDS('a)))
      ).firstOption().run().get

      val aggrSpark = cDS.select(
        sparkFunctions.atan(sparkFunctions.first("a")).as[Double]
      ).first()

      (res ?= resCompare).&&(aggrTyped ?= aggrSpark)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("atan2") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder,
             B: CatalystNumeric : TypedEncoder : Encoder](na: X2[A, B], values: List[X2[A, B]])
            (implicit encEv: Encoder[X2[A,B]]) = {
      val cDS = session.createDataset(na +: values)
      val resCompare = cDS
        .select(sparkFunctions.atan2(cDS("a"), cDS("b")))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList


      val typedDS = TypedDataset.create(cDS)
      val res = typedDS
        .select(atan2(typedDS('a), typedDS('b)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      val aggrTyped = typedDS.agg(atan2(
        frameless.functions.aggregate.first(typedDS('a)),
        frameless.functions.aggregate.first(typedDS('b)))
      ).firstOption().run().get

      val aggrSpark = cDS.select(
        sparkFunctions.atan2(sparkFunctions.first("a"),sparkFunctions.first("b")).as[Double]
      ).first()

      (res ?= resCompare).&&(aggrTyped ?= aggrSpark)
    }


    check(forAll(prop[Int, Long] _))
    check(forAll(prop[Long, Int] _))
    check(forAll(prop[Short, Byte] _))
    check(forAll(prop[BigDecimal, Double] _))
    check(forAll(prop[Byte, Int] _))
    check(forAll(prop[Double, Double] _))
  }

  test("atan2LitLeft") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder]
    (na: X1[A], value: List[X1[A]], lit:Double)(implicit encX1:Encoder[X1[A]]) = {
      val cDS = session.createDataset(na +: value)
      val resCompare = cDS
        .select(sparkFunctions.atan2(lit, cDS("a")))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList


      val typedDS = TypedDataset.create(cDS)
      val res = typedDS
        .select(atan2(lit, typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      val aggrTyped = typedDS.agg(atan2(
        lit,
        frameless.functions.aggregate.first(typedDS('a)))
      ).firstOption().run().get

      val aggrSpark = cDS.select(
        sparkFunctions.atan2(lit, sparkFunctions.first("a")).as[Double]
      ).first()

      (res ?= resCompare).&&(aggrTyped ?= aggrSpark)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("atan2LitRight") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder]
    (na: X1[A], value: List[X1[A]], lit:Double)(implicit encX1:Encoder[X1[A]]) = {
      val cDS = session.createDataset(na +: value)
      val resCompare = cDS
        .select(sparkFunctions.atan2(cDS("a"), lit))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList


      val typedDS = TypedDataset.create(cDS)
      val res = typedDS
        .select(atan2(typedDS('a), lit))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      val aggrTyped = typedDS.agg(atan2(
        frameless.functions.aggregate.first(typedDS('a)),
        lit)
      ).firstOption().run().get

      val aggrSpark = cDS.select(
        sparkFunctions.atan2(sparkFunctions.first("a"), lit).as[Double]
      ).first()

      (res ?= resCompare).&&(aggrTyped ?= aggrSpark)
    }


    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  def mathProp[A: CatalystNumeric: TypedEncoder : Encoder](typedDS: TypedDataset[X1[A]])(
    typedCol: TypedColumn[X1[A], Double], sparkFunc: Column => Column
  ): Prop = {
    val spark = session
    import spark.implicits._

    val resCompare = typedDS.dataset
      .select(sparkFunc($"a"))
      .map(_.getAs[Double](0))
      .map(DoubleBehaviourUtils.nanNullHandler)
      .collect().toList

    val res = typedDS
      .select(typedCol)
      .deserialized
      .map(DoubleBehaviourUtils.nanNullHandler)
      .collect()
      .run()
      .toList

    res ?= resCompare
  }

  test("sqrt") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(sqrt(typedDS('a)), sparkFunctions.sqrt)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("crbt") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(cbrt(typedDS('a)), sparkFunctions.cbrt)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("exp") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(exp(typedDS('a)), sparkFunctions.exp)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[BigDecimal] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("md5") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder](values: List[X1[A]]): Prop = {
      val spark = session
      import spark.implicits._

      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.md5($"a"))
        .map(_.getAs[String](0))
        .collect().toList

      val res = typedDS
        .select(md5(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[String] _))
  }

  test("factorial") {
    val spark = session

    def prop(values: List[X1[Long]]): Prop = {
      val spark = session
      import spark.implicits._

      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.factorial($"a"))
        .map(_.getAs[Long](0))
        .collect().toList

      val res = typedDS
        .select(factorial(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop _))
  }

  test("round") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder](values: List[X1[A]])(
      implicit catalystNumericWithJavaBigDecimal: CatalystNumericWithJavaBigDecimal[A, A],
      encX1: Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.round(cDS("a")))
        .map(_.getAs[A](0))
        .collect().toList


      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(round(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Double] _))
  }

  test("round big decimal") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder: Encoder](values: List[X1[A]])(
      implicit catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, java.math.BigDecimal],
      encX1:Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)

      val resCompare = cDS
        .select(sparkFunctions.round(cDS("a")))
        .map(_.getAs[java.math.BigDecimal](0))
        .collect()
        .toList.map(_.setScale(0))

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(round(col))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[BigDecimal] _))
  }

  test("round with scale") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder](values: List[X1[A]])(
      implicit catalystNumericWithJavaBigDecimal: CatalystNumericWithJavaBigDecimal[A, A],
      encX1: Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.round(cDS("a"), 1))
        .map(_.getAs[A](0))
        .collect().toList


      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(round(typedDS('a), 1))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Double] _))
  }

  test("round big decimal with scale") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder: Encoder](values: List[X1[A]])(
      implicit catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, java.math.BigDecimal],
      encX1:Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)

      val resCompare = cDS
        .select(sparkFunctions.round(cDS("a"), 0))
        .map(_.getAs[java.math.BigDecimal](0))
        .collect()
        .toList.map(_.setScale(0))

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(round(col, 0))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[BigDecimal] _))
  }

  test("bround") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder : Encoder](values: List[X1[A]])(
      implicit catalystNumericWithJavaBigDecimal: CatalystNumericWithJavaBigDecimal[A, A],
      encX1: Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.bround(cDS("a")))
        .map(_.getAs[A](0))
        .collect().toList


      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(bround(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Double] _))
    }

  test("bround big decimal") {
    val spark = session
    import spark.implicits._

    def prop[A: TypedEncoder: Encoder](values: List[X1[A]])(
      implicit catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, java.math.BigDecimal],
      encX1:Encoder[X1[A]]
    ) = {
      val cDS = session.createDataset(values)

      val resCompare = cDS
        .select(sparkFunctions.bround(cDS("a")))
        .map(_.getAs[java.math.BigDecimal](0))
        .collect()
        .toList.map(_.setScale(0))

      val typedDS = TypedDataset.create(values)
      val col = typedDS('a)
      val res = typedDS
        .select(bround(col))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[BigDecimal] _))
  }

    test("bround with scale") {
      val spark = session
      import spark.implicits._

      def prop[A: TypedEncoder : Encoder](values: List[X1[A]])(
        implicit catalystNumericWithJavaBigDecimal: CatalystNumericWithJavaBigDecimal[A, A],
        encX1: Encoder[X1[A]]
      ) = {
          val cDS = session.createDataset(values)
          val resCompare = cDS
            .select(sparkFunctions.bround(cDS("a"), 1))
            .map(_.getAs[A](0))
            .collect().toList


          val typedDS = TypedDataset.create(values)
          val res = typedDS
            .select(bround(typedDS('a), 1))
            .collect()
            .run()
            .toList

          res ?= resCompare
      }

      check(forAll(prop[Int] _))
      check(forAll(prop[Long] _))
      check(forAll(prop[Short] _))
      check(forAll(prop[Double] _))
    }

    test("bround big decimal with scale") {
      val spark = session
      import spark.implicits._

      def prop[A: TypedEncoder: Encoder](values: List[X1[A]])(
        implicit catalystAbsolute: CatalystNumericWithJavaBigDecimal[A, java.math.BigDecimal],
        encX1:Encoder[X1[A]]
      ) = {
          val cDS = session.createDataset(values)

          val resCompare = cDS
            .select(sparkFunctions.bround(cDS("a"), 0))
            .map(_.getAs[java.math.BigDecimal](0))
            .collect()
            .toList.map(_.setScale(0))

          val typedDS = TypedDataset.create(values)
          val col = typedDS('a)
          val res = typedDS
            .select(bround(col, 0))
            .collect()
            .run()
            .toList

          res ?= resCompare
      }

    check(forAll(prop[BigDecimal] _))
  }

  test("log with base") {
    val spark = session
    import spark.implicits._
    import NonNegativeArbitraryNumericValues._

    def prop[A: CatalystNumeric: TypedEncoder : Encoder](
      values: List[X1[A]],
      base: Double
    ): Prop = {
      val spark = session
      import spark.implicits._
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.log(base, $"a"))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res = typedDS
        .select(log(base, typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("log") {
    val spark = session
    import spark.implicits._
    import NonNegativeArbitraryNumericValues._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(log(typedDS('a)), sparkFunctions.log)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("log2") {
    val spark = session
    import spark.implicits._
    import NonNegativeArbitraryNumericValues._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(log2(typedDS('a)), sparkFunctions.log2)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("log1p") {
    val spark = session
    import spark.implicits._
    import NonNegativeArbitraryNumericValues._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(log1p(typedDS('a)), sparkFunctions.log1p)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("log10") {
    val spark = session
    import spark.implicits._
    import NonNegativeArbitraryNumericValues._

    def prop[A: CatalystNumeric : TypedEncoder : Encoder](values: List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val typedDS = TypedDataset.create(values)
      mathProp(typedDS)(log10(typedDS('a)), sparkFunctions.log10)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("base64") {
    val spark = session
    import spark.implicits._

    def prop(values:List[X1[Array[Byte]]])(implicit encX1:Encoder[X1[Array[Byte]]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.base64(cDS("a")))
        .map(_.getAs[String](0))
        .collect().toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(base64(typedDS('a)))
        .collect()
        .run()
        .toList

      val backAndForth = typedDS
        .select(base64(unbase64(base64(typedDS('a)))))
        .collect()
        .run()
        .toList

      (res ?= resCompare) && (res ?= backAndForth)
    }

    check(forAll(prop _))
  }

  test("hypot with double") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric: TypedEncoder : Encoder](
      values: List[X1[A]],
      base: Double
    ): Prop = {
      val spark = session
      import spark.implicits._
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.hypot(base, $"a"))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res2 = typedDS
        .select(hypot(typedDS('a), base))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      val res = typedDS
        .select(hypot(base, typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      (res ?= resCompare) && (res2 ?= resCompare)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("hypot with two columns") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric: TypedEncoder : Encoder](
      values: List[X2[A, A]]
    ): Prop = {
      val spark = session
      import spark.implicits._
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.hypot($"b", $"a"))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res = typedDS
        .select(hypot(typedDS('b), typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("pow with double") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric: TypedEncoder : Encoder](
      values: List[X1[A]],
      base: Double
    ): Prop = {
      val spark = session
      import spark.implicits._
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.pow(base, $"a"))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res = typedDS
        .select(pow(base, typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      val resCompare2 = typedDS.dataset
        .select(sparkFunctions.pow($"a", base))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res2 = typedDS
        .select(pow(typedDS('a), base))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      (res ?= resCompare) &&  (res2 ?= resCompare2)
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("pow with two columns") {
    val spark = session
    import spark.implicits._

    def prop[A: CatalystNumeric: TypedEncoder : Encoder](
      values: List[X2[A, A]]
    ): Prop = {
      val spark = session
      import spark.implicits._
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.pow($"b", $"a"))
        .map(_.getAs[Double](0))
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect().toList

      val res = typedDS
        .select(pow(typedDS('b), typedDS('a)))
        .deserialized
        .map(DoubleBehaviourUtils.nanNullHandler)
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("pmod") {
    val spark = session
    import spark.implicits._
    import NonNegativeArbitraryNumericValues._

    def prop[A: CatalystNumeric: TypedEncoder : Encoder](
      values: List[X2[A, A]]
    ): Prop = {
      val spark = session
      import spark.implicits._
      val typedDS = TypedDataset.create(values)

      val resCompare = typedDS.dataset
        .select(sparkFunctions.pmod($"b", $"a"))
        .map(_.getAs[A](0))
        .collect().toList

      val res = typedDS
        .select(pmod(typedDS('b), typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Double] _))
  }

  test("unbase64") {
    val spark = session
    import spark.implicits._

    def prop(values: List[X1[String]])(implicit encX1: Encoder[X1[String]]) = {
      val valuesBase64 = values.map(base64X1String)
      val cDS = session.createDataset(valuesBase64)
      val resCompare = cDS
        .select(sparkFunctions.unbase64(cDS("a")))
        .map(_.getAs[Array[Byte]](0))
        .collect().toList

      val typedDS = TypedDataset.create(valuesBase64)
      val res = typedDS
        .select(unbase64(typedDS('a)))
        .collect()
        .run()
        .toList

      res.map(_.toList) ?= resCompare.map(_.toList)
    }

    check(forAll(prop _))
  }

  test("bin"){
    val spark = session
    import spark.implicits._

    def prop(values:List[X1[Long]])(implicit encX1:Encoder[X1[Long]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.bin(cDS("a")))
        .map(_.getAs[String](0))
        .collect().toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(bin(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop _))
  }

  test("bitwiseNOT"){
    val spark = session
    import spark.implicits._

    @nowarn // supress sparkFunctions.bitwiseNOT call which is used to maintain Spark 3.1.x backwards compat
    def prop[A: CatalystBitwise : TypedEncoder : Encoder]
    (values:List[X1[A]])(implicit encX1:Encoder[X1[A]]) = {
      val cDS = session.createDataset(values)
      val resCompare = cDS
        .select(sparkFunctions.bitwiseNOT(cDS("a")))
        .map(_.getAs[A](0))
        .collect().toList

      val typedDS = TypedDataset.create(values)
      val res = typedDS
        .select(bitwiseNOT(typedDS('a)))
        .collect()
        .run()
        .toList

      res ?= resCompare
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Int] _))
  }

  test("inputFileName") {
    val spark = session
    import spark.implicits._

    def prop[A : TypedEncoder](
      toFile1: List[X1[A]],
      toFile2: List[X1[A]],
      inMem: List[X1[A]]
    )(implicit x2Gen: Encoder[X2[A, String]], x3Gen: Encoder[X3[A, String, String]]) = {

      val file1Path = testTempFiles + "/file1"
      val file2Path = testTempFiles + "/file2"

      val toFile1WithName = toFile1.map(x => X2(x.a, "file1"))
      val toFile2WithName = toFile2.map(x => X2(x.a, "file2"))
      val inMemWithName = inMem.map(x => X2(x.a, ""))

      toFile1WithName.toDS().write.mode(SaveMode.Overwrite).parquet(file1Path)
      toFile2WithName.toDS().write.mode(SaveMode.Overwrite).parquet(file2Path)

      val readBackIn1 = spark.read.parquet(file1Path).as[X2[A, String]]
      val readBackIn2 = spark.read.parquet(file2Path).as[X2[A, String]]

      val ds1 = TypedDataset.create(readBackIn1)
      val ds2 = TypedDataset.create(readBackIn2)
      val ds3 = TypedDataset.create(inMemWithName)

      val unioned = ds1.union(ds2).union(ds3)

      val withFileName = unioned.withColumn[X3[A, String, String]](inputFileName[X2[A, String]]())
        .collect()
        .run()
        .toVector

      val grouped = withFileName.groupBy(_.b).mapValues(_.map(_.c).toSet)

      grouped.foldLeft(passed) { (p, g) =>
        p && secure { g._1 match {
          case "" => g._2.head == "" //Empty string if didn't come from file
          case f => g._2.forall(_.contains(f))
        }}}
    }

    check(forAll(prop[String] _))
  }

  test("monotonic id") {
    val spark = session
    import spark.implicits._

    def prop[A : TypedEncoder](xs: List[X1[A]])(implicit x2en: Encoder[X2[A, Long]]) = {
      val ds = TypedDataset.create(xs)

      val result = ds.withColumn[X2[A, Long]](monotonicallyIncreasingId())
        .collect()
        .run()
        .toVector

      val ids = result.map(_.b)
      (ids.toSet.size ?= ids.length) &&
        (ids.sorted ?= ids)
    }

    check(forAll(prop[String] _))
  }

  test("when") {
    val spark = session
    import spark.implicits._

    def prop[A : TypedEncoder : Encoder]
    (condition1: Boolean, condition2: Boolean, value1: A, value2: A, otherwise: A) = {
      val ds = TypedDataset.create(X5(condition1, condition2, value1, value2, otherwise) :: Nil)

      val untypedWhen = ds.toDF()
        .select(
          sparkFunctions.when(sparkFunctions.col("a"), sparkFunctions.col("c"))
            .when(sparkFunctions.col("b"), sparkFunctions.col("d"))
            .otherwise(sparkFunctions.col("e"))
        )
        .as[A]
        .collect()
        .toList

      val typedWhen = ds
        .select(
          when(ds('a), ds('c))
            .when(ds('b), ds('d))
            .otherwise(ds('e))
        )
        .collect()
        .run()
        .toList

      typedWhen ?= untypedWhen
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Short] _))
    check(forAll(prop[Byte] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Option[Int]] _))
  }

  test("ascii") {
    val spark = session
    import spark.implicits._
    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.ascii($"a"))
        .map(_.getAs[Int](0))
        .collect()
        .toVector

      val typed = ds
        .select(ascii(ds('a)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("concat") {
    val spark = session
    import spark.implicits._

    val pairs = for {
      y <- Gen.alphaStr
      x <- Gen.nonEmptyListOf(X2(y, y))
    } yield x

    check(forAll(pairs) { values: List[X2[String, String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.concat($"a", $"b"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(concat(ds('a), ds('b)))
        .collect()
        .run()
        .toVector

      (typed ?= sparkResult).&&(typed ?= values.map(x => s"${x.a}${x.b}").toVector)
    })
  }

  test("concat for TypedAggregate") {
    val spark = session
    import frameless.functions.aggregate._
    import spark.implicits._
    val pairs = for {
      y <- Gen.alphaStr
      x <- Gen.nonEmptyListOf(X2(y, y))
    } yield x

    check(forAll(pairs) { values: List[X2[String, String]] =>
      val ds = TypedDataset.create(values)
      val td = ds.agg(concat(first(ds('a)),first(ds('b)))).collect().run().toVector
      val spark = ds.dataset.select(sparkFunctions.concat(
        sparkFunctions.first($"a").as[String],
        sparkFunctions.first($"b").as[String])).as[String].collect().toVector
      td ?= spark
    })
  }

  test("concat_ws") {
    val spark = session
    import spark.implicits._

    val pairs = for {
      y <- Gen.alphaStr
      x <- Gen.nonEmptyListOf(X2(y, y))
    } yield x

    check(forAll(pairs) { values: List[X2[String, String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.concat_ws(",", $"a", $"b"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(concatWs(",", ds('a), ds('b)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("concat_ws for TypedAggregate") {
    val spark = session
    import frameless.functions.aggregate._
    import spark.implicits._
    val pairs = for {
      y <- Gen.alphaStr
      x <- Gen.listOfN(10, X2(y, y))
    } yield x

    check(forAll(pairs) { values: List[X2[String, String]] =>
      val ds = TypedDataset.create(values)
      val td = ds.agg(concatWs(",",first(ds('a)),first(ds('b)), last(ds('b)))).collect().run().toVector
      val spark = ds.dataset.select(sparkFunctions.concat_ws(",",
        sparkFunctions.first($"a").as[String],
        sparkFunctions.first($"b").as[String],
        sparkFunctions.last($"b").as[String])).as[String].collect().toVector
      td ?= spark
    })
  }

  test("instr") {
    val spark = session
    import spark.implicits._
    check(forAll(Gen.nonEmptyListOf(Gen.alphaStr)) { values: List[String] =>
      val ds = TypedDataset.create(values.map(x => X1(x + values.head)))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.instr($"a", values.head))
        .map(_.getAs[Int](0))
        .collect()
        .toVector

      val typed = ds
        .select(instr(ds('a), values.head))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("length") {
    val spark = session
    import spark.implicits._
    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.length($"a"))
        .map(_.getAs[Int](0))
        .collect()
        .toVector

      val typed = ds
        .select(length(ds[String]('a)))
        .collect()
        .run()
        .toVector

      (typed ?= sparkResult).&&(values.map(_.a.length).toVector ?= typed)
    })
  }

  test("levenshtein") {
    val spark = session
    import spark.implicits._
    check(forAll { (na: X1[String], values: List[X1[String]]) =>
      val ds = TypedDataset.create(na +: values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.levenshtein($"a", sparkFunctions.concat($"a",sparkFunctions.lit("Hello"))))
        .map(_.getAs[Int](0))
        .collect()
        .toVector

      val typed = ds
        .select(levenshtein(ds('a), concat(ds('a),lit("Hello"))))
        .collect()
        .run()
        .toVector

      val cDS = ds.dataset
      val aggrTyped = ds.agg(
        levenshtein(frameless.functions.aggregate.first(ds('a)), litAggr("Hello"))
      ).firstOption().run().get

      val aggrSpark = cDS.select(
        sparkFunctions.levenshtein(sparkFunctions.first("a"), sparkFunctions.lit("Hello")).as[Int]
      ).first()

      (typed ?= sparkResult).&&(aggrTyped ?= aggrSpark)
    })
  }

  test("regexp_replace") {
    val spark = session
    import spark.implicits._
    check(forAll { (values: List[X1[String]], n: Int) =>
      val ds = TypedDataset.create(values.map(x => X1(s"$n${x.a}-$n$n")))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.regexp_replace($"a", "\\d+", "n"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(regexpReplace(ds[String]('a), "\\d+".r, "n"))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("reverse") {
    val spark = session
    import spark.implicits._
    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.reverse($"a"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(reverse(ds[String]('a)))
        .collect()
        .run()
        .toVector

      (typed ?= sparkResult).&&(values.map(_.a.reverse).toVector ?= typed)
    })
  }

  test("rpad") {
    val spark = session
    import spark.implicits._
    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.rpad($"a", 5, "hello"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(rpad(ds[String]('a), 5, "hello"))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("lpad") {
    val spark = session
    import spark.implicits._
    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.lpad($"a", 5, "hello"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(lpad(ds[String]('a), 5, "hello"))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("rtrim") {
    val spark = session
    import spark.implicits._
    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values.map(x => X1(s"  ${x.a}    ")))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.rtrim($"a"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(rtrim(ds[String]('a)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("ltrim") {
    val spark = session
    import spark.implicits._
    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values.map(x => X1(s"  ${x.a}    ")))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.ltrim($"a"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(ltrim(ds[String]('a)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("substring") {
    val spark = session
    import spark.implicits._
    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values)

      val sparkResult = ds.toDF()
        .select(sparkFunctions.substring($"a", 5, 3))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(substring(ds[String]('a), 5, 3))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("trim") {
    val spark = session
    import spark.implicits._
    check(forAll { values: List[X1[String]] =>
      val ds = TypedDataset.create(values.map(x => X1(s"  ${x.a}    ")))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.trim($"a"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(trim(ds[String]('a)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("upper") {
    val spark = session
    import spark.implicits._
    check(forAll(Gen.listOf(Gen.alphaStr)) { values: List[String] =>
      val ds = TypedDataset.create(values.map(X1(_)))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.upper($"a"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(upper(ds[String]('a)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("lower") {
    val spark = session
    import spark.implicits._
    check(forAll(Gen.listOf(Gen.alphaStr)) { values: List[String] =>
      val ds = TypedDataset.create(values.map(X1(_)))

      val sparkResult = ds.toDF()
        .select(sparkFunctions.lower($"a"))
        .map(_.getAs[String](0))
        .collect()
        .toVector

      val typed = ds
        .select(lower(ds[String]('a)))
        .collect()
        .run()
        .toVector

      typed ?= sparkResult
    })
  }

  test("Empty vararg tests") {
    def prop[A : TypedEncoder, B: TypedEncoder](data: Vector[X2[A, B]]) = {
      val ds = TypedDataset.create(data)
      val frameless = ds.select(ds('a), concat(), ds('b), concatWs(":")).collect().run().toVector
      val framelessAggr = ds.agg(concat(), concatWs("x"), litAggr(2)).collect().run().toVector
      val scala = data.map(x => (x.a, "", x.b, ""))
      val scalaAggr = Vector(("", "", 2))
      (frameless ?= scala).&&(framelessAggr ?= scalaAggr)
    }

    check(forAll(prop[Long, Long] _))
    check(forAll(prop[Option[Boolean], Long] _))
  }

  def dateTimeStringProp(typedDS: TypedDataset[X1[String]])
                        (typedCol: TypedColumn[X1[String], Option[Int]], sparkFunc: Column => Column): Prop = {
    val spark = session
    import spark.implicits._

    val sparkResult = typedDS.dataset
      .select(sparkFunc($"a"))
      .map(DateTimeStringBehaviourUtils.nullHandler)
      .collect()
      .toList

    val typed = typedDS
      .select(typedCol)
      .collect()
      .run()
      .toList

    typed ?= sparkResult
  }

  test("year") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
        val ds = TypedDataset.create(data)
        dateTimeStringProp(ds)(year(ds[String]('a)), sparkFunctions.year)
      }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("quarter") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(quarter(ds[String]('a)), sparkFunctions.quarter)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("month") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(month(ds[String]('a)), sparkFunctions.month)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("dayofweek") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(dayofweek(ds[String]('a)), sparkFunctions.dayofweek)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("dayofmonth") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(dayofmonth(ds[String]('a)), sparkFunctions.dayofmonth)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("dayofyear") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(dayofyear(ds[String]('a)), sparkFunctions.dayofyear)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("hour") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(hour(ds[String]('a)), sparkFunctions.hour)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("minute") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(minute(ds[String]('a)), sparkFunctions.minute)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("second") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(second(ds[String]('a)), sparkFunctions.second)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }

  test("weekofyear") {
    val spark = session
    import spark.implicits._

    def prop(data: List[X1[String]])(implicit E: Encoder[Option[Int]]): Prop = {
      val ds = TypedDataset.create(data)
      dateTimeStringProp(ds)(weekofyear(ds[String]('a)), sparkFunctions.weekofyear)
    }

    check(forAll(dateTimeStringGen)(data => prop(data.map(X1.apply))))
    check(forAll(prop _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/functions/UdfTests.scala
================================================
package frameless
package functions

import org.scalacheck.Prop
import org.scalacheck.Prop._

class UdfTests extends TypedDatasetSuite {

  test("one argument udf") {
    def prop[A: TypedEncoder, B: TypedEncoder](data: Vector[X1[A]], f1: A => B): Prop = {
      val dataset: TypedDataset[X1[A]] = TypedDataset.create(data)
      val u1 = udf[X1[A], A, B](f1)
      val u2 = dataset.makeUDF(f1)
      val A = dataset.col[A]('a)

      // filter forces whole codegen
      val codegen = dataset.deserialized.filter((_:X1[A]) => true).select(u1(A)).collect().run().toVector

      // otherwise it uses local relation
      val local = dataset.select(u2(A)).collect().run().toVector

      val d = data.map(x => f1(x.a))

      (codegen ?= d) && (local ?= d)
    }

    check(forAll(prop[Int, Int] _))
    check(forAll(prop[String, String] _))
    check(forAll(prop[Option[Int], Option[Int]] _))
    check(forAll(prop[X1[Int], X1[Int]] _))
    check(forAll(prop[X1[Option[Int]], X1[Option[Int]]] _))

    // TODO doesn't work for the same reason as `collect`
    // check(forAll(prop[X1[Option[X1[Int]]], X1[Option[X1[Option[Int]]]]] _))

    check(forAll(prop[Option[Vector[String]], Option[Vector[String]]] _))

    def prop2[A: TypedEncoder, B: TypedEncoder](f: A => B)(a: A): Prop = prop(Vector(X1(a)), f)

    check(forAll(prop2[Int, Option[Int]](x => if (x % 2 == 0) Some(x) else None) _))
    check(forAll(prop2[Option[Int], Int](x => x getOrElse 0) _))
  }

  test("multiple one argument udf") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder]
    (data: Vector[X3[A, B, C]], f1: A => A, f2: B => B, f3: C => C): Prop = {
      val dataset = TypedDataset.create(data)
      val u11 = udf[X3[A, B, C], A, A](f1)
      val u21 = udf[X3[A, B, C], B, B](f2)
      val u31 = udf[X3[A, B, C], C, C](f3)
      val u12 = dataset.makeUDF(f1)
      val u22 = dataset.makeUDF(f2)
      val u32 = dataset.makeUDF(f3)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val dataset21 = dataset.select(u11(A), u21(B), u31(C)).collect().run().toVector
      val dataset22 = dataset.select(u12(A), u22(B), u32(C)).collect().run().toVector
      val d = data.map(x => (f1(x.a), f2(x.b), f3(x.c)))

      (dataset21 ?= d) && (dataset22 ?= d)
    }

    check(forAll(prop[Int, Int, Int] _))
    check(forAll(prop[String, Int, Int] _))
    check(forAll(prop[X3[Int, String, Boolean], Int, Int] _))
    check(forAll(prop[X3U[Int, String, Boolean], Int, Int] _))
  }

  test("two argument udf") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder]
    (data: Vector[X3[A, B, C]], f1: (A, B) => C): Prop = {
      val dataset = TypedDataset.create(data)
      val u1 = udf[X3[A, B, C], A, B, C](f1)
      val u2 = dataset.makeUDF(f1)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val dataset21 = dataset.select(u1(A, B)).collect().run().toVector
      val dataset22 = dataset.select(u2(A, B)).collect().run().toVector
      val d = data.map(x => f1(x.a, x.b))

      (dataset21 ?= d) && (dataset22 ?= d)
    }

    check(forAll(prop[Int, Int, Int] _))
    check(forAll(prop[String, Int, Int] _))
  }

  test("multiple two argument udf") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder]
    (data: Vector[X3[A, B, C]], f1: (A, B) => C, f2: (B, C) => A): Prop = {
      val dataset = TypedDataset.create(data)
      val u11 = udf[X3[A, B, C], A, B, C](f1)
      val u12 = dataset.makeUDF(f1)
      val u21 = udf[X3[A, B, C], B, C, A](f2)
      val u22 = dataset.makeUDF(f2)

      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val dataset21 = dataset.select(u11(A, B), u21(B, C)).collect().run().toVector
      val dataset22 = dataset.select(u12(A, B), u22(B, C)).collect().run().toVector
      val d = data.map(x => (f1(x.a, x.b), f2(x.b, x.c)))

      (dataset21 ?= d) && (dataset22 ?= d)
    }

    check(forAll(prop[Int, Int, Int] _))
    check(forAll(prop[String, Int, Int] _))
  }

  test("three argument udf") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder]
    (data: Vector[X3[A, B, C]], f: (A, B, C) => C): Prop = {
      val dataset = TypedDataset.create(data)
      val u1 = udf[X3[A, B, C], A, B, C, C](f)
      val u2 = dataset.makeUDF(f)

      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val dataset21 = dataset.select(u1(A, B, C)).collect().run().toVector
      val dataset22 = dataset.select(u2(A, B, C)).collect().run().toVector
      val d = data.map(x => f(x.a, x.b, x.c))

      (dataset21 ?= d) && (dataset22 ?= d)
    }

    check(forAll(prop[Int, Int, Int] _))
    check(forAll(prop[String, Int, Int] _))
  }

  test("four argument udf") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder, D: TypedEncoder]
    (data: Vector[X4[A, B, C, D]], f: (A, B, C, D) => C): Prop = {
      val dataset = TypedDataset.create(data)
      val u1 = udf[X4[A, B, C, D], A, B, C, D, C](f)
      val u2 = dataset.makeUDF(f)

      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)
      val D = dataset.col[D]('d)

      val dataset21 = dataset.select(u1(A, B, C, D)).collect().run().toVector
      val dataset22 = dataset.select(u2(A, B, C, D)).collect().run().toVector
      val d = data.map(x => f(x.a, x.b, x.c, x.d))

      (dataset21 ?= d) && (dataset22 ?= d)
    }

    check(forAll(prop[Int, Int, Int, Int] _))
    check(forAll(prop[String, Int, Int, String] _))
    check(forAll(prop[String, String, String, String] _))
    check(forAll(prop[String, Long, String, String] _))
    check(forAll(prop[String, Boolean, Boolean, String] _))
  }

  test("five argument udf") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder, D: TypedEncoder, E: TypedEncoder]
    (data: Vector[X5[A, B, C, D, E]], f: (A, B, C, D, E) => C): Prop = {
      val dataset = TypedDataset.create(data)
      val u1 = udf[X5[A, B, C, D, E], A, B, C, D, E, C](f)
      val u2 = dataset.makeUDF(f)

      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)
      val D = dataset.col[D]('d)
      val E = dataset.col[E]('e)

      val dataset21 = dataset.select(u1(A, B, C, D, E)).collect().run().toVector
      val dataset22 = dataset.select(u2(A, B, C, D, E)).collect().run().toVector
      val d = data.map(x => f(x.a, x.b, x.c, x.d, x.e))

      (dataset21 ?= d) && (dataset22 ?= d)
    }

    check(forAll(prop[Int, Int, Int, Int, Int] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/functions/UnaryFunctionsTest.scala
================================================
package frameless
package functions

import org.scalacheck.{ Arbitrary, Prop }
import org.scalacheck.Prop._
import scala.collection.SeqLike

import scala.math.Ordering
import scala.reflect.ClassTag

class UnaryFunctionsTest extends TypedDatasetSuite {
  test("size tests") {
    def prop[F[X] <: Traversable[X] : CatalystSizableCollection, A](xs: List[X1[F[A]]])(implicit arb: Arbitrary[F[A]], enc: TypedEncoder[F[A]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(size(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.size).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Vector, Long] _))
    check(forAll(prop[List, Long] _))
    check(forAll(prop[Vector, Char] _))
    check(forAll(prop[List, Char] _))
    check(forAll(prop[Vector, X2[Int, Option[Long]]] _))
    check(forAll(prop[List, X2[Int, Option[Long]]] _))
  }

  test("size on array test") {
    def prop[A: TypedEncoder: ClassTag](xs: List[X1[Array[A]]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(size(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.size).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[String] _))
    check(forAll(prop[X2[Int, Option[Long]]] _))
  }

  test("size on Map") {
    def prop[A](xs: List[X1[Map[A, A]]])(implicit arb: Arbitrary[Map[A, A]], enc: TypedEncoder[Map[A, A]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(size(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.size).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[Char] _))
  }

  test("sort in ascending order") {
    def prop[F[X] <: SeqLike[X, F[X]] : CatalystSortableCollection, A: Ordering](xs: List[X1[F[A]]])(implicit enc: TypedEncoder[F[A]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(sortAscending(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.sorted).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Vector, Long] _))
    check(forAll(prop[Vector, Int] _))
    check(forAll(prop[Vector, Char] _))
    check(forAll(prop[Vector, String] _))
    check(forAll(prop[List, Long] _))
    check(forAll(prop[List, Int] _))
    check(forAll(prop[List, Char] _))
    check(forAll(prop[List, String] _))
  }

  test("sort in descending order") {
    def prop[F[X] <: SeqLike[X, F[X]] : CatalystSortableCollection, A: Ordering](xs: List[X1[F[A]]])(implicit enc: TypedEncoder[F[A]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(sortDescending(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.sorted.reverse).toVector

      framelessResults ?= scalaResults
    }

    check(forAll(prop[Vector, Long] _))
    check(forAll(prop[Vector, Int] _))
    check(forAll(prop[Vector, Char] _))
    check(forAll(prop[Vector, String] _))
    check(forAll(prop[List, Long] _))
    check(forAll(prop[List, Int] _))
    check(forAll(prop[List, Char] _))
    check(forAll(prop[List, String] _))
  }

  test("sort on array test: ascending order") {
    def prop[A: TypedEncoder : Ordering : ClassTag](xs: List[X1[Array[A]]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(sortAscending(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.sorted).toVector

      Prop {
        framelessResults
          .zip(scalaResults)
          .forall {
            case (a, b) => a sameElements b
          }
      }
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }

  test("sort on array test: descending order") {
    def prop[A: TypedEncoder : Ordering : ClassTag](xs: List[X1[Array[A]]]): Prop = {
      val tds = TypedDataset.create(xs)

      val framelessResults = tds.select(sortDescending(tds('a))).collect().run().toVector
      val scalaResults = xs.map(x => x.a.sorted.reverse).toVector

      Prop {
        framelessResults
          .zip(scalaResults)
          .forall {
            case (a, b) => a sameElements b
          }
      }
    }

    check(forAll(prop[Long] _))
    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/ops/ColumnTypesTest.scala
================================================
package frameless
package ops

import org.scalacheck.Prop
import org.scalacheck.Prop.forAll
import shapeless.HNil
import shapeless.::

class ColumnTypesTest extends TypedDatasetSuite {
  test("test summoning") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder, D: TypedEncoder](data: Vector[X4[A, B, C, D]]): Prop = {
      val d: TypedDataset[X4[A, B, C, D]] = TypedDataset.create(data)
      val hlist = d('a) :: d('b) :: d('c) :: d('d) :: HNil

      type TC[N] = TypedColumn[X4[A,B,C,D], N]

      type IN = TC[A] :: TC[B] :: TC[C] :: TC[D] :: HNil
      type OUT = A :: B :: C :: D :: HNil

      implicitly[ColumnTypes.Aux[X4[A,B,C,D], IN, OUT]]
      Prop.passed // successful compilation implies test correctness
    }

    check(forAll(prop[Int, String, X1[String], Boolean] _))
    check(forAll(prop[Vector[Int], Vector[Vector[String]], X1[String], Option[String]] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/ops/CubeTests.scala
================================================
package frameless
package ops

import frameless.functions.aggregate._
import org.scalacheck.Prop
import org.scalacheck.Prop._

class CubeTests extends TypedDatasetSuite {

  test("cube('a).agg(count())") {
    def prop[A: TypedEncoder : Ordering, Out: TypedEncoder : Numeric]
    (data: List[X1[A]])(implicit summable: CatalystSummable[A, Out]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val received = dataset.cube(A).agg(count()).collect().run().toVector.sortBy(_._2)
      val expected = dataset.dataset.cube("a").count().collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[Long](1))).sortBy(_._2)

      received ?= expected
    }

    check(forAll(prop[Int, Long] _))
  }

  test("cube('a, 'b).agg(count())") {
    def prop[A: TypedEncoder : Ordering, B: TypedEncoder, Out: TypedEncoder : Numeric]
    (data: List[X2[A, B]])(implicit summable: CatalystSummable[B, Out]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val received = dataset.cube(A, B).agg(count()).collect().run().toVector.sortBy(_._3)
      val expected = dataset.dataset.cube("a", "b").count().collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[Long](2))).sortBy(_._3)

      received ?= expected
    }

    check(forAll(prop[Int, Long, Long] _))
  }

  test("cube('a).agg(sum('b)") {
    def prop[A: TypedEncoder : Ordering, B: TypedEncoder, Out: TypedEncoder : Numeric]
    (data: List[X2[A, B]])(implicit summable: CatalystSummable[B, Out]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val received = dataset.cube(A).agg(sum(B)).collect().run().toVector.sortBy(_._2)
      val expected = dataset.dataset.cube("a").sum("b").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[Out](1))).sortBy(_._2)

      received ?= expected
    }

    check(forAll(prop[Int, Long, Long] _))
  }

  test("cube('a).mapGroups('a, sum('b))") {
    def prop[A: TypedEncoder : Ordering, B: TypedEncoder : Numeric]
    (data: List[X2[A, B]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val received = dataset.cube(A)
        .deserialized.mapGroups { case (a, xs) => (a, xs.map(_.b).sum) }
        .collect().run().toVector.sortBy(_._1)
      val expected = data.groupBy(_.a).mapValues(_.map(_.b).sum).toVector.sortBy(_._1)

      received ?= expected
    }

    check(forAll(prop[Int, Long] _))
  }

  test("cube('a).agg(sum('b), sum('c)) to cube('a).agg(sum('a), sum('b), sum('a), sum('b), sum('a))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder,
    C: TypedEncoder,
    OutB: TypedEncoder : Numeric,
    OutC: TypedEncoder : Numeric
    ](data: List[X3[A, B, C]])(
      implicit
      summableB: CatalystSummable[B, OutB],
      summableC: CatalystSummable[C, OutC]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val framelessSumBC = dataset
        .cube(A)
        .agg(sum(B), sum(C))
        .collect().run().toVector.sortBy(_._1)

      val sparkSumBC = dataset.dataset.cube("a").sum("b", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2)))
        .sortBy(_._1)

      val framelessSumBCB = dataset
        .cube(A)
        .agg(sum(B), sum(C), sum(B))
        .collect().run().toVector.sortBy(_._1)

      val sparkSumBCB = dataset.dataset.cube("a").sum("b", "c", "b").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2), row.getAs[OutB](3)))
        .sortBy(_._1)

      val framelessSumBCBC = dataset
        .cube(A)
        .agg(sum(B), sum(C), sum(B), sum(C))
        .collect().run().toVector.sortBy(_._1)

      val sparkSumBCBC = dataset.dataset.cube("a").sum("b", "c", "b", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2), row.getAs[OutB](3), row.getAs[OutC](4)))
        .sortBy(_._1)

      val framelessSumBCBCB = dataset
        .cube(A)
        .agg(sum(B), sum(C), sum(B), sum(C), sum(B))
        .collect().run().toVector.sortBy(_._1)

      val sparkSumBCBCB = dataset.dataset.cube("a").sum("b", "c", "b", "c", "b").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2), row.getAs[OutB](3), row.getAs[OutC](4), row.getAs[OutB](5)))
        .sortBy(_._1)

      (framelessSumBC ?= sparkSumBC)
        .&&(framelessSumBCB ?= sparkSumBCB)
        .&&(framelessSumBCBC ?= sparkSumBCBC)
        .&&(framelessSumBCBCB ?= sparkSumBCBCB)
    }

    check(forAll(prop[String, Long, Double, Long, Double] _))
  }

  test("cube('a, 'b).agg(sum('c), sum('d))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder,
    D: TypedEncoder,
    OutC: TypedEncoder : Numeric,
    OutD: TypedEncoder : Numeric
    ](data: List[X4[A, B, C, D]])(
      implicit
      summableC: CatalystSummable[C, OutC],
      summableD: CatalystSummable[D, OutD]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)
      val D = dataset.col[D]('d)

      val framelessSumByAB = dataset
        .cube(A, B)
        .agg(sum(C), sum(D))
        .collect().run().toVector.sortBy(x => (x._1, x._2))

      val sparkSumByAB = dataset.dataset
        .cube("a", "b").sum("c", "d").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutD](3)))
        .sortBy(x => (x._1, x._2))

      framelessSumByAB ?= sparkSumByAB
    }

    check(forAll(prop[Byte, Int, Long, Double, Long, Double] _))
  }

  test("cube('a, 'b).agg(sum('c)) to cube('a, 'b).agg(sum('c),sum('c),sum('c),sum('c),sum('c))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder,
    OutC: TypedEncoder: Numeric
    ](data: List[X3[A, B, C]])(implicit summableC: CatalystSummable[C, OutC]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val framelessSumC = dataset
        .cube(A, B)
        .agg(sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumC = dataset.dataset
        .cube("a", "b").sum("c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2)))
        .sortBy(_._2)

      val framelessSumCC = dataset
        .cube(A, B)
        .agg(sum(C), sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumCC = dataset.dataset
        .cube("a", "b").sum("c", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3)))
        .sortBy(_._2)

      val framelessSumCCC = dataset
        .cube(A, B)
        .agg(sum(C), sum(C), sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumCCC = dataset.dataset
        .cube("a", "b").sum("c", "c", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3), row.getAs[OutC](4)))
        .sortBy(_._2)

      val framelessSumCCCC = dataset
        .cube(A, B)
        .agg(sum(C), sum(C), sum(C), sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumCCCC = dataset.dataset
        .cube("a", "b").sum("c", "c", "c", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3), row.getAs[OutC](4), row.getAs[OutC](5)))
        .sortBy(_._2)

      val framelessSumCCCCC = dataset
        .cube(A, B)
        .agg(sum(C), sum(C), sum(C), sum(C), sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumCCCCC = dataset.dataset
        .cube("a", "b").sum("c", "c", "c", "c", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3), row.getAs[OutC](4), row.getAs[OutC](5), row.getAs[OutC](6)))
        .sortBy(_._2)

      (framelessSumC ?= sparkSumC) &&
        (framelessSumCC ?= sparkSumCC) &&
        (framelessSumCCC ?= sparkSumCCC) &&
        (framelessSumCCCC ?= sparkSumCCCC) &&
        (framelessSumCCCCC ?= sparkSumCCCCC)
    }

    check(forAll(prop[String, Long, Double, Double] _))
  }

  test("cube('a, 'b).mapGroups('a, 'b, sum('c))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder : Numeric
    ](data: List[X3[A, B, C]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val framelessSumByAB = dataset
        .cube(A, B)
        .deserialized.mapGroups { case ((a, b), xs) => (a, b, xs.map(_.c).sum) }
        .collect().run().toVector.sortBy(x => (x._1, x._2))

      val sumByAB = data.groupBy(x => (x.a, x.b))
        .mapValues { xs => xs.map(_.c).sum }
        .toVector.map { case ((a, b), c) => (a, b, c) }.sortBy(x => (x._1, x._2))

      framelessSumByAB ?= sumByAB
    }

    check(forAll(prop[Byte, Int, Long] _))
  }

  test("cube('a).mapGroups(('a, toVector(('a, 'b))") {
    def prop[
    A: TypedEncoder: Ordering,
    B: TypedEncoder: Ordering,
    ](data: Vector[X2[A, B]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val datasetGrouped = dataset
        .cube(A)
        .deserialized.mapGroups((a, xs) => (a, xs.toVector.sorted))
        .collect().run().toMap

      val dataGrouped = data.groupBy(_.a).map { case (k, v) => k -> v.sorted }

      datasetGrouped ?= dataGrouped
    }

    check(forAll(prop[Short, Option[Short]] _))
    check(forAll(prop[Option[Short], Short] _))
    check(forAll(prop[X1[Option[Short]], Short] _))
  }

  test("cube('a).flatMapGroups(('a, toVector(('a, 'b))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering
    ](data: Vector[X2[A, B]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val datasetGrouped = dataset
        .cube(A)
        .deserialized.flatMapGroups((a, xs) => xs.map(x => (a, x)))
        .collect().run()
        .sorted

      val dataGrouped = data
        .groupBy(_.a).toSeq
        .flatMap { case (a, xs) => xs.map(x => (a, x)) }
        .sorted

      datasetGrouped ?= dataGrouped
    }

    check(forAll(prop[Short, Option[Short]] _))
    check(forAll(prop[Option[Short], Short] _))
    check(forAll(prop[X1[Option[Short]], Short] _))
  }

  test("cube('a, 'b).flatMapGroups((('a,'b) toVector((('a,'b), 'c))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder : Ordering
    ](data: Vector[X3[A, B, C]]): Prop = {
      val dataset = TypedDataset.create(data)
      val cA = dataset.col[A]('a)
      val cB = dataset.col[B]('b)

      val datasetGrouped = dataset
        .cube(cA, cB)
        .deserialized.flatMapGroups((a, xs) => xs.map(x => (a, x)))
        .collect().run()
        .sorted

      val dataGrouped = data
        .groupBy(t => (t.a, t.b)).toSeq
        .flatMap { case (a, xs) => xs.map(x => (a, x)) }
        .sorted

      datasetGrouped ?= dataGrouped
    }

    check(forAll(prop[Short, Option[Short], Long] _))
    check(forAll(prop[Option[Short], Short, Int] _))
    check(forAll(prop[X1[Option[Short]], Short, Byte] _))
  }

  test("cubeMany('a).agg(sum('b))") {
    def prop[A: TypedEncoder : Ordering, Out: TypedEncoder : Numeric]
    (data: List[X1[A]])(implicit summable: CatalystSummable[A, Out]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val received = dataset.cubeMany(A).agg(count[X1[A]]()).collect().run().toVector.sortBy(_._2)
      val expected = dataset.dataset.cube("a").count().collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[Long](1))).sortBy(_._2)

      received ?= expected
    }

    check(forAll(prop[Int, Long] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/ops/PivotTest.scala
================================================
package frameless
package ops

import frameless.functions.aggregate._
import org.apache.spark.sql.{functions => sparkFunctions}
import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Prop._
import org.scalacheck.{Gen, Prop}

class PivotTest extends TypedDatasetSuite {
  def withCustomGenX4: Gen[Vector[X4[String, String, Int, Boolean]]] = {
    val kvPairGen: Gen[X4[String, String, Int, Boolean]] = for {
      a <- Gen.oneOf(Seq("1", "2", "3", "4"))
      b <- Gen.oneOf(Seq("a", "b", "c"))
      c <- arbitrary[Int]
      d <- arbitrary[Boolean]
    } yield X4(a, b, c, d)

    Gen.listOfN(4, kvPairGen).map(_.toVector)
  }

  test("X4[Boolean, String, Int, Boolean] pivot on String") {
    def prop(data: Vector[X4[String, String, Int, Boolean]]): Prop = {
      val d = TypedDataset.create(data)
      val frameless = d.groupBy(d('a)).
        pivot(d('b)).on("a", "b", "c").
        agg(sum(d('c)), first(d('d))).collect().run().toVector

      val spark = d.dataset.groupBy("a")
        .pivot("b", Seq("a", "b", "c"))
        .agg(sparkFunctions.sum("c"), sparkFunctions.first("d")).collect().toVector

      (frameless.map(_._1) ?= spark.map(x => x.getAs[String](0))).&&(
        frameless.map(_._2) ?= spark.map(x => Option(x.getAs[Long](1)))).&&(
        frameless.map(_._3) ?= spark.map(x => Option(x.getAs[Boolean](2)))).&&(
        frameless.map(_._4) ?= spark.map(x => Option(x.getAs[Long](3)))).&&(
        frameless.map(_._5) ?= spark.map(x => Option(x.getAs[Boolean](4)))).&&(
        frameless.map(_._6) ?= spark.map(x => Option(x.getAs[Long](5)))).&&(
        frameless.map(_._7) ?= spark.map(x => Option(x.getAs[Boolean](6))))
    }

    check(forAll(withCustomGenX4)(prop))
  }

  test("Pivot on Boolean") {
    val x: Seq[X3[String, Boolean, Boolean]] = Seq(X3("a", true, true), X3("a", true, true), X3("a", true, false))
    val d = TypedDataset.create(x)
    d.groupByMany(d('a)).
      pivot(d('c)).on(true, false).
      agg(count[X3[String, Boolean, Boolean]]()).
      collect().run().toVector ?= Vector(("a", Some(2L), Some(1L))) // two true one false
  }

  test("Pivot with groupBy on two columns, pivot on Long") {
    val x: Seq[X3[String, String, Long]] = Seq(X3("a", "x", 1), X3("a", "x", 1), X3("a", "c", 20))
    val d = TypedDataset.create(x)
    d.groupBy(d('a), d('b)).
      pivot(d('c)).on(1L, 20L).
      agg(count[X3[String, String, Long]]()).
      collect().run().toSet ?= Set(("a", "x", Some(2L), None), ("a", "c", None, Some(1L)))
  }

  test("Pivot with cube on two columns, pivot on Long") {
    val x: Seq[X3[String, String, Long]] = Seq(X3("a", "x", 1), X3("a", "x", 1), X3("a", "c", 20))
    val d = TypedDataset.create(x)
    d.cube(d('a), d('b))
      .pivot(d('c)).on(1L, 20L)
      .agg(count[X3[String, String, Long]]())
      .collect().run().toSet ?= Set(("a", "x", Some(2L), None), ("a", "c", None, Some(1L)))
  }

  test("Pivot with cube on Boolean") {
    val x: Seq[X3[String, Boolean, Boolean]] = Seq(X3("a", true, true), X3("a", true, true), X3("a", true, false))
    val d = TypedDataset.create(x)
    d.cube(d('a)).
      pivot(d('c)).on(true, false).
      agg(count[X3[String, Boolean, Boolean]]()).
      collect().run().toVector ?= Vector(("a", Some(2L), Some(1L)))
  }

  test("Pivot with rollup on two columns, pivot on Long") {
    val x: Seq[X3[String, String, Long]] = Seq(X3("a", "x", 1), X3("a", "x", 1), X3("a", "c", 20))
    val d = TypedDataset.create(x)
    d.rollup(d('a), d('b))
      .pivot(d('c)).on(1L, 20L)
      .agg(count[X3[String, String, Long]]())
      .collect().run().toSet ?= Set(("a", "x", Some(2L), None), ("a", "c", None, Some(1L)))
  }

  test("Pivot with rollup on Boolean") {
    val x: Seq[X3[String, Boolean, Boolean]] = Seq(X3("a", true, true), X3("a", true, true), X3("a", true, false))
    val d = TypedDataset.create(x)
    d.rollupMany(d('a)).
      pivot(d('c)).on(true, false).
      agg(count[X3[String, Boolean, Boolean]]()).
      collect().run().toVector ?= Vector(("a", Some(2L), Some(1L)))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/ops/RepeatTest.scala
================================================
package frameless
package ops

import shapeless.test.illTyped
import shapeless.{::, HNil, Nat}

class RepeatTest extends TypedDatasetSuite {
  test("summoning with implicitly") {
    implicitly[Repeat.Aux[Int::Boolean::HNil, Nat._1, Int::Boolean::HNil]]
    implicitly[Repeat.Aux[Int::Boolean::HNil, Nat._2, Int::Boolean::Int::Boolean::HNil]]
    implicitly[Repeat.Aux[Int::Boolean::HNil, Nat._3, Int::Boolean::Int::Boolean::Int::Boolean::HNil]]
    implicitly[Repeat.Aux[String::HNil, Nat._5, String::String::String::String::String::HNil]]
  }

  test("ill typed") {
    illTyped("""implicitly[Repeat.Aux[String::HNil, Nat._5, String::String::String::String::HNil]]""")
  }
}

================================================
FILE: dataset/src/test/scala/frameless/ops/RollupTests.scala
================================================
package frameless
package ops

import frameless.functions.aggregate._
import org.scalacheck.Prop
import org.scalacheck.Prop._

class RollupTests extends TypedDatasetSuite {

  test("rollup('a).agg(count())") {
    def prop[A: TypedEncoder : Ordering, Out: TypedEncoder : Numeric]
    (data: List[X1[A]])(implicit summable: CatalystSummable[A, Out]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val received = dataset.rollup(A).agg(count()).collect().run().toVector.sortBy(_._2)
      val expected = dataset.dataset.rollup("a").count().collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[Long](1))).sortBy(_._2)

      received ?= expected
    }

    check(forAll(prop[Int, Long] _))
  }

  test("rollup('a, 'b).agg(count())") {
    def prop[A: TypedEncoder : Ordering, B: TypedEncoder, Out: TypedEncoder : Numeric]
    (data: List[X2[A, B]])(implicit summable: CatalystSummable[B, Out]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val received = dataset.rollup(A, B).agg(count()).collect().run().toVector.sortBy(_._3)
      val expected = dataset.dataset.rollup("a", "b").count().collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[Long](2))).sortBy(_._3)

      received ?= expected
    }

    check(forAll(prop[Int, Long, Long] _))
  }

  test("rollup('a).agg(sum('b)") {
    def prop[A: TypedEncoder : Ordering, B: TypedEncoder, Out: TypedEncoder : Numeric]
    (data: List[X2[A, B]])(implicit summable: CatalystSummable[B, Out]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val received = dataset.rollup(A).agg(sum(B)).collect().run().toVector.sortBy(_._2)
      val expected = dataset.dataset.rollup("a").sum("b").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[Out](1))).sortBy(_._2)

      received ?= expected
    }

    check(forAll(prop[Int, Long, Long] _))
  }

  test("rollup('a).mapGroups('a, sum('b))") {
    def prop[A: TypedEncoder : Ordering, B: TypedEncoder : Numeric]
    (data: List[X2[A, B]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val received = dataset.rollup(A)
        .deserialized.mapGroups { case (a, xs) => (a, xs.map(_.b).sum) }
        .collect().run().toVector.sortBy(_._1)
      val expected = data.groupBy(_.a).mapValues(_.map(_.b).sum).toVector.sortBy(_._1)

      received ?= expected
    }

    check(forAll(prop[Int, Long] _))
  }

  test("rollup('a).agg(sum('b), sum('c)) to rollup('a).agg(sum('a), sum('b), sum('a), sum('b), sum('a))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder,
    C: TypedEncoder,
    OutB: TypedEncoder : Numeric,
    OutC: TypedEncoder : Numeric
    ](data: List[X3[A, B, C]])(
      implicit
      summableB: CatalystSummable[B, OutB],
      summableC: CatalystSummable[C, OutC]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val framelessSumBC = dataset
        .rollup(A)
        .agg(sum(B), sum(C))
        .collect().run().toVector.sortBy(_._1)

      val sparkSumBC = dataset.dataset.rollup("a").sum("b", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2)))
        .sortBy(_._1)

      val framelessSumBCB = dataset
        .rollup(A)
        .agg(sum(B), sum(C), sum(B))
        .collect().run().toVector.sortBy(_._1)

      val sparkSumBCB = dataset.dataset.rollup("a").sum("b", "c", "b").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2), row.getAs[OutB](3)))
        .sortBy(_._1)

      val framelessSumBCBC = dataset
        .rollup(A)
        .agg(sum(B), sum(C), sum(B), sum(C))
        .collect().run().toVector.sortBy(_._1)

      val sparkSumBCBC = dataset.dataset.rollup("a").sum("b", "c", "b", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2), row.getAs[OutB](3), row.getAs[OutC](4)))
        .sortBy(_._1)

      val framelessSumBCBCB = dataset
        .rollup(A)
        .agg(sum(B), sum(C), sum(B), sum(C), sum(B))
        .collect().run().toVector.sortBy(_._1)

      val sparkSumBCBCB = dataset.dataset.rollup("a").sum("b", "c", "b", "c", "b").collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[OutB](1), row.getAs[OutC](2), row.getAs[OutB](3), row.getAs[OutC](4), row.getAs[OutB](5)))
        .sortBy(_._1)

      (framelessSumBC ?= sparkSumBC)
        .&&(framelessSumBCB ?= sparkSumBCB)
        .&&(framelessSumBCBC ?= sparkSumBCBC)
        .&&(framelessSumBCBCB ?= sparkSumBCBCB)
    }

    check(forAll(prop[String, Long, Double, Long, Double] _))
  }

  test("rollup('a, 'b).agg(sum('c), sum('d))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder,
    D: TypedEncoder,
    OutC: TypedEncoder : Numeric,
    OutD: TypedEncoder : Numeric
    ](data: List[X4[A, B, C, D]])(
      implicit
      summableC: CatalystSummable[C, OutC],
      summableD: CatalystSummable[D, OutD]
    ): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)
      val D = dataset.col[D]('d)

      val framelessSumByAB = dataset
        .rollup(A, B)
        .agg(sum(C), sum(D))
        .collect().run().toVector.sortBy(_._2)

      val sparkSumByAB = dataset.dataset
        .rollup("a", "b").sum("c", "d").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutD](3)))
        .sortBy(_._2)

      framelessSumByAB ?= sparkSumByAB
    }

    check(forAll(prop[Byte, Int, Long, Double, Long, Double] _))
  }

  test("rollup('a, 'b).agg(sum('c)) to rollup('a, 'b).agg(sum('c),sum('c),sum('c),sum('c),sum('c))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder,
    OutC: TypedEncoder: Numeric
    ](data: List[X3[A, B, C]])(implicit summableC: CatalystSummable[C, OutC]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)
      val C = dataset.col[C]('c)

      val framelessSumC = dataset
        .rollup(A, B)
        .agg(sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumC = dataset.dataset
        .rollup("a", "b").sum("c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2)))
        .sortBy(_._2)

      val framelessSumCC = dataset
        .rollup(A, B)
        .agg(sum(C), sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumCC = dataset.dataset
        .rollup("a", "b").sum("c", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3)))
        .sortBy(_._2)

      val framelessSumCCC = dataset
        .rollup(A, B)
        .agg(sum(C), sum(C), sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumCCC = dataset.dataset
        .rollup("a", "b").sum("c", "c", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3), row.getAs[OutC](4)))
        .sortBy(_._2)

      val framelessSumCCCC = dataset
        .rollup(A, B)
        .agg(sum(C), sum(C), sum(C), sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumCCCC = dataset.dataset
        .rollup("a", "b").sum("c", "c", "c", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3), row.getAs[OutC](4), row.getAs[OutC](5)))
        .sortBy(_._2)

      val framelessSumCCCCC = dataset
        .rollup(A, B)
        .agg(sum(C), sum(C), sum(C), sum(C), sum(C))
        .collect().run().toVector
        .sortBy(_._2)

      val sparkSumCCCCC = dataset.dataset
        .rollup("a", "b").sum("c", "c", "c", "c", "c").collect().toVector
        .map(row => (Option(row.getAs[A](0)), Option(row.getAs[B](1)), row.getAs[OutC](2), row.getAs[OutC](3), row.getAs[OutC](4), row.getAs[OutC](5), row.getAs[OutC](6)))
        .sortBy(_._2)

      (framelessSumC ?= sparkSumC) &&
        (framelessSumCC ?= sparkSumCC) &&
        (framelessSumCCC ?= sparkSumCCC) &&
        (framelessSumCCCC ?= sparkSumCCCC) &&
        (framelessSumCCCCC ?= sparkSumCCCCC)
    }

    check(forAll(prop[String, Long, Double, Double] _))
  }

  test("rollup('a, 'b).mapGroups('a, 'b, sum('c))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder : Numeric
    ](data: List[X3[A, B, C]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)
      val B = dataset.col[B]('b)

      val framelessSumByAB = dataset
        .rollup(A, B)
        .deserialized.mapGroups { case ((a, b), xs) => (a, b, xs.map(_.c).sum) }
        .collect().run().toVector.sortBy(x => (x._1, x._2))

      val sumByAB = data.groupBy(x => (x.a, x.b))
        .mapValues { xs => xs.map(_.c).sum }
        .toVector.map { case ((a, b), c) => (a, b, c) }.sortBy(x => (x._1, x._2))

      framelessSumByAB ?= sumByAB
    }

    check(forAll(prop[Byte, Int, Long] _))
  }

  test("rollup('a).mapGroups(('a, toVector(('a, 'b))") {
    def prop[
    A: TypedEncoder: Ordering,
    B: TypedEncoder: Ordering
    ](data: Vector[X2[A, B]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val datasetGrouped = dataset
        .rollup(A)
        .deserialized.mapGroups((a, xs) => (a, xs.toVector.sorted))
        .collect().run().toMap

      val dataGrouped = data.groupBy(_.a).map { case (k, v) => k -> v.sorted }

      datasetGrouped ?= dataGrouped
    }

    check(forAll(prop[Short, Option[Short]] _))
    check(forAll(prop[Option[Short], Short] _))
    check(forAll(prop[X1[Option[Short]], Short] _))
  }

  test("rollup('a).flatMapGroups(('a, toVector(('a, 'b))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering
    ](data: Vector[X2[A, B]]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val datasetGrouped = dataset
        .rollup(A)
        .deserialized.flatMapGroups((a, xs) => xs.map(x => (a, x)))
        .collect().run()
        .sorted

      val dataGrouped = data
        .groupBy(_.a).toSeq
        .flatMap { case (a, xs) => xs.map(x => (a, x)) }
        .sorted

      datasetGrouped ?= dataGrouped
    }

    check(forAll(prop[Short, Option[Short]] _))
    check(forAll(prop[Option[Short], Short] _))
    check(forAll(prop[X1[Option[Short]], Short] _))
  }

  test("rollup('a, 'b).flatMapGroups((('a,'b) toVector((('a,'b), 'c))") {
    def prop[
    A: TypedEncoder : Ordering,
    B: TypedEncoder : Ordering,
    C: TypedEncoder : Ordering
    ](data: Vector[X3[A, B, C]]): Prop = {
      val dataset = TypedDataset.create(data)
      val cA = dataset.col[A]('a)
      val cB = dataset.col[B]('b)

      val datasetGrouped = dataset
        .rollup(cA, cB)
        .deserialized.flatMapGroups((a, xs) => xs.map(x => (a, x)))
        .collect().run()
        .sorted

      val dataGrouped = data
        .groupBy(t => (t.a, t.b)).toSeq
        .flatMap { case (a, xs) => xs.map(x => (a, x)) }
        .sorted

      datasetGrouped ?= dataGrouped
    }

    check(forAll(prop[Short, Option[Short], Long] _))
    check(forAll(prop[Option[Short], Short, Int] _))
    check(forAll(prop[X1[Option[Short]], Short, Byte] _))
  }

  test("rollupMany('a).agg(sum('b))") {
    def prop[A: TypedEncoder : Ordering, Out: TypedEncoder : Numeric]
    (data: List[X1[A]])(implicit summable: CatalystSummable[A, Out]): Prop = {
      val dataset = TypedDataset.create(data)
      val A = dataset.col[A]('a)

      val received = dataset.rollupMany(A).agg(count[X1[A]]()).collect().run().toVector.sortBy(_._2)
      val expected = dataset.dataset.rollup("a").count().collect().toVector
        .map(row => (Option(row.getAs[A](0)), row.getAs[Long](1))).sortBy(_._2)

      received ?= expected
    }

    check(forAll(prop[Int, Long] _))
  }
}

================================================
FILE: dataset/src/test/scala/frameless/ops/SmartProjectTest.scala
================================================
package frameless
package ops

import org.scalacheck.Prop
import org.scalacheck.Prop._
import shapeless.test.illTyped


case class Foo(i: Int, j: Int, x: String)
case class Bar(i: Int, x: String)
case class InvalidFooProjectionType(i: Int, x: Boolean)
case class InvalidFooProjectionName(i: Int, xerr: String)

class SmartProjectTest extends TypedDatasetSuite {
  // Lazy needed to prevent initialization anterior to the `beforeAll` hook
  lazy val dataset = TypedDataset.create(Foo(1, 2, "hi") :: Foo(2, 3, "there") :: Nil)

  test("project Foo to Bar") {
    assert(dataset.project[Bar].count().run() === 2)
  }

  test("project to InvalidFooProjection should not type check") {
    illTyped("dataset.project[InvalidFooProjectionType]")
    illTyped("dataset.project[InvalidFooProjectionName]")
  }

  test("X4 to X1,X2,X3,X4 projections") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder, D: TypedEncoder](data: Vector[X4[A, B, C, D]]): Prop = {
      val dataset = TypedDataset.create(data)

      dataset.project[X4[A, B, C, D]].collect().run().toVector ?= data
      dataset.project[X3[A, B, C]].collect().run().toVector ?= data.map(x => X3(x.a, x.b, x.c))
      dataset.project[X2[A, B]].collect().run().toVector ?= data.map(x => X2(x.a, x.b))
      dataset.project[X1[A]].collect().run().toVector ?= data.map(x => X1(x.a))
    }

    check(forAll(prop[Int, String, X1[String], Boolean] _))
    check(forAll(prop[Short, Long, String, Boolean] _))
    check(forAll(prop[Short, (Boolean, Boolean), String, (Int, Int)] _))
    check(forAll(prop[X2[String, Boolean], (Boolean, Boolean), String, Boolean] _))
    check(forAll(prop[X2[String, Boolean], X3[Boolean, Boolean, Long], String, String] _))
  }

  test("X3U to X1,X2,X3 projections") {
    def prop[A: TypedEncoder, B: TypedEncoder, C: TypedEncoder](data: Vector[X3U[A, B, C]]): Prop = {
      val dataset = TypedDataset.create(data)

      dataset.project[X3[A, B, C]].collect().run().toVector ?= data.map(x => X3(x.a, x.b, x.c))
      dataset.project[X2[A, B]].collect().run().toVector ?= data.map(x => X2(x.a, x.b))
      dataset.project[X1[A]].collect().run().toVector ?= data.map(x => X1(x.a))
    }

    check(forAll(prop[Int, String, X1[String]] _))
    check(forAll(prop[Short, Long, String] _))
    check(forAll(prop[Short, (Boolean, Boolean), String] _))
    check(forAll(prop[X2[String, Boolean], (Boolean, Boolean), String] _))
    check(forAll(prop[X2[String, Boolean], X3[Boolean, Boolean, Long], String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/ops/deserialized/FilterTests.scala
================================================
package frameless
package ops
package deserialized

import org.scalacheck.Prop
import org.scalacheck.Prop._

class FilterTests extends TypedDatasetSuite {
  test("filter") {
    def prop[A: TypedEncoder](filterFunction: A => Boolean, data: Vector[A]): Prop =
      TypedDataset.create(data).
        deserialized.
        filter(filterFunction).
        collect().run().toVector =? data.filter(filterFunction)

    check(forAll(prop[Int] _))
    check(forAll(prop[String] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/ops/deserialized/FlatMapTests.scala
================================================
package frameless
package ops
package deserialized

import org.scalacheck.Prop
import org.scalacheck.Prop._

class FlatMapTests extends TypedDatasetSuite {
  test("flatMap") {
    def prop[A: TypedEncoder, B: TypedEncoder](flatMapFunction: A => Vector[B], data: Vector[A]): Prop =
      TypedDataset.create(data).
        deserialized.
        flatMap(flatMapFunction).
        collect().run().toVector =? data.flatMap(flatMapFunction)

    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Int, String] _))
    check(forAll(prop[String, Int] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/ops/deserialized/MapPartitionsTests.scala
================================================
package frameless
package ops
package deserialized

import org.scalacheck.Prop
import org.scalacheck.Prop._

class MapPartitionsTests extends TypedDatasetSuite {
  test("mapPartitions") {
    def prop[A: TypedEncoder, B: TypedEncoder](mapFunction: A => B, data: Vector[A]): Prop = {
      val lifted: Iterator[A] => Iterator[B] = _.map(mapFunction)
      TypedDataset.create(data).
        deserialized.
        mapPartitions(lifted).
        collect().run().toVector =? data.map(mapFunction)
    }

    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Int, String] _))
    check(forAll(prop[String, Int] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/ops/deserialized/MapTests.scala
================================================
package frameless
package ops
package deserialized

import org.scalacheck.Prop
import org.scalacheck.Prop._

class MapTests extends TypedDatasetSuite {
  test("map") {
    def prop[A: TypedEncoder, B: TypedEncoder](mapFunction: A => B, data: Vector[A]): Prop =
      TypedDataset.create(data).
        deserialized.
        map(mapFunction).
        collect().run().toVector =? data.map(mapFunction)

    check(forAll(prop[Int, Int] _))
    check(forAll(prop[Int, String] _))
    check(forAll(prop[String, Int] _))
    check(forAll(prop[X1[Int], X1[Int]] _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/ops/deserialized/ReduceTests.scala
================================================
package frameless
package ops
package deserialized

import org.scalacheck.Prop
import org.scalacheck.Prop._

class ReduceTests extends TypedDatasetSuite {
  def prop[A: TypedEncoder](reduceFunction: (A, A) => A)(data: Vector[A]): Prop =
    TypedDataset.create(data).
      deserialized.
      reduceOption(reduceFunction).run() =? data.reduceOption(reduceFunction)

  test("reduce Int") {
    check(forAll(prop[Int](_ + _) _))
    check(forAll(prop[Int](_ * _) _))
  }

  test("reduce String") {
    def reduce(s1: String, s2: String): String = (s1 ++ s2).sorted
    check(forAll(prop[String](reduce) _))
  }
}


================================================
FILE: dataset/src/test/scala/frameless/package.scala
================================================
import java.time.format.DateTimeFormatter
import java.time.{LocalDateTime => JavaLocalDateTime}

import org.scalacheck.{Arbitrary, Gen}

package object frameless {
  /** Fixed decimal point to avoid precision problems specific to Spark */
  implicit val arbBigDecimal: Arbitrary[BigDecimal] = Arbitrary {
    for {
      x <- Gen.chooseNum(-1000, 1000)
      y <- Gen.chooseNum(0, 1000000)
    } yield BigDecimal(s"$x.$y")
  }

  /** Fixed decimal point to avoid precision problems specific to Spark */
  implicit val arbDouble: Arbitrary[Double] = Arbitrary {
    arbBigDecimal.arbitrary.map(_.toDouble)
  }

  implicit val arbSqlDate = Arbitrary {
    Arbitrary.arbitrary[Int].map(SQLDate)
  }

  implicit val arbSqlTimestamp = Arbitrary {
    Arbitrary.arbitrary[Long].map(SQLTimestamp)
  }

  implicit def arbTuple1[A: Arbitrary] = Arbitrary {
    Arbitrary.arbitrary[A].map(Tuple1(_))
  }

  // see issue with scalacheck non serializable Vector: https://github.com/rickynils/scalacheck/issues/315
  implicit def arbVector[A](implicit A: Arbitrary[A]): Arbitrary[Vector[A]] =
    Arbitrary(Gen.listOf(A.arbitrary).map(_.toVector))

  def vectorGen[A: Arbitrary]: Gen[Vector[A]] = arbVector[A].arbitrary

  implicit val arbUdtEncodedClass: Arbitrary[UdtEncodedClass] = Arbitrary {
    for {
      int <- Arbitrary.arbitrary[Int]
      doubles <- Gen.listOf(arbDouble.arbitrary)
    } yield new UdtEncodedClass(int, doubles.toArray)
  }

  val dateTimeFormatter: DateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm")

  implicit val localDateArb: Arbitrary[JavaLocalDateTime] = Arbitrary {
    for {
      year <- Gen.chooseNum(1900, 2027)
      month <- Gen.chooseNum(1, 12)
      dayOfMonth <- Gen.chooseNum(1, 28)
      hour <- Gen.chooseNum(1, 23)
      minute <- Gen.chooseNum(1, 59)
    } yield JavaLocalDateTime.of(year, month, dayOfMonth, hour, minute)
  }

  /** LocalDateTime String Generator to test time related Spark functions */
  val dateTimeStringGen: Gen[List[String]] =
    for {
      listOfDates <- Gen.listOf(localDateArb.arbitrary)
      localDate <- listOfDates
    } yield localDate.format(dateTimeFormatter)

  val TEST_OUTPUT_DIR = "target/test-output"

  /**
   * Will dive down causes until either the cause is true or there are no more causes
   * @param t
   * @param f
   * @return
   */
  def anyCauseHas(t: Throwable, f: Throwable => Boolean): Boolean =
    if (f(t))
      true
    else
      if (t.getCause ne null)
        anyCauseHas(t.getCause, f)
      else
        false

  /**
   * Runs up to maxRuns and outputs the number of failures (times thrown)
   * @param maxRuns
   * @param thunk
   * @tparam T
   * @return the last passing thunk, or null
   */
  def runLoads[T](maxRuns: Int = 1000)(thunk: => T): T ={
    var i = 0
    var r = null.asInstanceOf[T]
    var passed = 0
    while(i < maxRuns){
      i += 1
      try {
        r = thunk
        passed += 1
        if (i % 20 == 0) {
          println(s"run $i successful")
        }
      } catch {
        case t: Throwable => System.err.println(s"failed unexpectedly on run $i - ${t.getMessage}")
      }
    }
    if (passed != maxRuns) {
      System.err.println(s"had ${maxRuns - passed} failures out of $maxRuns runs")
    }
    r
  }

    /**
   * Runs a given thunk up to maxRuns times, restarting the thunk if tolerantOf the thrown Throwable is true
   * @param tolerantOf
   * @param maxRuns default of 20
   * @param thunk
   * @return either a successful run result or the last error will be thrown
   */
  def tolerantRun[T](tolerantOf: Throwable => Boolean, maxRuns: Int = 20)(thunk: => T): T ={
    var passed = false
    var i = 0
    var res: T = null.asInstanceOf[T]
    var thrown: Throwable = null

    while((i < maxRuns) && !passed) {
      try {
        i += 1
        res = thunk
        passed = true
      } catch {
        case t: Throwable if anyCauseHas(t, tolerantOf) =>
          // rinse and repeat
          thrown = t
        case t: Throwable =>
          throw t
      }
    }
    if (!passed) {
      System.err.println(s"Despite being tolerant each of the $maxRuns runs failed, re-throwing the last")
      throw thrown
    }
    res
  }
}


================================================
FILE: dataset/src/test/scala/frameless/sql/package.scala
================================================
package frameless

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.{And, Or}

package object sql {
  implicit class ExpressionOps(val self: Expression) extends AnyVal {
    def toList: List[Expression] = {
      def rec(expr: Expression, acc: List[Expression]): List[Expression] = {
        expr match {
          case And(left, right) => rec(left, rec(right, acc))
          case Or(left, right) => rec(left, rec(right, acc))
          case e => e +: acc
        }
      }

      rec(self, Nil)
    }
  }
}


================================================
FILE: dataset/src/test/scala/frameless/sql/rules/SQLRulesSuite.scala
================================================
package frameless.sql.rules

import frameless._
import frameless.sql._
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.execution.FileSourceScanExec
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec
import org.scalatest.Assertion
import org.scalatest.matchers.should.Matchers

trait SQLRulesSuite extends TypedDatasetSuite with Matchers { self =>
  protected lazy val path: String = {
    val tmpDir = System.getProperty("java.io.tmpdir")
    s"$tmpDir/${self.getClass.getName}"
  }

  def withDataset[A: TypedEncoder: CatalystOrdered](payload: A)(f: TypedDataset[A] => Assertion): Assertion = {
    TypedDataset.create(Seq(payload)).write.mode("overwrite").parquet(path)
    f(TypedDataset.createUnsafe[A](session.read.parquet(path)))
  }

  def predicatePushDownTest[A: TypedEncoder: CatalystOrdered](
    expected: X1[A],
    expectedPushDownFilters: List[Filter],
    planShouldNotContain: PartialFunction[Expression, Expression],
    op: TypedColumn[X1[A], A] => TypedColumn[X1[A], Boolean]
  ): Assertion = {
    withDataset(expected) { dataset =>
      val ds = dataset.filter(op(dataset('a)))
      val actualPushDownFilters = pushDownFilters(ds)

      val optimizedPlan = ds.queryExecution.optimizedPlan.collect { case logical.Filter(condition, _) => condition }.flatMap(_.toList)

      // check the optimized plan
      optimizedPlan.collectFirst(planShouldNotContain) should be (empty)

      // compare filters
      actualPushDownFilters shouldBe expectedPushDownFilters

      val actual = ds.collect().run().toVector.headOption

      // ensure serialization is not broken
      actual should be(Some(expected))
    }
  }

  protected def pushDownFilters[T](ds: TypedDataset[T]): List[Filter] = {
    val sparkPlan = ds.queryExecution.executedPlan

    val initialPlan =
      if (sparkPlan.children.isEmpty) // assume it's AQE
        sparkPlan match {
          case aq: AdaptiveSparkPlanExec => aq.initialPlan
          case _ => sparkPlan
        }
      else
        sparkPlan

    initialPlan.collect {
      case fs: FileSourceScanExec =>
        import scala.reflect.runtime.{universe => ru}

        val runtimeMirror = ru.runtimeMirror(getClass.getClassLoader)
        val instanceMirror = runtimeMirror.reflect(fs)
        val getter = ru.typeOf[FileSourceScanExec].member(ru.TermName("pushedDownFilters")).asTerm.getter
        val m = instanceMirror.reflectMethod(getter.asMethod)
        val res = m.apply(fs).asInstanceOf[Seq[Filter]]

        res
    }.flatten.toList
  }
}


================================================
FILE: dataset/src/test/scala/frameless/syntax/FramelessSyntaxTests.scala
================================================
package frameless
package syntax

import org.scalacheck.Prop
import org.scalacheck.Prop._
import frameless.functions.aggregate._

class FramelessSyntaxTests extends TypedDatasetSuite {
  // Hide the implicit SparkDelay[Job] on TypedDatasetSuite to avoid ambiguous implicits
  override val sparkDelay = null

  def prop[A, B](data: Vector[X2[A, B]])(
    implicit ev: TypedEncoder[X2[A, B]]
  ): Prop = {
    val dataset = TypedDataset.create(data).dataset
    val dataframe = dataset.toDF()

    val typedDataset = dataset.typed
    val typedDatasetFromDataFrame = dataframe.unsafeTyped[X2[A, B]]

    typedDataset.collect().run().toVector ?= typedDatasetFromDataFrame.collect().run().toVector
  }

  test("dataset typed - toTyped") {
    def prop[A, B](data: Vector[X2[A, B]])(
      implicit ev: TypedEncoder[X2[A, B]]
    ): Prop = {
      val dataset = session.createDataset(data)(TypedExpressionEncoder(ev)).typed
      val dataframe = dataset.toDF()

      dataset.collect().run().toVector ?= dataframe.unsafeTyped[X2[A, B]].collect().run().toVector
    }

    check(forAll(prop[Int, String] _))
    check(forAll(prop[X1[Long], String] _))
  }

  test("frameless typed column and aggregate") {
    def prop[A: TypedEncoder](a: A, b: A): Prop = {
      val d = TypedDataset.create((a, b) :: Nil)
      (d.select(d('_1).untyped.typedColumn).collect().run ?= d.select(d('_1)).collect().run).&&(
        d.agg(first(d('_1))).collect().run() ?= d.agg(first(d('_1)).untyped.typedAggregate).collect().run()
      )
    }

    check(forAll(prop[Int] _))
    check(forAll(prop[X1[Long]] _))
  }
}


================================================
FILE: dataset/src/test/scala/org/apache/hadoop/fs/local/StreamingFS.scala
================================================
package org.apache.hadoop.fs.local

import com.globalmentor.apache.hadoop.fs.BareLocalFileSystem
import org.apache.hadoop.fs.DelegateToFileSystem

class StreamingFS(uri: java.net.URI, conf: org.apache.hadoop.conf.Configuration) extends
  DelegateToFileSystem(uri, new BareLocalFileSystem(), conf, "file", false) {}


================================================
FILE: dataset/src/test/spark-3.2/frameless/sql/rules/FramelessLitPushDownTests.scala
================================================
package frameless.sql.rules

import frameless._
import frameless.sql._
import frameless.functions.Lit
import org.apache.spark.sql.catalyst.util.DateTimeUtils.{currentTimestamp, microsToInstant}
import org.apache.spark.sql.sources.{Filter, IsNotNull}
import org.apache.spark.sql.catalyst.expressions
import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, GenericRowWithSchema}
import java.time.Instant

import org.apache.spark.sql.catalyst.plans.logical
import org.scalatest.Assertion

//Note as InvokeLike and "ConditionalExpression" don't have SPARK-40380 and SPARK-39106 no predicate pushdowns can happen in 3.2.4
class FramelessLitPushDownTests extends SQLRulesSuite {
  private val now: Long = currentTimestamp()

  test("java.sql.Timestamp push-down") {
    val expected = java.sql.Timestamp.from(microsToInstant(now))
    val expectedStructure = X1(SQLTimestamp(now))
    val expectedPushDownFilters = List(IsNotNull("a"))

    predicatePushDownTest[SQLTimestamp](
      expectedStructure,
      expectedPushDownFilters,
      { case e @ expressions.GreaterThanOrEqual(_, _: Lit[_]) => e },
      _ >= expectedStructure.a
    )
  }

  test("java.time.Instant push-down") {
    val expected = java.sql.Timestamp.from(microsToInstant(now))
    val expectedStructure = X1(microsToInstant(now))
    val expectedPushDownFilters = List(IsNotNull("a"))

    predicatePushDownTest[Instant](
      expectedStructure,
      expectedPushDownFilters,
      { case e @ expressions.GreaterThanOrEqual(_, _: Lit[_]) => e },
      _ >= expectedStructure.a
    )
  }

  test("struct push-down") {
    type Payload = X4[Int, Int, Int, Int]
    val expectedStructure = X1(X4(1, 2, 3, 4))
    val expected = new GenericRowWithSchema(Array(1, 2, 3, 4), TypedExpressionEncoder[Payload].schema)
    val expectedPushDownFilters = List(IsNotNull("a"))

    predicatePushDownTest[Payload](
      expectedStructure,
      expectedPushDownFilters,
      // Cast not Lit because of SPARK-40380
      { case e @ expressions.EqualTo(_, _: Cast) => e },
      _ === expectedStructure.a
    )
  }

  override def predicatePushDownTest[A: TypedEncoder: CatalystOrdered](
    expected: X1[A],
    expectedPushDownFilters: List[Filter],
    planShouldContain: PartialFunction[Expression, Expression],
    op: TypedColumn[X1[A], A] => TypedColumn[X1[A], Boolean]
  ): Assertion = {
    withDataset(expected) { dataset =>
      val ds = dataset.filter(op(dataset('a)))
      val actualPushDownFilters = pushDownFilters(ds)

      val optimizedPlan = ds.queryExecution.optimizedPlan.collect { case logical.Filter(condition, _) => condition }.flatMap(_.toList)

      // check the optimized plan
      optimizedPlan.collectFirst(planShouldContain) should not be (empty)

      // compare filters
      actualPushDownFilters shouldBe expectedPushDownFilters

      val actual = ds.collect().run().toVector.headOption

      // ensure serialization is not broken
      actual should be(Some(expected))
    }
  }

}


================================================
FILE: dataset/src/test/spark-3.3+/frameless/sql/rules/FramelessLitPushDownTests.scala
================================================
package frameless.sql.rules

import frameless._
import frameless.functions.Lit
import org.apache.spark.sql.catalyst.util.DateTimeUtils.{currentTimestamp, microsToInstant}
import org.apache.spark.sql.sources.{EqualTo, GreaterThanOrEqual, IsNotNull}
import org.apache.spark.sql.catalyst.expressions
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import java.time.Instant

class FramelessLitPushDownTests extends SQLRulesSuite {
  private val now: Long = currentTimestamp()

  test("java.sql.Timestamp push-down") {
    val expected = java.sql.Timestamp.from(microsToInstant(now))
    val expectedStructure = X1(SQLTimestamp(now))
    val expectedPushDownFilters = List(IsNotNull("a"), GreaterThanOrEqual("a", expected))

    predicatePushDownTest[SQLTimestamp](
      expectedStructure,
      expectedPushDownFilters,
      { case e @ expressions.GreaterThanOrEqual(_, _: Lit[_]) => e },
      _ >= expectedStructure.a
    )
  }

  test("java.time.Instant push-down") {
    val expected = java.sql.Timestamp.from(microsToInstant(now))
    val expectedStructure = X1(microsToInstant(now))
    val expectedPushDownFilters = List(IsNotNull("a"), GreaterThanOrEqual("a", expected))

    predicatePushDownTest[Instant](
      expectedStructure,
      expectedPushDownFilters,
      { case e @ expressions.GreaterThanOrEqual(_, _: Lit[_]) => e },
      _ >= expectedStructure.a
    )
  }

  test("struct push-down") {
    type Payload = X4[Int, Int, Int, Int]
    val expectedStructure = X1(X4(1, 2, 3, 4))
    val expected = new GenericRowWithSchema(Array(1, 2, 3, 4), TypedExpressionEncoder[Payload].schema)
    val expectedPushDownFilters = List(IsNotNull("a"), EqualTo("a", expected))

    predicatePushDownTest[Payload](
      expectedStructure,
      expectedPushDownFilters,
      { case e @ expressions.EqualTo(_, _: Lit[_]) => e },
      _ === expectedStructure.a
    )
  }
}


================================================
FILE: docs/Cats.md
================================================
# Using Cats with Frameless

```scala mdoc:invisible
import org.apache.spark.{SparkConf, SparkContext => SC}
import org.apache.spark.sql.SparkSession
import org.apache.spark.rdd.RDD

val conf: SparkConf = new SparkConf().setMaster("local[4]").setAppName("cats.bec test")
implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate()
val sc: SC = spark.sparkContext

spark.sparkContext.setLogLevel("WARN")
System.clearProperty("spark.master.port")
System.clearProperty("spark.driver.port")
System.clearProperty("spark.hostPort")
System.setProperty("spark.cleaner.ttl", "300")

import spark.implicits._

import cats.syntax.all._
import cats.effect.{IO, Sync}
import cats.data.ReaderT
```

There are two main parts to the `cats` integration offered by Frameless:
- effect suspension in `TypedDataset` using `cats-effect` and `cats-mtl`
- `RDD` enhancements using algebraic typeclasses in `cats-kernel`

All the examples below assume you have previously imported `cats.implicits` and `frameless.cats.implicits`.

*Note that you should not import `frameless.syntax._` together with `frameless.cats.implicits._`.*

```scala mdoc
import cats.syntax.all._
import frameless.cats.implicits._
```

## Effect Suspension in typed datasets

As noted in the section about `Job`, all operations on `TypedDataset` are lazy. The results of
operations that would normally block on plain Spark APIs are wrapped in a type constructor `F[_]`,
for which there exists an instance of `SparkDelay[F]`. This typeclass represents the operation of
delaying a computation and capturing an implicit `SparkSession`.

In the `cats` module, we utilize the typeclasses from `cats-effect` for abstracting over these
effect types - namely, we provide an implicit `SparkDelay` instance for all `F[_]` for which exists
an instance of `cats.effect.Sync[F]`.

This allows one to run operations on `TypedDataset` in an existing monad stack. For example, given
this pre-existing monad stack:
```scala mdoc
import frameless.TypedDataset
import cats.data.ReaderT
import cats.effect.IO
import cats.effect.implicits._

type Action[T] = ReaderT[IO, SparkSession, T]
```

We will be able to request that values from `TypedDataset` will be suspended in this stack:
```scala mdoc
val typedDs = TypedDataset.create(Seq((1, "string"), (2, "another")))
val result: Action[(Seq[(Int, String)], Long)] = for {
  sample <- typedDs.take[Action](1)
  count <- typedDs.count[Action]()
} yield (sample, count)
```

As with `Job`, note that nothing has been run yet. The effect has been properly suspended. To
run our program, we must first supply the `SparkSession` to the `ReaderT` layer and then
run the `IO` effect:
```scala mdoc
import cats.effect.unsafe.implicits.global

result.run(spark).unsafeRunSync()
```

### Convenience methods for modifying Spark thread-local variables

The `frameless.cats.implicits._` import also provides some syntax enrichments for any monad
stack that has the same capabilities as `Action` above. Namely, the ability to provide an
instance of `SparkSession` and the ability to suspend effects.

For these to work, we will need to import the implicit machinery from the `cats-mtl` library:
```scala mdoc
import cats.mtl.implicits._
```

And now, we can set the description for the computation being run:
```scala mdoc
val resultWithDescription: Action[(Seq[(Int, String)], Long)] = for {
  r <- result.withDescription("fancy cats")
  session <- ReaderT.ask[IO, SparkSession]
  _ <- ReaderT.liftF {
         IO {
           println(s"Description: ${session.sparkContext.getLocalProperty("spark.job.description")}")
         }
       }
} yield r

resultWithDescription.run(spark).unsafeRunSync()
```

## Using algebraic typeclasses from Cats with RDDs

Data aggregation is one of the most important operations when working with Spark (and data in general).
For example, we often have to compute the `min`, `max`, `avg`, etc. from a set of columns grouped by
different predicates. This section shows how **cats** simplifies these tasks in Spark by
leveraging a large collection of Type Classes for ordering and aggregating data.


Cats offers ways to sort and aggregate tuples of arbitrary arity.

```scala mdoc
import frameless.cats.implicits._

val data: RDD[(Int, Int, Int)] = sc.makeRDD((1, 2, 3) :: (1, 5, 3) :: (8, 2, 3) :: Nil)

println(data.csum)
println(data.cmax)
println(data.cmin)
```

In case the RDD is empty, the `csum`, `cmax` and `cmin` will use the default values for the type of
elements inside the RDD. There are counterpart operations to those that have an `Option` return type
to deal with the case of an empty RDD:

```scala mdoc:nest
val data: RDD[(Int, Int, Int)] = sc.emptyRDD

println(data.csum)
println(data.csumOption)
println(data.cmax)
println(data.cmaxOption)
println(data.cmin)
println(data.cminOption)
```

The following example aggregates all the elements with a common key.

```scala mdoc
type User = String
type TransactionCount = Int

val allData: RDD[(User,TransactionCount)] =
   sc.makeRDD(("Bob", 12) :: ("Joe", 1) :: ("Anna", 100) :: ("Bob", 20) :: ("Joe", 2) :: Nil)

val totalPerUser =  allData.csumByKey

totalPerUser.collectAsMap
```

The same example would work for more complex keys.

```scala mdoc
import scala.collection.immutable.SortedMap

val allDataComplexKeu =
   sc.makeRDD( ("Bob", SortedMap("task1" -> 10)) ::
    ("Joe", SortedMap("task1" -> 1, "task2" -> 3)) :: ("Bob", SortedMap("task1" -> 10, "task2" -> 1)) :: ("Joe", SortedMap("task3" -> 4)) :: Nil )

val overalTasksPerUser = allDataComplexKeu.csumByKey

overalTasksPerUser.collectAsMap
```

#### Joins

```scala mdoc
// Type aliases for meaningful types
type TimeSeries = Map[Int,Int]
type UserName = String
```

Example: Using the implicit full-outer-join operator

```scala mdoc
import frameless.cats.outer._

val day1: RDD[(UserName,TimeSeries)] = sc.makeRDD( ("John", Map(0 -> 2, 1 -> 4)) :: ("Chris", Map(0 -> 1, 1 -> 2)) :: ("Sam", Map(0 -> 1)) :: Nil )
val day2: RDD[(UserName,TimeSeries)] = sc.makeRDD( ("John", Map(0 -> 10, 1 -> 11)) :: ("Chris", Map(0 -> 1, 1 -> 2)) :: ("Joe", Map(0 -> 1, 1 -> 2)) :: Nil )

val daysCombined = day1 |+| day2

daysCombined.collect()
```

Note how the user's timeseries from different days have been aggregated together.
The `|+|` (Semigroup) operator for key-value pair RDD will execute a full-outer-join
on the key and combine values using the default Semigroup for the value type.

In `cats`:

```scala mdoc
Map(1 -> 2, 2 -> 3) |+| Map(1 -> 4, 2 -> -1)
```

```scala mdoc:invisible
spark.stop()
```


================================================
FILE: docs/FeatureOverview.md
================================================
# TypedDataset: Feature Overview

This tutorial introduces `TypedDataset` using a simple example.
The following imports are needed to make all code examples compile.

```scala mdoc:silent:reset-object
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import frameless.functions.aggregate._
import frameless.TypedDataset

val conf = new SparkConf().setMaster("local[*]").setAppName("Frameless repl").set("spark.ui.enabled", "false")
implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

import spark.implicits._
```

## Creating TypedDataset instances

We start by defining a case class:

```scala mdoc:silent
case class Apartment(city: String, surface: Int, price: Double, bedrooms: Int)
```

And few `Apartment` instances:

```scala mdoc:silent
val apartments = Seq(
  Apartment("Paris", 50,  300000.0, 2),
  Apartment("Paris", 100, 450000.0, 3),
  Apartment("Paris", 25,  250000.0, 1),
  Apartment("Lyon",  83,  200000.0, 2),
  Apartment("Lyon",  45,  133000.0, 1),
  Apartment("Nice",  74,  325000.0, 3)
)
```

We are now ready to instantiate a `TypedDataset[Apartment]`:

```scala mdoc
val aptTypedDs = TypedDataset.create(apartments)
```

We can also create one from an existing Spark `Dataset`:

```scala mdoc:nest
val aptDs = spark.createDataset(apartments)
val aptTypedDs = TypedDataset.create(aptDs)
```

Or use the Frameless syntax:

```scala mdoc
import frameless.syntax._

val aptTypedDs2 = aptDs.typed
```

## Typesafe column referencing

This is how we select a particular column from a `TypedDataset`:

```scala mdoc
val cities: TypedDataset[String] = aptTypedDs.select(aptTypedDs('city))
```

This is completely type-safe, for instance suppose we misspell `city` as `citi`:

```scala mdoc:fail
aptTypedDs.select(aptTypedDs('citi))
```

This gets raised at compile time, whereas with the standard `Dataset` API the error appears at runtime (enjoy the stack trace):

```scala mdoc:crash
aptDs.select('citi)
```

`select()` supports arbitrary column operations:

```scala mdoc
aptTypedDs.select(aptTypedDs('surface) * 10, aptTypedDs('surface) + 2).show().run()
```

Note that unlike the standard Spark API, where some operations are lazy and some are not, **all TypedDatasets operations are lazy.**
In the above example, `show()` is lazy. It requires to apply `run()` for the `show` job to materialize.
A more detailed explanation of `Job` is given [here](Job.md).

Next we compute the price by surface unit:

```scala mdoc:fail
val priceBySurfaceUnit = aptTypedDs.select(aptTypedDs('price) / aptTypedDs('surface))
```

As the error suggests, we can't divide a `TypedColumn` of `Double` by `Int.`
For safety, in Frameless only math operations between same types is allowed:

```scala mdoc
val priceBySurfaceUnit = aptTypedDs.select(aptTypedDs('price) / aptTypedDs('surface).cast[Double])
priceBySurfaceUnit.collect().run()
```

Looks like it worked, but that `cast` seems unsafe right? Actually it is safe.
Let's try to cast a `TypedColumn` of `String` to `Double`:

```scala mdoc:fail
aptTypedDs('city).cast[Double]
```

The compile-time error tells us that to perform the cast, an evidence
(in the form of `CatalystCast[String, Double]`) must be available.
Since casting from `String` to `Double` is not allowed, this results
in a compilation error.

Check [here](https://github.com/typelevel/frameless/blob/master/core/src/main/scala/frameless/CatalystCast.scala)
for the set of available `CatalystCast.`

## Working with Optional columns

When working with real data we have to deal with imperfections, such as missing fields. Columns that may have
missing data should be represented using `Options`. For this example, let's assume that the Apartments dataset
may have missing values.  

```scala mdoc:silent
case class ApartmentOpt(city: Option[String], surface: Option[Int], price: Option[Double], bedrooms: Option[Int])
```

```scala mdoc:silent
val apartmentsOpt = Seq(
  ApartmentOpt(Some("Paris"), Some(50),  Some(300000.0), None),
  ApartmentOpt(None, None, Some(450000.0), Some(3))
)
```

```scala mdoc
val aptTypedDsOpt = TypedDataset.create(apartmentsOpt)
aptTypedDsOpt.show().run()
```

Unfortunately the syntax used above with `select()` will not work here:

```scala mdoc:fail
aptTypedDsOpt.select(aptTypedDsOpt('surface) * 10, aptTypedDsOpt('surface) + 2).show().run()
```

This is because we cannot multiple an `Option` with an `Int`. In Scala, `Option` has a `map()` method to help address
exactly this (e.g., `Some(10).map(c => c * 2)`). Frameless follows a similar convention. By applying the `opt` method on 
any `Option[X]` column you can then use `map()` to provide a function that works with the unwrapped type `X`. 
This is best shown in the example bellow:

 ```scala mdoc
 aptTypedDsOpt.select(aptTypedDsOpt('surface).opt.map(c => c * 10), aptTypedDsOpt('surface).opt.map(_ + 2)).show().run()
 ```

**Known issue**: `map()` will throw a runtime exception when the applied function includes a `udf()`. If you want to 
apply a `udf()` to an optional column, we recommend changing your `udf` to work directly with `Optional` fields. 


## Casting and projections

In the general case, `select()` returns a TypedDataset of type `TypedDataset[TupleN[...]]` (with N in `[1...10]`).
For example, if we select three columns with types `String`, `Int`, and `Boolean` the result will have type
`TypedDataset[(String, Int, Boolean)]`. 

We often want to give more expressive types to the result of our computations.
`as[T]` allows us to safely cast a `TypedDataset[U]` to another of type `TypedDataset[T]` as long
as the types in `U` and `T` align.

When the cast is valid the expression compiles:

```scala mdoc
case class UpdatedSurface(city: String, surface: Int)
val updated = aptTypedDs.select(aptTypedDs('city), aptTypedDs('surface) + 2).as[UpdatedSurface]
updated.show(2).run()
```

Next we try to cast a `(String, String)` to an `UpdatedSurface` (which has types `String`, `Int`).
The cast is not valid and the expression does not compile:

```scala mdoc:fail
aptTypedDs.select(aptTypedDs('city), aptTypedDs('city)).as[UpdatedSurface]
```

### Advanced topics with `select()`

When you `select()` a single column that has type `A`, the resulting type is `TypedDataset[A]` and 
not `TypedDataset[Tuple1[A]]`. This behavior makes working with nested schema easier (i.e., in the case 
where `A` is a complex data type) and simplifies type-checking column operations (e.g., verify that two 
columns can be added, divided, etc.). However, when `A` is scalar, say a `Long`, it makes it harder to select 
and work with the resulting `TypedDataset[Long]`. For instance, it's harder to reference this single scalar 
column using `select()`. If this becomes an issue, you can bypass this behavior by using the 
`selectMany()` method instead of `select()`. In the previous example, `selectMany()` will return
`TypedDataset[Tuple1[Long]]` and you can reference its single column using the name `_1`. 
`selectMany()` should also be used when you need to select more than 10 columns. 
`select()` has better IDE support and compiles faster than the macro based `selectMany()`, 
so prefer `select()` for the most common use cases.

When you are handed a single scalar column TypedDataset (e.g., `TypedDataset[Double]`) 
the best way to reference its single column is using the `asCol` (short for "as a column") method. 
This is best shown in the example below. We will see more usages of `asCol` later in this tutorial.  

```scala mdoc:nest
val priceBySurfaceUnit = aptTypedDs.select(aptTypedDs('price) / aptTypedDs('surface).cast[Double])
priceBySurfaceUnit.select(priceBySurfaceUnit.asCol * 2).show(2).run()
```


### Projections

We often want to work with a subset of the fields in a dataset.
Projections allow us to easily select our fields of interest
while preserving their initial names and types for extra safety.

Here is an example using the `TypedDataset[Apartment]` with an additional column:

```scala mdoc
val aptds = aptTypedDs // For shorter expressions

case class ApartmentDetails(city: String, price: Double, surface: Int, ratio: Double)
val aptWithRatio =
  aptds.select(
    aptds('city),
    aptds('price),
    aptds('surface),
    aptds('price) / aptds('surface).cast[Double]
  ).as[ApartmentDetails]
```

Suppose we only want to work with `city` and `ratio`:

```scala mdoc
case class CityInfo(city: String, ratio: Double)

val cityRatio = aptWithRatio.project[CityInfo]

cityRatio.show(2).run()
```

Suppose we only want to work with `price` and `ratio`:

```scala mdoc
case class PriceInfo(ratio: Double, price: Double)

val priceInfo = aptWithRatio.project[PriceInfo]

priceInfo.show(2).run()
```

We see that the order of the fields does not matter as long as the
names and the corresponding types agree. However, if we make a mistake in
any of the names and/or their types, then we get a compilation error.

Say we make a typo in a field name:

```scala mdoc:silent
case class PriceInfo2(ratio: Double, pricEE: Double)
```

```scala mdoc:fail
aptWithRatio.project[PriceInfo2]
```

Say we make a mistake in the corresponding type:

```scala mdoc:silent
case class PriceInfo3(ratio: Int, price: Double) // ratio should be Double
```

```scala mdoc:fail
aptWithRatio.project[PriceInfo3]
```

### Union of TypedDatasets 

Lets create a projection of our original dataset with a subset of the fields.

```scala mdoc:nest:silent
case class ApartmentShortInfo(city: String, price: Double, bedrooms: Int)

val aptTypedDs2: TypedDataset[ApartmentShortInfo] = aptTypedDs.project[ApartmentShortInfo]
```

The union of `aptTypedDs2` with `aptTypedDs` uses all the fields of the caller (`aptTypedDs2`)
and expects the other dataset (`aptTypedDs`) to include all those fields. 
If field names/types do not match you get a compilation error. 

```scala mdoc
aptTypedDs2.union(aptTypedDs).show().run
```

The other way around will not compile, since `aptTypedDs2` has only a subset of the fields. 

```scala mdoc:fail
aptTypedDs.union(aptTypedDs2).show().run
```

Finally, as with `project`, `union` will align fields that have same names/types,
so fields do not have to be in the same order. 

## TypedDataset functions and transformations

Frameless supports many of Spark's functions and transformations. 
However, whenever a Spark function does not exist in Frameless, 
calling `.dataset` will expose the underlying 
`Dataset` (from org.apache.spark.sql, the original Spark APIs), 
where you can use anything that would be missing from the Frameless' API.

These are the main imports for Frameless' aggregate and non-aggregate functions.

```scala
import frameless.functions._                // For literals
import frameless.functions.nonAggregate._   // e.g., concat, abs
import frameless.functions.aggregate._      // e.g., count, sum, avg 
```

### Drop/Replace/Add fields

`dropTupled()` drops a single column and results in a tuple-based schema.

```scala mdoc
aptTypedDs2.dropTupled('price): TypedDataset[(String,Int)]
```

To drop a column and specify a new schema use `drop()`.

```scala mdoc
case class CityBeds(city: String, bedrooms: Int)
val cityBeds: TypedDataset[CityBeds] = aptTypedDs2.drop[CityBeds] 
```

Often, you want to replace an existing column with a new value.
 
```scala mdoc
val inflation = aptTypedDs2.withColumnReplaced('price, aptTypedDs2('price) * 2)
 
inflation.show(2).run()
```

Or use a literal instead.

```scala mdoc
import frameless.functions.lit
aptTypedDs2.withColumnReplaced('price, lit(0.001)) 
```

Adding a column using `withColumnTupled()` results in a tupled-based schema.

```scala mdoc
aptTypedDs2.withColumnTupled(lit(Array("a","b","c"))).show(2).run()
```

Similarly, `withColumn()` adds a column and explicitly expects a schema for the result.

```scala mdoc
case class CityBedsOther(city: String, bedrooms: Int, other: List[String])

cityBeds.
   withColumn[CityBedsOther](lit(List("a","b","c"))).
   show(1).run()
```

To conditionally change a column use the `when/otherwise` operation. 

```scala mdoc
import frameless.functions.nonAggregate.when
aptTypedDs2.withColumnTupled(
   when(aptTypedDs2('city) === "Paris", aptTypedDs2('price)).
   when(aptTypedDs2('city) === "Lyon", lit(1.1)).
   otherwise(lit(0.0))).show(8).run()
```

A simple way to add a column without losing important schema information is
to project the entire source schema into a single column using the `asCol()` method.

```scala mdoc
val c = cityBeds.select(cityBeds.asCol, lit(List("a","b","c")))
c.show(1).run()
```

When working with Spark's `DataFrames`, you often select all columns using `.select($"*", ...)`. 
In a way, `asCol()` is a typed equivalent of `$"*"`. 

To access nested columns, use the `colMany()` method. 

```scala mdoc
c.select(c.colMany('_1, 'city), c('_2)).show(2).run()
```

### Working with collections

```scala mdoc
import frameless.functions._
import frameless.functions.nonAggregate._
```

```scala mdoc
val t = cityRatio.select(cityRatio('city), lit(List("abc","c","d")))
t.withColumnTupled(
   arrayContains(t('_2), "abc")
).show(1).run()
```

If accidentally you apply a collection function on a column that is not a collection,
you get a compilation error.

```scala mdoc:fail
t.withColumnTupled(
   arrayContains(t('_1), "abc")
)
```

Flattening columns in Spark is done with the `explode()` method. Unlike vanilla Spark, 
in Frameless `explode()` is part of `TypedDataset` and not a function of a column. 
This provides additional safety since more than one `explode()` applied in a single 
statement results in runtime error in vanilla Spark.   

```scala mdoc
val t2 = cityRatio.select(cityRatio('city), lit(List(1,2,3,4)))
val flattened = t2.explode('_2): TypedDataset[(String, Int)]
flattened.show(4).run()
```

Here is an example of how `explode()` may fail in vanilla Spark. The Frameless 
implementation does not suffer from this problem since, by design, it can only be applied
to a single column at a time. 

```scala mdoc:fail
{
  import org.apache.spark.sql.functions.{explode => sparkExplode}
  t2.dataset.toDF().select(sparkExplode($"_2"), sparkExplode($"_2"))
}
```

### Collecting data to the driver

In Frameless all Spark actions (such as `collect()`) are safe.

Take the first element from a dataset (if the dataset is empty return `None`).

```scala mdoc
cityBeds.headOption.run()
```

Take the first `n` elements.

```scala mdoc
cityBeds.take(2).run()
```

```scala mdoc
cityBeds.head(3).run()
```

```scala mdoc
cityBeds.limit(4).collect().run()
```

## Sorting columns

Only column types that can be sorted are allowed to be selected for sorting. 

```scala mdoc
aptTypedDs.orderBy(aptTypedDs('city).asc).show(2).run()
```

The ordering can be changed by selecting `.acs` or `.desc`. 

```scala mdoc
aptTypedDs.orderBy(
   aptTypedDs('city).asc, 
   aptTypedDs('price).desc
).show(2).run()
```

## User Defined Functions

Frameless supports lifting any Scala function (up to five arguments) to the
context of a particular `TypedDataset`:

```scala mdoc:nest
// The function we want to use as UDF
val priceModifier =
    (name: String, price:Double) => if(name == "Paris") price * 2.0 else price

val udf = aptTypedDs.makeUDF(priceModifier)

val aptds = aptTypedDs // For shorter expressions

val adjustedPrice = aptds.select(aptds('city), udf(aptds('city), aptds('price)))

adjustedPrice.show().run()
```

## GroupBy and Aggregations
Let's suppose we wanted to retrieve the average apartment price in each city
```scala mdoc
val priceByCity = aptTypedDs.groupBy(aptTypedDs('city)).agg(avg(aptTypedDs('price)))
priceByCity.collect().run()
```
Again if we try to aggregate a column that can't be aggregated, we get a compilation error
```scala mdoc:fail
aptTypedDs.groupBy(aptTypedDs('city)).agg(avg(aptTypedDs('city)))
```

Next, we combine `select` and `groupBy` to calculate the average price/surface ratio per city:

```scala mdoc:nest
val aptds = aptTypedDs // For shorter expressions

val cityPriceRatio =  aptds.select(aptds('city), aptds('price) / aptds('surface).cast[Double])

cityPriceRatio.groupBy(cityPriceRatio('_1)).agg(avg(cityPriceRatio('_2))).show().run()
```

We can also use `pivot` to further group data on a secondary column.
For example, we can compare the average price across cities by number of bedrooms.

```scala mdoc
case class BedroomStats(
   city: String,
   AvgPriceBeds1: Option[Double], // Pivot values may be missing, so we encode them using Options
   AvgPriceBeds2: Option[Double],
   AvgPriceBeds3: Option[Double],
   AvgPriceBeds4: Option[Double])

val bedroomStats = aptds.
   groupBy(aptds('city)).
   pivot(aptds('bedrooms)).
   on(1,2,3,4). // We only care for up to 4 bedrooms
   agg(avg(aptds('price))).
   as[BedroomStats]  // Typesafe casting

bedroomStats.show().run()
```

With pivot, collecting data preserves typesafety by
encoding potentially missing columns with `Option`.

```scala mdoc
bedroomStats.collect().run().foreach(println)
```

#### Working with Optional fields

Optional fields can be converted to non-optional using `getOrElse()`. 

```scala mdoc
val sampleStats = bedroomStats.select(
   bedroomStats('AvgPriceBeds2).getOrElse(0.0),
   bedroomStats('AvgPriceBeds3).getOrElse(0.0))

sampleStats.show().run()   
``` 

In addition, optional columns can be flatten using the `.flattenOption` method on `TypedDatset`.
The result contains the rows for which the flattened column is not None (or null). The schema
is automatically adapted to reflect this change.

```scala mdoc
val flattenStats = bedroomStats.flattenOption('AvgPriceBeds2)


// The second Option[Double] is now of type Double, since all 'null' values are removed
flattenStats: TypedDataset[(String, Option[Double], Double, Option[Double], Option[Double])]
```

In a DataFrame, if you just ignore types, this would equivelantly be written as:

```scala mdoc
bedroomStats.dataset.toDF().filter($"AvgPriceBeds2".isNotNull)
```

### Entire TypedDataset Aggregation

We often want to aggregate the entire `TypedDataset` and skip the `groupBy()` clause.
In Frameless you can do this using the `agg()` operator directly on the `TypedDataset`.
In the following example, we compute the average price, the average surface,
the minimum surface, and the set of cities for the entire dataset.

```scala mdoc
case class Stats(
   avgPrice: Double,
   avgSurface: Double,
   minSurface: Int,
   allCities: Vector[String])

aptds.agg(
   avg(aptds('price)),
   avg(aptds('surface)),
   min(aptds('surface)),
   collectSet(aptds('city))
).as[Stats].show().run()
```

You may apply any `TypedColumn` operation to a `TypedAggregate` column as well.

```scala mdoc
import frameless.functions._
aptds.agg(
   avg(aptds('price)) * min(aptds('surface)).cast[Double], 
   avg(aptds('surface)) * 0.2,
   litAggr("Hello World")
).show().run()
```

## Joins

```scala mdoc:silent
case class CityPopulationInfo(name: String, population: Int)

val cityInfo = Seq(
  CityPopulationInfo("Paris", 2229621),
  CityPopulationInfo("Lyon", 500715),
  CityPopulationInfo("Nice", 343629)
)

val citiInfoTypedDS = TypedDataset.create(cityInfo)
```

Here is how to join the population information to the apartment's dataset:

```scala mdoc
val withCityInfo = aptTypedDs.joinInner(citiInfoTypedDS) { aptTypedDs('city) === citiInfoTypedDS('name) }

withCityInfo.show().run()
```

The joined TypedDataset has type `TypedDataset[(Apartment, CityPopulationInfo)]`.

We can then select which information we want to continue to work with:

```scala mdoc
case class AptPriceCity(city: String, aptPrice: Double, cityPopulation: Int)

withCityInfo.select(
   withCityInfo.colMany('_2, 'name), withCityInfo.colMany('_1, 'price), withCityInfo.colMany('_2, 'population)
).as[AptPriceCity].show().run
```

### Chained Joins

Joins, or any similar operation, may be chained using a thrush combinator removing the need for intermediate values.  Instead of:

```scala mdoc
val withBedroomInfoInterim = aptTypedDs.joinInner(citiInfoTypedDS)( aptTypedDs('city) === citiInfoTypedDS('name) )
val withBedroomInfo = withBedroomInfoInterim 
  .joinLeft(bedroomStats)( withBedroomInfoInterim.col('_1).field('city) === bedroomStats('city) )

withBedroomInfo.show().run()
```

You can use thrush from [mouse](https://github.com/typelevel/mouse):

```scala
libraryDependencies += "org.typelevel" %% "mouse" % "1.2.1"
```

```scala mdoc
import mouse.all._

val withBedroomInfoChained = aptTypedDs.joinInner(citiInfoTypedDS)( aptTypedDs('city) === citiInfoTypedDS('name) )
  .thrush( interim => interim.joinLeft(bedroomStats)( interim.col('_1).field('city) === bedroomStats('city) ) )

withBedroomInfoChained.show().run()
```

```scala mdoc:invisible
spark.stop()
```


================================================
FILE: docs/Injection.md
================================================
# Injection: Creating Custom Encoders

```scala mdoc:invisible:reset-object
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import frameless.functions.aggregate._
import frameless.TypedDataset

val conf = new SparkConf().setMaster("local[*]").setAppName("frameless repl").set("spark.ui.enabled", "false")
implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

import spark.implicits._
```
Injection lets us define encoders for types that do not have one by injecting `A` into an encodable type `B`.
This is the definition of the injection typeclass:
```scala
trait Injection[A, B] extends Serializable {
  def apply(a: A): B
  def invert(b: B): A
}
```

## Example

Let's define a simple case class:

```scala mdoc
case class Person(age: Int, birthday: java.util.Calendar)
val people = Seq(Person(42, new java.util.GregorianCalendar()))
```

And an instance of a `TypedDataset`:

```scala mdoc:fail:nest
val personDS = TypedDataset.create(people)
```

Looks like we can't, a `TypedEncoder` instance of `Person` is not available, or more precisely for `java.util.Calendar`.
But we can define a injection from `java.util.Calendar` to an encodable type, like `Long`:

```scala mdoc
import java.util.Calendar

import frameless._

implicit val calendarToLongInjection = new Injection[Calendar, Long] {
  def apply(d: Calendar): Long = d.getTime.getTime

  def invert(l: Long): Calendar = {
    val cal = new java.util.GregorianCalendar()
    cal.setTime(new java.util.Date(l))
    cal
  }
}
```

We can be less verbose using the `Injection.apply` function:

```scala mdoc:nest
import frameless._

import java.util.Calendar

implicit val calendarToLongInjection = Injection[Calendar, Long](
  (_: Calendar).getTime.getTime,
  { (l: Long) =>
    val cal = new java.util.GregorianCalendar()
    cal.setTime(new java.util.Date(l))
    cal
  })
```

Now we can create our `TypedDataset`:

```scala mdoc
val personDS = TypedDataset.create(people)
```

```scala mdoc:invisible
spark.stop()
```

## Another example

```scala mdoc:invisible:reset-object
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import frameless.functions.aggregate._
import frameless.TypedDataset

val conf = new SparkConf().
  setMaster("local[*]").
  setAppName("frameless repl").
  set("spark.ui.enabled", "false")

implicit val spark = SparkSession.builder().
  config(conf).appName("REPL").getOrCreate()

spark.sparkContext.setLogLevel("WARN")

import spark.implicits._
```

Let's define a sealed family:

```scala mdoc
sealed trait Gender
case object Male extends Gender
case object Female extends Gender
case object Other extends Gender
```

And a simple case class:

```scala mdoc
case class Person(age: Int, gender: Gender)
val people = Seq(Person(42, Male))
```

Again if we try to create a `TypedDataset`, we get a compilation error.

```scala mdoc:fail:nest
val personDS = TypedDataset.create(people)
```

Let's define an injection instance for `Gender`:

```scala mdoc
import frameless._

implicit val genderToInt: Injection[Gender, Int] = Injection(
  {
    case Male   => 1
    case Female => 2
    case Other  => 3
  },
  {
    case 1 => Male
    case 2 => Female
    case 3 => Other
  })
```

And now we can create our `TypedDataset`:

```scala mdoc
val personDS = TypedDataset.create(people)
```

```scala mdoc:invisible
spark.stop()
```

Alternatively, an injection instance can be derived for sealed families such as `Gender` using the following 
import, `import frameless.TypedEncoder.injections._`. This will encode the data constructors as strings.

**Known issue**: An invalid injection instance will be derived if there are data constructors with the same name.
For example, consider the following sealed family:

```scala mdoc
sealed trait Foo
object A { case object Bar extends Foo }
object B { case object Bar extends Foo }
```

`A.Bar` and `B.Bar` will both be encoded as `"Bar"` thereby breaking the law that `invert(apply(x)) == x`.


================================================
FILE: docs/Job.md
================================================
# Job\[A\]

All operations on `TypedDataset` are lazy. An operation either returns a new
transformed `TypedDataset` or an `F[A]`, where `F[_]` is a type constructor
with an instance of the `SparkDelay` typeclass and `A` is the result of running a
non-lazy computation in Spark. 

A default such type constructor called `Job` is provided by Frameless. 

`Job` serves several functions:
- Makes all operations on a `TypedDataset` lazy, which makes them more predictable compared to having
few operations being lazy and other being strict
- Allows the programmer to make expensive blocking operations explicit
- Allows for Spark jobs to be lazily sequenced using monadic composition via for-comprehension
- Provides an obvious place where you can annotate/name your Spark jobs to make it easier
to track different parts of your application in the Spark UI

The toy example showcases the use of for-comprehension to explicitly sequences Spark Jobs.
First we calculate the size of the `TypedDataset` and then we collect to the driver
exactly 20% of its elements:

```scala mdoc:invisible
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import frameless.functions.aggregate._
import frameless.TypedDataset

val conf = new SparkConf().setMaster("local[*]").setAppName("frameless repl").set("spark.ui.enabled", "false")
implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

import spark.implicits._
```

```scala mdoc
import frameless.syntax._

val ds = TypedDataset.create(1 to 20)

val countAndTakeJob =
  for {
    count <- ds.count()
    sample <- ds.take((count/5).toInt)
  } yield sample

countAndTakeJob.run()
```

The `countAndTakeJob` can either be executed using `run()` (as we show above) or it can
be passed along to other parts of the program to be further composed into more complex sequences
of Spark jobs.

```scala mdoc
import frameless.Job
def computeMinOfSample(sample: Job[Seq[Int]]): Job[Int] = sample.map(_.min)

val finalJob = computeMinOfSample(countAndTakeJob)
```

Now we can execute this new job by specifying a [group-id][group-id] and a description.
This allows the programmer to see this information on the Spark UI and help track, say,
performance issues.

```scala mdoc
finalJob.
  withGroupId("samplingJob").
  withDescription("Samples 20% of elements and computes the min").
  run()
```


```scala mdoc:invisible
spark.stop()
```

[group-id]: https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.SparkContext@setJobGroup(groupId:String,description:String,interruptOnCancel:Boolean):Unit

## More on `SparkDelay`

As mentioned above, `SparkDelay[F[_]]` is a typeclass required for suspending
effects by Spark computations. This typeclass represents the ability to suspend
an `=> A` thunk into an `F[A]` value, while implicitly capturing a `SparkSession`.

As it is a typeclass, it is open for implementation by the user in order to use
other data types for suspension of effects. The `cats` module, for example, uses
this typeclass to support suspending Spark computations in any effect type that
has a `cats.effect.Sync` instance.


================================================
FILE: docs/TypedDataFrame.md
================================================
# Proof of Concept: TypedDataFrame

`TypedDataFrame` is the API developed in the early stages of Frameless to manipulate Spark `DataFrame`s in a type-safe manner. With the introduction of `Dataset` in Spark 1.6, `DataFrame` seems deprecated and won't be the focus of future development of Frameless. However, the design is interesting enough to document.

To safely manipulate `DataFrame`s we use a technique called a *shadow type*, which consists in storing additional information about a value in a "dummy" type. Mirroring value-level computation at the type-level lets us leverage the type system to catch common mistakes at compile time.

### Diving in

In `TypedDataFrame`, we use a single `Schema <: Product` to model the number, the types and the names of columns. Here is a what the definition of `TypedDataFrame` looks like, with simplified type signatures:

```scala
import org.apache.spark.sql.DataFrame
import shapeless.HList

class TDataFrame[Schema <: Product](df: DataFrame) {
  def filter(predicate: Schema => Boolean): TDataFrame[Schema] = ???

  def select[C <: HList, Out <: Product](columns: C): TDataFrame[Out] = ???

  def innerJoin[OtherS <: Product, Out <: Product]
    (other: TDataFrame[OtherS]): TDataFrame[Out] = ???

  // Followed by equivalent of every DataFrame method with improved signature
}
```

As you can see, instead of the `def filter(conditionExpr: String): DataFrame` defined in Spark, the `TypedDataFrame` version expects a function from `Schema` to `Boolean`, and models the fact that resulting `DataFrame` will still hold elements of type `Schema`.

### Type-level column referencing

For Spark's `DataFrame`s, column referencing is done directly by `String`s or using the `Column` type which provides no additional type safety. `TypedDataFrame` improves on that by catching invalid column references compile type. When everything goes well, Frameless select is very similar to vanilla select, except that it keeps track of the selected column types:

```scala
import frameless.TypedDataFrame

case class Foo(s: String, d: Double, i: Int)

def selectIntString(tf: TypedDataFrame[Foo]): TypedDataFrame[(Int, String)] =
  tf.select('i, 's)
```

However, in case of typo, it gets caught right away:

```scala
def selectIntStringTypo(tf: TypedDataFrame[Foo]): TypedDataFrame[(Int, String)] =
  tf.select('j, 's)
```

### Type-level joins

Joins can available with two different syntaxes. The first lets you reference different columns on each `TypedDataFrame`, and ensures that they all exist and have compatible types:

```scala
case class Bar(i: Int, j: String, b: Boolean)

def join1(tf1: TypedDataFrame[Foo], tf2: TypedDataFrame[Bar])
    : TypedDataFrame[(String, Double, Int, Int, String, Boolean)] =
  tf1.innerJoin(tf2).on('s).and('j)
```

The second syntax brings some convenience when the joining columns have identical names in both tables:

```scala
def join2(tf1: TypedDataFrame[Foo], tf2: TypedDataFrame[Bar])
    : TypedDataFrame[(String, Double, Int, String, Boolean)] =
  tf1.innerJoin(tf2).using('i)
```

Further example are available in the [TypedDataFrame join tests.](https://github.com/typelevel/frameless/blob/17194d2172e75f8994e9481181e85b4c8dcc0f69/dataframe/src/test/scala/JoinTests.scala)

### Complete example

We now consider a complete example to see how the Frameless types can improve not only correctness but also the readability of Spark jobs. Consider the following domain of phonebooks, city maps and neighborhoods:

```scala mdoc:silent
type Neighborhood = String
type Address = String

case class PhoneBookEntry(
  address: Address,
  residents: String,
  phoneNumber: Double
)

case class CityMapEntry(
  address: Address,
  neighborhood: Neighborhood
)
```

Our goal will be to compute the neighborhood with unique names, approximating "unique" with names containing less common
letters in the alphabet: 'x', 'q', and 'z'. We are going to need a natural language processing library at some point, so
let's use the following for the example:

```scala mdoc:silent
object NLPLib {
  def uniqueName(name: String): Boolean = name.exists(Set('x', 'q', 'z'))
}
```

Suppose we manage to obtain public data for a `TypedDataFrame[PhoneBookEntry]` and `TypedDataFrame[CityMapEntry]`. Here is what our Spark job could look like with Frameless:

```scala
import org.apache.spark.sql.SQLContext

// These case classes are used to hold intermediate results
case class Family(residents: String, neighborhood: Neighborhood)
case class Person(name: String, neighborhood: Neighborhood)
case class NeighborhoodCount(neighborhood: Neighborhood, count: Long)

def bestNeighborhood
  (phoneBookTF: TypedDataFrame[PhoneBookEntry], cityMapTF: TypedDataFrame[CityMapEntry])
  (implicit c: SQLContext): String = {
                                          (((((((((
  phoneBookTF
    .innerJoin(cityMapTF).using('address) :TypedDataFrame[(Address, String, Double, String)])
    .select('_2, '_4)                     :TypedDataFrame[(String, String)])
    .as[Family]()                         :TypedDataFrame[Family])
    .flatMap { f =>
      f.residents.split(' ').map(r => Person(r, f.neighborhood))
    }                                     :TypedDataFrame[Person])
    .filter { p =>
      NLPLib.uniqueName(p.name)
    }                                     :TypedDataFrame[Person])
    .groupBy('neighborhood).count()       :TypedDataFrame[(String, Long)])
    .as[NeighborhoodCount]()              :TypedDataFrame[NeighborhoodCount])
    .sortDesc('count)                     :TypedDataFrame[NeighborhoodCount])
    .select('neighborhood)                :TypedDataFrame[Tuple1[String]])
    .head._1
}
```

If you compare this version to vanilla Spark where every line is a `DataFrame`, you see how much types can improve readability. An executable version of this example is available in the [BestNeighborhood test](https://github.com/typelevel/frameless/blob/17194d2172e75f8994e9481181e85b4c8dcc0f69/dataframe/src/test/scala/BestNeighborhood.scala).

### Limitations

The main limitation of this approach comes from Scala 2.10, which limits the arity of class classes to 22. Because of the way `DataFrame` models joins, joining two table with more that 11 fields results in a `DataFrame` which not representable with `Schema` of type `Product`.

In the `Dataset` API introduced in Spark 1.6, the way join are handled was rethought to return a pair of both schemas instead of a flat table, which moderates the trouble caused by case class limitations. Alternatively, since Scala 2.11, it is possible to define Tuple23 and onward. Sadly, due to the way Spark is commonly packaged in various systems, the amount Spark users having to Scala 2.11 and *not* to Spark 1.6 is essentially zero. For this reasons, further development in Frameless will target Spark 1.6+, deprecating the early work on`TypedDataFrame`.


================================================
FILE: docs/TypedDatasetVsSparkDataset.md
================================================
# Comparing TypedDatasets with Spark's Datasets

```scala mdoc:invisible:reset-object
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

val conf = new SparkConf().setMaster("local[*]").setAppName("test").set("spark.ui.enabled", "false").set("spark.app.id", "tut-dataset")
implicit val spark = SparkSession.builder().config(conf).getOrCreate()

System.clearProperty("spark.master.port")
System.clearProperty("spark.driver.port")
System.clearProperty("spark.hostPort")
System.setProperty("spark.cleaner.ttl", "300")

// We are using this directory so let's make sure it is clean first
org.apache.commons.io.FileUtils.deleteDirectory(new java.io.File("/tmp/foo/"))
```

**Goal:**
  This tutorial compares the standard Spark Datasets API with the one provided by
  Frameless' `TypedDataset`. It shows how `TypedDataset`s allow for an expressive and
  type-safe api with no compromises on performance.

For this tutorial we first create a simple dataset and save it on disk as a parquet file.
[Parquet](https://parquet.apache.org/) is a popular columnar format and well supported by Spark.
It's important to note that when operating on parquet datasets, Spark knows that each column is stored
separately, so if we only need a subset of the columns Spark will optimize for this and avoid reading
the entire dataset. This is a rather simplistic view of how Spark and parquet work together but it
will serve us well for the context of this discussion.

```scala mdoc
import spark.implicits._

// Our example case class Foo acting here as a schema
case class Foo(i: Long, j: String)

// Assuming spark is loaded and SparkSession is bind to spark
val initialDs = spark.createDataset( Foo(1, "Q") :: Foo(10, "W") :: Foo(100, "E") :: Nil )

// Assuming you are on Linux or Mac OS
initialDs.write.parquet("/tmp/foo")

val ds = spark.read.parquet("/tmp/foo").as[Foo]

ds.show()
```

The value `ds` holds the content of the `initialDs` read from a parquet file.
Let's try to only use field `i` from Foo and see how Spark's Catalyst (the query optimizer)
optimizes this.

```scala mdoc
// Using a standard Spark TypedColumn in select()
val filteredDs = ds.filter($"i" === 10).select($"i".as[Long])

filteredDs.show()
```

The `filteredDs` is of type `Dataset[Long]`. Since we only access field `i` from `Foo` the type is correct.
Unfortunately, this syntax requires handholding by explicitly setting the `TypedColumn` in the `select` statement
to return type `Long` (look at the `as[Long]` statement). We will discuss this limitation next in more detail.
Now, let's take a quick look at the optimized Physical Plan that Spark's Catalyst generated.

```scala mdoc
filteredDs.explain()
```

The last line is very important (see `ReadSchema`). The schema read
from the parquet file only required reading column `i` without needing to access column `j`.
This is great! We have both an optimized query plan and type-safety!

Unfortunately, this syntax is not bulletproof: it fails at run-time if we try to access
a non existing column `x`:


```scala mdoc:crash
ds.filter($"i" === 10).select($"x".as[Long])
```

There are two things to improve here. First, we would want to avoid the `as[Long]` casting that we are required
to type for type-safety. This is clearly an area where we may introduce a bug by casting to an incompatible
type. Second, we want a solution where reference to a non existing column name fails at compilation time.
The standard Spark Dataset can achieve this using the following syntax.

```scala mdoc
ds.filter(_.i == 10).map(_.i).show()
```

This looks great! It reminds us the familiar syntax from Scala.
The two closures in filter and map are functions that operate on `Foo` and the
compiler will helps us capture all the mistakes we mentioned above.

```scala mdoc:fail
ds.filter(_.i == 10).map(_.x).show()
```

Unfortunately, this syntax does not allow Spark to optimize the code.

```scala mdoc
ds.filter(_.i == 10).map(_.i).explain()
```

As we see from the explained Physical Plan, Spark was not able to optimize our query as before.
Reading the parquet file will required loading all the fields of `Foo`. This might be ok for
small datasets or for datasets with few columns, but will be extremely slow for most practical
applications. Intuitively, Spark currently does not have a way to look inside the code we pass in these two
closures. It only knows that they both take one argument of type `Foo`, but it has no way of knowing if
we use just one or all of `Foo`'s fields.

The `TypedDataset` in Frameless solves this problem. It allows for a simple and type-safe syntax
with a fully optimized query plan.

```scala mdoc
import frameless.TypedDataset
import frameless.syntax._
val fds = TypedDataset.create(ds)

fds.filter(fds('i) === 10).select(fds('i)).show().run()
```

And the optimized Physical Plan:

```scala mdoc
fds.filter(fds('i) === 10).select(fds('i)).explain()
```

And the compiler is our friend.

```scala mdoc:fail
fds.filter(fds('i) === 10).select(fds('x))
```

## Differences in Encoders

Encoders in Spark's `Datasets` are partially type-safe. If you try to create a `Dataset` using  a type that is not 
 a Scala `Product` then you get a compilation error:

```scala mdoc
class Bar(i: Int)
```

`Bar` is neither a case class nor a `Product`, so the following correctly gives a compilation error in Spark:

```scala mdoc:fail
spark.createDataset(Seq(new Bar(1)))
```

However, the compile type guards implemented in Spark are not sufficient to detect non encodable members. 
For example, using the following case class leads to a runtime failure:

```scala mdoc
case class MyDate(jday: java.util.Calendar)
```

```scala mdoc:crash
spark.createDataset(Seq(MyDate {
  val cal = new java.util.GregorianCalendar()
  cal.setTime(new java.util.Date(System.currentTimeMillis))
  cal
}))
```

In comparison, a `TypedDataset` will notify about the encoding problem at compile time: 

```scala mdoc:fail
TypedDataset.create(Seq(MyDate {
  val cal = new java.util.GregorianCalendar()
  cal.setTime(new java.util.Date(System.currentTimeMillis))
  cal
}))
```

## Aggregate vs Projected columns 

Spark's `Dataset` do not distinguish between columns created from aggregate operations, 
such as summing or averaging, and simple projections/selections. 
This is problematic when you start mixing the two.

```scala mdoc
import org.apache.spark.sql.functions.sum
```

```scala mdoc:crash
ds.select(sum($"i"), $"i"*2)
```

In Frameless, mixing the two results in a compilation error.

```scala mdoc
// To avoid confusing frameless' sum with the standard Spark's sum
import frameless.functions.aggregate.{sum => fsum}
```

```scala mdoc:fail
fds.select(fsum(fds('i)))
```

As the error suggests, we expected a `TypedColumn` but we got a `TypedAggregate` instead. 

Here is how you apply an aggregation method in Frameless: 

```scala mdoc
fds.agg(fsum(fds('i))+22).show().run()
```

Similarly, mixing projections while aggregating does not make sense, and in Frameless
you get a compilation error.  

```scala mdoc:fail
fds.agg(fsum(fds('i)), fds('i)).show().run()
```


```scala mdoc:invisible
org.apache.commons.io.FileUtils.deleteDirectory(new java.io.File("/tmp/foo/"))
spark.stop()
```


================================================
FILE: docs/TypedEncoder.md
================================================
# Typed Encoders in Frameless

```scala mdoc:invisible:reset-object
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import frameless.functions.aggregate._
val conf = new SparkConf().setMaster("local[*]").setAppName("frameless repl").set("spark.ui.enabled", "false")
implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate()
System.setProperty("spark.cleaner.ttl", "300")
```

Spark uses Reflection to derive its `Encoder`s, which is why they can fail at run time. For example, because Spark does not support `java.util.Calendar`, the following leads to an error:

```scala mdoc:silent
import java.util.Calendar

import org.apache.spark.sql.Dataset

import spark.implicits._

case class DateRange(s: Calendar, e: Calendar)
```

```scala mdoc:crash
def now = new java.util.GregorianCalendar()

val ds: Dataset[DateRange] = Seq(DateRange(now, now)).toDS()
```

As shown by the stack trace, this runtime error goes through [ScalaReflection](https://github.com/apache/spark/blob/19cf208063f035d793d2306295a251a9af7e32f6/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala) to try to derive an `Encoder` for `Dataset` schema. Beside the annoyance of not detecting this error at compile time, a more important limitation of the reflection-based approach is its inability to be extended for custom types. See this Stack Overflow question for a summary of the current situation (as of 2.0) in vanilla Spark: [How to store custom objects in a Dataset?](http://stackoverflow.com/a/39442829/2311362).

Frameless introduces a new type class called `TypeEncoder` to solve these issues. `TypeEncoder`s are passed around as implicit parameters to every Frameless method to ensure that the data being manipulated is `Encoder`. It uses a standard implicit resolution coupled with shapeless' type class derivation mechanism to ensure every that compiling code manipulates encodable data. For example, the `java.util.Calendar` example won't compile with Frameless:

```scala mdoc:silent
import frameless.TypedDataset
import frameless.syntax._
```

```scala mdoc:fail
def now = new java.util.GregorianCalendar()

val ds: TypedDataset[DateRange] = TypedDataset.create(Seq(DateRange(now, now)))
```

Type class derivation takes care of recursively constructing (and proving the existence of) `TypeEncoder`s for case classes. The following works as expected:

```scala mdoc
case class Bar(d: Double, s: String)
case class Foo(i: Int, b: Bar)

val ds: TypedDataset[Foo] = 
  TypedDataset.create(Seq(Foo(1, Bar(1.1, "s"))))

ds.collect()
```

But any non-encodable in the case class hierarchy will be detected at compile time:

```scala mdoc:silent
case class BarDate(d: Double, s: String, t: java.util.Calendar)
case class FooDate(i: Int, b: BarDate)
```

```scala mdoc:fail
val ds: TypedDataset[FooDate] = TypedDataset.create(
  Seq(FooDate(1, BarDate(1.1, "s", new java.util.GregorianCalendar))))
```

It should be noted that once derived, reflection-based `Encoder`s and implicitly derived `TypeEncoder`s have identical performance.
The derivation mechanism is different, but the objects generated to encode and decode JVM objects in Spark's internal representation behave the same at runtime.

```scala mdoc:invisible
spark.stop()
```


================================================
FILE: docs/TypedML.md
================================================
# Typed Spark ML

The `frameless-ml` module provides a strongly typed Spark ML API leveraging `TypedDataset`s. It introduces `TypedTransformer`s
and `TypedEstimator`s, the type-safe equivalents of Spark ML's `Transformer` and `Estimator`. 

A `TypedEstimator` fits models to data, i.e trains a ML model based on an input `TypedDataset`. 
A `TypedTransformer` transforms one `TypedDataset` into another, usually by appending column(s) to it.

By calling the `fit` method of a `TypedEstimator`, the `TypedEstimator` will train a ML model using the `TypedDataset` 
passed as input (representing the training data) and will return a `TypedTransformer` that represents the trained model. 
This `TypedTransformer`can then be used to make predictions on an input `TypedDataset` (representing the test data) 
using the `transform` method that will return a new `TypedDataset` with appended prediction column(s).

Both `TypedEstimator` and `TypedTransformer` check at compile-time the correctness of their inputs field names and types,
contrary to Spark ML API which only deals with DataFrames (the data structure with the lowest level of type-safety in Spark).

`frameless-ml` adds type-safety to Spark ML API but stays very close to it in terms of abstractions and API calls, so 
please check [Spark ML documentation](https://spark.apache.org/docs/2.2.0/ml-pipeline.html) for more details 
on `Transformer`s and `Estimator`s.

```scala mdoc:invisible:reset-object
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

val conf = new SparkConf().setMaster("local[*]").setAppName("Frameless repl").set("spark.ui.enabled", "false")
implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

import spark.implicits._
```

## Example 1: predict a continuous value using a `TypedRandomForestRegressor`

In this example, we want to predict the sale price of a house depending on its square footage and the fact that the house
has a garden or not. We will use a `TypedRandomForestRegressor`.

### Training

As with the Spark ML API, we use a `TypedVectorAssembler` (the type-safe equivalent of `VectorAssembler`)
to compute feature vectors:

```scala mdoc:silent
import frameless._
import frameless.syntax._
import frameless.ml._
import frameless.ml.feature._
import frameless.ml.regression._
import org.apache.spark.ml.linalg.Vector
```

```scala mdoc
case class HouseData(squareFeet: Double, hasGarden: Boolean, price: Double)

val trainingData = TypedDataset.create(Seq(
  HouseData(20, false, 100000),
  HouseData(50, false, 200000),
  HouseData(50, true, 250000),
  HouseData(100, true, 500000)
))

case class Features(squareFeet: Double, hasGarden: Boolean)
val assembler = TypedVectorAssembler[Features]

case class HouseDataWithFeatures(squareFeet: Double, hasGarden: Boolean, price: Double, features: Vector)
val trainingDataWithFeatures = assembler.transform(trainingData).as[HouseDataWithFeatures]
```

In the above code snippet, `.as[HouseDataWithFeatures]` is a `TypedDataset`'s type-safe cast
(see [TypedDataset: Feature Overview](https://typelevel.org/frameless/FeatureOverview.html)):

```scala mdoc:silent
case class WrongHouseFeatures(
  squareFeet: Double,
  hasGarden: Int, // hasGarden has wrong type
  price: Double,
  features: Vector
)
```

```scala mdoc:fail
assembler.transform(trainingData).as[WrongHouseFeatures]
```

Moreover, `TypedVectorAssembler[Features]` will compile only if `Features` contains exclusively fields of type Numeric or Boolean:

```scala mdoc:silent
case class WrongFeatures(squareFeet: Double, hasGarden: Boolean, city: String)
```

```scala mdoc:fail
TypedVectorAssembler[WrongFeatures]
```

The subsequent call `assembler.transform(trainingData)` compiles only if `trainingData` contains all fields (names and types)
of `Features`:

```scala mdoc
case class WrongHouseData(squareFeet: Double, price: Double) // hasGarden is missing
val wrongTrainingData = TypedDataset.create(Seq(WrongHouseData(20, 100000)))
```

```scala mdoc:fail
assembler.transform(wrongTrainingData)
```

Then, we train the model. To train a Random Forest, one needs to feed it with features (what we predict from) and
with a label (what we predict). In our example, `price` is the label, `features` are the features:

```scala mdoc
case class RFInputs(price: Double, features: Vector)
val rf = TypedRandomForestRegressor[RFInputs]

val model = rf.fit(trainingDataWithFeatures).run()
```

`TypedRandomForestRegressor[RFInputs]` compiles only if `RFInputs`
contains only one field of type Double (the label) and one field of type Vector (the features):

```scala mdoc:silent
case class WrongRFInputs(labelOfWrongType: String, features: Vector)
```

```scala mdoc:fail
TypedRandomForestRegressor[WrongRFInputs]
```

The subsequent `rf.fit(trainingDataWithFeatures)` call compiles only if `trainingDataWithFeatures` contains the same fields
(names and types) as RFInputs.

```scala mdoc
val wrongTrainingDataWithFeatures = TypedDataset.create(Seq(HouseData(20, false, 100000))) // features are missing
```

```scala mdoc:fail
rf.fit(wrongTrainingDataWithFeatures) 
```

### Prediction

We now want to predict `price` for `testData` using the previously trained model. Like the Spark ML API,
`testData` has a default value for `price` (`0` in our case) that will be ignored at prediction time. We reuse
our `assembler` to compute the feature vector of `testData`.

```scala mdoc
val testData = TypedDataset.create(Seq(HouseData(70, true, 0)))
val testDataWithFeatures = assembler.transform(testData).as[HouseDataWithFeatures]

case class HousePricePrediction(
  squareFeet: Double,
  hasGarden: Boolean,
  price: Double,
  features: Vector,
  predictedPrice: Double
)
val predictions = model.transform(testDataWithFeatures).as[HousePricePrediction]

predictions.select(predictions.col('predictedPrice)).collect.run()
```

`model.transform(testDataWithFeatures)` will only compile if `testDataWithFeatures` contains a field `price` of type Double
and a field `features` of type Vector:

```scala mdoc:fail
model.transform(testData)
```

```scala mdoc:invisible
spark.stop()
```

## Example 2: predict a categorical value using a `TypedRandomForestClassifier`

```scala mdoc:invisible:reset-object
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

val conf = new SparkConf().setMaster("local[*]").setAppName("Frameless repl").set("spark.ui.enabled", "false")
implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

import spark.implicits._
import frameless._
import frameless.syntax._
import frameless.ml._
import frameless.ml.feature._
import frameless.ml.regression._
import org.apache.spark.ml.linalg.Vector
```

In this example, we want to predict in which city a house is located depending on its price and its square footage. We use a
`TypedRandomForestClassifier`.

### Training

As with the Spark ML API, we use a `TypedVectorAssembler` to compute feature vectors and a `TypedStringIndexer`
to index `city` values in order to be able to pass them to a `TypedRandomForestClassifier`
(which only accepts Double values as label):

```scala mdoc:silent
import frameless.ml.classification._
```

```scala mdoc
case class HouseData(squareFeet: Double, city: String, price: Double)

val trainingData = TypedDataset.create(Seq(
  HouseData(100, "lyon", 100000),
  HouseData(200, "lyon", 200000),
  HouseData(100, "san francisco", 500000),
  HouseData(150, "san francisco", 900000)
))

case class Features(price: Double, squareFeet: Double)
val vectorAssembler = TypedVectorAssembler[Features]

case class HouseDataWithFeatures(squareFeet: Double, city: String, price: Double, features: Vector)
val dataWithFeatures = vectorAssembler.transform(trainingData).as[HouseDataWithFeatures]

case class StringIndexerInput(city: String)
val indexer = TypedStringIndexer[StringIndexerInput]
indexer.estimator.setHandleInvalid("keep")
val indexerModel = indexer.fit(dataWithFeatures).run()

case class HouseDataWithFeaturesAndIndex(
  squareFeet: Double,
  city: String,
  price: Double,
  features: Vector,
  cityIndexed: Double
)
val indexedData = indexerModel.transform(dataWithFeatures).as[HouseDataWithFeaturesAndIndex]
```

Then, we train the model:

```scala mdoc
case class RFInputs(cityIndexed: Double, features: Vector)
val rf = TypedRandomForestClassifier[RFInputs]

val model = rf.fit(indexedData).run()
```

### Prediction

We now want to predict `city` for `testData` using the previously trained model. Like the Spark ML API,
`testData` has a default value for `city` (empty string in our case) that will be ignored at prediction time. We reuse
our `vectorAssembler` to compute the feature vector of `testData` and our `indexerModel` to index `city`.

```scala mdoc
val testData = TypedDataset.create(Seq(HouseData(120, "", 800000)))

val testDataWithFeatures = vectorAssembler.transform(testData).as[HouseDataWithFeatures]
val indexedTestData = indexerModel.transform(testDataWithFeatures).as[HouseDataWithFeaturesAndIndex]

case class HouseCityPredictionInputs(features: Vector, cityIndexed: Double)
val testInput = indexedTestData.project[HouseCityPredictionInputs]

case class HouseCityPredictionIndexed(
  features: Vector,
  cityIndexed: Double,
  rawPrediction: Vector,
  probability: Vector,
  predictedCityIndexed: Double
)
val indexedPredictions = model.transform(testInput).as[HouseCityPredictionIndexed]
```

Then, we use a `TypedIndexToString` to get back a String value from `predictedCityIndexed`. `TypedIndexToString` takes
as input the label array computed by our previous `indexerModel`:

```scala mdoc
case class IndexToStringInput(predictedCityIndexed: Double)

val indexToString = TypedIndexToString[IndexToStringInput](indexerModel.transformer.labels)

case class HouseCityPrediction(
  features: Vector,
  cityIndexed: Double,
  rawPrediction: Vector,
  probability: Vector,
  predictedCityIndexed: Double,
  predictedCity: String
)
val predictions = indexToString.transform(indexedPredictions).as[HouseCityPrediction]

predictions.select(predictions.col('predictedCity)).collect.run()
```

## List of currently implemented `TypedEstimator`s

* `TypedRandomForestClassifier`
* `TypedRandomForestRegressor`
* ... [your contribution here](https://github.com/typelevel/frameless/issues/215) ... :)

## List of currently implemented `TypedTransformer`s

* `TypedIndexToString`
* `TypedStringIndexer`
* `TypedVectorAssembler`
* ... [your contribution here](https://github.com/typelevel/frameless/issues/215) ... :)
 
## Using Vector and Matrix with `TypedDataset`

`frameless-ml` provides `TypedEncoder` instances for `org.apache.spark.ml.linalg.Vector` 
and `org.apache.spark.ml.linalg.Matrix`:

```scala mdoc:silent
import frameless._
import frameless.ml._
import org.apache.spark.ml.linalg._
```

```scala mdoc
val vector = Vectors.dense(1, 2, 3)
val vectorDs = TypedDataset.create(Seq("label" -> vector))

val matrix = Matrices.dense(2, 1, Array(1, 2))
val matrixDs = TypedDataset.create(Seq("label" -> matrix))
```

Under the hood, Vector and Matrix are encoded using `org.apache.spark.ml.linalg.VectorUDT` 
and `org.apache.spark.ml.linalg.MatrixUDT`. This is possible thanks to the implicit derivation 
from `org.apache.spark.sql.types.UserDefinedType[A]` to `TypedEncoder[A]` defined in `TypedEncoder` companion object.

```scala mdoc:invisible
spark.stop()
```


================================================
FILE: docs/WorkingWithCsvParquetJson.md
================================================
# Working with CSV and Parquet data

```scala mdoc:invisible:reset-object
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

val conf = new SparkConf().setMaster("local[*]").setAppName("Frameless repl").set("spark.ui.enabled", "false")
implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

import spark.implicits._

val testDataPath: String = "docs/iris.data"
```
You need these imports for most Frameless projects. 

```scala mdoc:silent
import frameless._
import frameless.syntax._
import frameless.functions.aggregate._
```

## Working with CSV 

We first load some CSV data and print the schema. 

```scala mdoc
val df = spark.read.format("csv").load(testDataPath)
df.show(2)
df.printSchema
```

The easiest way to read from CSV into a `TypedDataset` is to create a case class that follows 
the exact number, type, and order for the fields as they appear in the CSV file. This is shown in 
the example bellow with the use of the `Iris` case class.

```scala mdoc
final case class Iris(sLength: Double, sWidth: Double, pLength: Double, pWidth: Double, kind: String)
val testDataDf = spark.read.format("csv").schema(TypedExpressionEncoder[Iris].schema).load(testDataPath)
val data: TypedDataset[Iris] = TypedDataset.createUnsafe[Iris](testDataDf)
data.show(2).run()
```

If we do not explicitly define the schema of the CSV file then the types will not match leading to runtime errors. 

```scala mdoc:nest
val testDataNoSchema = spark.read.format("csv").load(testDataPath)
val data: TypedDataset[Iris] = TypedDataset.createUnsafe[Iris](testDataNoSchema)
```

```scala mdoc:crash
data.collect().run()
```

### Dealing with CSV files with multiple columns

When the dataset has many columns, it is impractical to define a case class that contains many columns we don't need. 
In such case, we can project the columns we do need, cast them to the proper type, and then call `createUnsafe` using a case class
that contains a much smaller subset of the columns.  

```scala mdoc:nest
import org.apache.spark.sql.types.DoubleType
final case class IrisLight(kind: String, sLength: Double)

val testDataDf = spark.read.format("csv").load(testDataPath)
val projectedDf = testDataDf.select(testDataDf("_c4").as("kind"), testDataDf("_c1").cast(DoubleType).as("sLength"))
val data = TypedDataset.createUnsafe[IrisLight](projectedDf)
data.take(2).run()
```

```scala mdoc:invisible
spark.stop()
```

## Working with Parquet
```scala mdoc:invisible:reset-object
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession

val conf = new SparkConf().setMaster("local[*]").setAppName("Frameless repl").set("spark.ui.enabled", "false")
implicit val spark = SparkSession.builder().config(conf).appName("REPL").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

import spark.implicits._

val testDataPathParquet: String = "docs/iris.parquet"
import frameless._
import frameless.syntax._
import frameless.functions.aggregate._

final case class Iris(sLength: Double, sWidth: Double, pLength: Double, pWidth: Double, kind: String)
```

Spark is much better at reading the schema from parquet files. 

```scala mdoc
val testDataParquet = spark.read.format("parquet").load(testDataPathParquet)
testDataParquet.printSchema
```

So as long as we use a type (case class) that reflects the same number, type, and order of the fields 
from the data everything works as expected. 

```scala mdoc:nest
val data: TypedDataset[Iris] = TypedDataset.createUnsafe[Iris](testDataParquet)
data.take(2).run()
```

### Dealing with Parquet files with multiple columns

The main difference compared to CSV is that with Parquet Spark is better at inferring the types. This makes it simpler 
to project the columns we need without having the cast the to the proper type. 

```scala mdoc:nest
final case class IrisLight(kind: String, sLength: Double)

val projectedDf = testDataParquet.select("kind", "sLength")
val data = TypedDataset.createUnsafe[IrisLight](projectedDf)
data.take(2).run()
```

```scala mdoc:invisible
spark.stop()
```


================================================
FILE: docs/directory.conf
================================================
laika.title = frameless
laika.navigationOrder = [
  README.md
  FeatureOverview.md
  TypedDatasetVsSparkDataset.md
  WorkingWithCsvParquetJson.md
  Injection.md
  Job.md
  Cats.md
  TypedML.md
  TypedDataFrame.md
]

================================================
FILE: docs/iris.data
================================================
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.4,3.7,1.5,0.2,Iris-setosa
4.8,3.4,1.6,0.2,Iris-setosa
4.8,3.0,1.4,0.1,Iris-setosa
4.3,3.0,1.1,0.1,Iris-setosa
5.8,4.0,1.2,0.2,Iris-setosa
5.7,4.4,1.5,0.4,Iris-setosa
5.4,3.9,1.3,0.4,Iris-setosa
5.1,3.5,1.4,0.3,Iris-setosa
5.7,3.8,1.7,0.3,Iris-setosa
5.1,3.8,1.5,0.3,Iris-setosa
5.4,3.4,1.7,0.2,Iris-setosa
5.1,3.7,1.5,0.4,Iris-setosa
4.6,3.6,1.0,0.2,Iris-setosa
5.1,3.3,1.7,0.5,Iris-setosa
4.8,3.4,1.9,0.2,Iris-setosa
5.0,3.0,1.6,0.2,Iris-setosa
5.0,3.4,1.6,0.4,Iris-setosa
5.2,3.5,1.5,0.2,Iris-setosa
5.2,3.4,1.4,0.2,Iris-setosa
4.7,3.2,1.6,0.2,Iris-setosa
4.8,3.1,1.6,0.2,Iris-setosa
5.4,3.4,1.5,0.4,Iris-setosa
5.2,4.1,1.5,0.1,Iris-setosa
5.5,4.2,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.0,3.2,1.2,0.2,Iris-setosa
5.5,3.5,1.3,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
4.4,3.0,1.3,0.2,Iris-setosa
5.1,3.4,1.5,0.2,Iris-setosa
5.0,3.5,1.3,0.3,Iris-setosa
4.5,2.3,1.3,0.3,Iris-setosa
4.4,3.2,1.3,0.2,Iris-setosa
5.0,3.5,1.6,0.6,Iris-setosa
5.1,3.8,1.9,0.4,Iris-setosa
4.8,3.0,1.4,0.3,Iris-setosa
5.1,3.8,1.6,0.2,Iris-setosa
4.6,3.2,1.4,0.2,Iris-setosa
5.3,3.7,1.5,0.2,Iris-setosa
5.0,3.3,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
6.4,3.2,4.5,1.5,Iris-versicolor
6.9,3.1,4.9,1.5,Iris-versicolor
5.5,2.3,4.0,1.3,Iris-versicolor
6.5,2.8,4.6,1.5,Iris-versicolor
5.7,2.8,4.5,1.3,Iris-versicolor
6.3,3.3,4.7,1.6,Iris-versicolor
4.9,2.4,3.3,1.0,Iris-versicolor
6.6,2.9,4.6,1.3,Iris-versicolor
5.2,2.7,3.9,1.4,Iris-versicolor
5.0,2.0,3.5,1.0,Iris-versicolor
5.9,3.0,4.2,1.5,Iris-versicolor
6.0,2.2,4.0,1.0,Iris-versicolor
6.1,2.9,4.7,1.4,Iris-versicolor
5.6,2.9,3.6,1.3,Iris-versicolor
6.7,3.1,4.4,1.4,Iris-versicolor
5.6,3.0,4.5,1.5,Iris-versicolor
5.8,2.7,4.1,1.0,Iris-versicolor
6.2,2.2,4.5,1.5,Iris-versicolor
5.6,2.5,3.9,1.1,Iris-versicolor
5.9,3.2,4.8,1.8,Iris-versicolor
6.1,2.8,4.0,1.3,Iris-versicolor
6.3,2.5,4.9,1.5,Iris-versicolor
6.1,2.8,4.7,1.2,Iris-versicolor
6.4,2.9,4.3,1.3,Iris-versicolor
6.6,3.0,4.4,1.4,Iris-versicolor
6.8,2.8,4.8,1.4,Iris-versicolor
6.7,3.0,5.0,1.7,Iris-versicolor
6.0,2.9,4.5,1.5,Iris-versicolor
5.7,2.6,3.5,1.0,Iris-versicolor
5.5,2.4,3.8,1.1,Iris-versicolor
5.5,2.4,3.7,1.0,Iris-versicolor
5.8,2.7,3.9,1.2,Iris-versicolor
6.0,2.7,5.1,1.6,Iris-versicolor
5.4,3.0,4.5,1.5,Iris-versicolor
6.0,3.4,4.5,1.6,Iris-versicolor
6.7,3.1,4.7,1.5,Iris-versicolor
6.3,2.3,4.4,1.3,Iris-versicolor
5.6,3.0,4.1,1.3,Iris-versicolor
5.5,2.5,4.0,1.3,Iris-versicolor
5.5,2.6,4.4,1.2,Iris-versicolor
6.1,3.0,4.6,1.4,Iris-versicolor
5.8,2.6,4.0,1.2,Iris-versicolor
5.0,2.3,3.3,1.0,Iris-versicolor
5.6,2.7,4.2,1.3,Iris-versicolor
5.7,3.0,4.2,1.2,Iris-versicolor
5.7,2.9,4.2,1.3,Iris-versicolor
6.2,2.9,4.3,1.3,Iris-versicolor
5.1,2.5,3.0,1.1,Iris-versicolor
5.7,2.8,4.1,1.3,Iris-versicolor
6.3,3.3,6.0,2.5,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
7.1,3.0,5.9,2.1,Iris-virginica
6.3,2.9,5.6,1.8,Iris-virginica
6.5,3.0,5.8,2.2,Iris-virginica
7.6,3.0,6.6,2.1,Iris-virginica
4.9,2.5,4.5,1.7,Iris-virginica
7.3,2.9,6.3,1.8,Iris-virginica
6.7,2.5,5.8,1.8,Iris-virginica
7.2,3.6,6.1,2.5,Iris-virginica
6.5,3.2,5.1,2.0,Iris-virginica
6.4,2.7,5.3,1.9,Iris-virginica
6.8,3.0,5.5,2.1,Iris-virginica
5.7,2.5,5.0,2.0,Iris-virginica
5.8,2.8,5.1,2.4,Iris-virginica
6.4,3.2,5.3,2.3,Iris-virginica
6.5,3.0,5.5,1.8,Iris-virginica
7.7,3.8,6.7,2.2,Iris-virginica
7.7,2.6,6.9,2.3,Iris-virginica
6.0,2.2,5.0,1.5,Iris-virginica
6.9,3.2,5.7,2.3,Iris-virginica
5.6,2.8,4.9,2.0,Iris-virginica
7.7,2.8,6.7,2.0,Iris-virginica
6.3,2.7,4.9,1.8,Iris-virginica
6.7,3.3,5.7,2.1,Iris-virginica
7.2,3.2,6.0,1.8,Iris-virginica
6.2,2.8,4.8,1.8,Iris-virginica
6.1,3.0,4.9,1.8,Iris-virginica
6.4,2.8,5.6,2.1,Iris-virginica
7.2,3.0,5.8,1.6,Iris-virginica
7.4,2.8,6.1,1.9,Iris-virginica
7.9,3.8,6.4,2.0,Iris-virginica
6.4,2.8,5.6,2.2,Iris-virginica
6.3,2.8,5.1,1.5,Iris-virginica
6.1,2.6,5.6,1.4,Iris-virginica
7.7,3.0,6.1,2.3,Iris-virginica
6.3,3.4,5.6,2.4,Iris-virginica
6.4,3.1,5.5,1.8,Iris-virginica
6.0,3.0,4.8,1.8,Iris-virginica
6.9,3.1,5.4,2.1,Iris-virginica
6.7,3.1,5.6,2.4,Iris-virginica
6.9,3.1,5.1,2.3,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
6.8,3.2,5.9,2.3,Iris-virginica
6.7,3.3,5.7,2.5,Iris-virginica
6.7,3.0,5.2,2.3,Iris-virginica
6.3,2.5,5.0,1.9,Iris-virginica
6.5,3.0,5.2,2.0,Iris-virginica
6.2,3.4,5.4,2.3,Iris-virginica
5.9,3.0,5.1,1.8,Iris-virginica


================================================
FILE: github.sbt
================================================
ThisBuild / githubWorkflowArtifactUpload := false // doesn't work with scoverage

ThisBuild / githubWorkflowEnv += "SPARK_LOCAL_IP" -> "localhost"

ThisBuild / githubWorkflowArtifactDownloadExtraKeys += "project"

ThisBuild / githubWorkflowBuildSbtStepPreamble += s"project $${{ matrix.project }}"
ThisBuild / tlCiScalafmtCheck := true
ThisBuild / githubWorkflowBuild ~= { steps =>
  steps.map { // replace the test step
    case step: WorkflowStep.Sbt if step.commands == List("test") =>
      WorkflowStep.Sbt(
        commands = List("coverage", "test", "test/coverageReport"),
        name = Some("Test & Compute Coverage")
      )
    case step => step
  }
}

ThisBuild / githubWorkflowBuildPostamble +=
  WorkflowStep.Use(
    UseRef.Public(
      "codecov",
      "codecov-action",
      "v3"
    ),
    params = Map("flags" -> s"$${{ matrix.scala }}-$${{ matrix.project }}")
  )


================================================
FILE: ml/src/main/scala/frameless/ml/TypedEstimator.scala
================================================
package frameless
package ml

import frameless.ops.SmartProject
import org.apache.spark.ml.{Estimator, Model}

/**
  * A TypedEstimator fits models to data.
  */
trait TypedEstimator[Inputs, Outputs, M <: Model[M]] {
  val estimator: Estimator[M]

  def fit[T, F[_]](ds: TypedDataset[T])(
    implicit
    smartProject: SmartProject[T, Inputs],
    F: SparkDelay[F]
  ): F[AppendTransformer[Inputs, Outputs, M]] = {
    implicit val sparkSession = ds.dataset.sparkSession
    F.delay {
      val inputDs = smartProject.apply(ds)
      val model = estimator.fit(inputDs.dataset)
      new AppendTransformer[Inputs, Outputs, M] {
        val transformer: M = model
      }
    }
  }
}


================================================
FILE: ml/src/main/scala/frameless/ml/TypedTransformer.scala
================================================
package frameless
package ml

import frameless.ops.SmartProject
import org.apache.spark.ml.Transformer
import shapeless.{Generic, HList}
import shapeless.ops.hlist.{Prepend, Tupler}

/**
  * A TypedTransformer transforms one TypedDataset into another.
  */
sealed trait TypedTransformer

/**
  * An AppendTransformer `transform` method takes as input a TypedDataset containing `Inputs` and
  * return a TypedDataset with `Outputs` columns appended to the input TypedDataset.
  */
trait AppendTransformer[Inputs, Outputs, InnerTransformer <: Transformer] extends TypedTransformer {
  val transformer: InnerTransformer

  def transform[T, TVals <: HList, OutputsVals <: HList, OutVals <: HList, Out](ds: TypedDataset[T])(
    implicit
    i0: SmartProject[T, Inputs],
    i1: Generic.Aux[T, TVals],
    i2: Generic.Aux[Outputs, OutputsVals],
    i3: Prepend.Aux[TVals, OutputsVals, OutVals],
    i4: Tupler.Aux[OutVals, Out],
    i5: TypedEncoder[Out]
  ): TypedDataset[Out] = {
    val transformed = transformer.transform(ds.dataset).as[Out](TypedExpressionEncoder[Out])
    TypedDataset.create[Out](transformed)
  }

}

object AppendTransformer {
  // Random name to a temp column added by a TypedTransformer (the proper name will be given by the Tuple-based encoder)
  private[ml] val tempColumnName = "I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMI"
  private[ml] val tempColumnName2 = "I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMJ"
  private[ml] val tempColumnName3 = "I1X3T9CU1OP0128JYIO76TYZZA3AXHQ18RMK"
}


================================================
FILE: ml/src/main/scala/frameless/ml/classification/TypedRandomForestClassifier.scala
================================================
package frameless
package ml
package classification

import frameless.ml.internals.TreesInputsChecker
import frameless.ml.params.trees.FeatureSubsetStrategy
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.linalg.Vector

/**
  * <a href="http://en.wikipedia.org/wiki/Random_forest">Random Forest</a> learning algorithm for
  * classification.
  * It supports both binary and multiclass labels, as well as both continuous and categorical
  * features.
  */
final class TypedRandomForestClassifier[Inputs] private[ml](
  rf: RandomForestClassifier,
  labelCol: String,
  featuresCol: String
) extends TypedEstimator[Inputs, TypedRandomForestClassifier.Outputs, RandomForestClassificationModel] {

  val estimator: RandomForestClassifier =
    rf
      .setLabelCol(labelCol)
      .setFeaturesCol(featuresCol)
      .setPredictionCol(AppendTransformer.tempColumnName)
      .setRawPredictionCol(AppendTransformer.tempColumnName2)
      .setProbabilityCol(AppendTransformer.tempColumnName3)

  def setNumTrees(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setNumTrees(value))
  def setMaxDepth(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setMaxDepth(value))
  def setMinInfoGain(value: Double): TypedRandomForestClassifier[Inputs] = copy(rf.setMinInfoGain(value))
  def setMinInstancesPerNode(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setMinInstancesPerNode(value))
  def setMaxMemoryInMB(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setMaxMemoryInMB(value))
  def setSubsamplingRate(value: Double): TypedRandomForestClassifier[Inputs] = copy(rf.setSubsamplingRate(value))
  def setFeatureSubsetStrategy(value: FeatureSubsetStrategy): TypedRandomForestClassifier[Inputs] =
    copy(rf.setFeatureSubsetStrategy(value.sparkValue))
  def setMaxBins(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setMaxBins(value))

  private def copy(newRf: RandomForestClassifier): TypedRandomForestClassifier[Inputs] =
    new TypedRandomForestClassifier[Inputs](newRf, labelCol, featuresCol)
}

object TypedRandomForestClassifier {
  case class Outputs(rawPrediction: Vector, probability: Vector, prediction: Double)

  def apply[Inputs](implicit inputsChecker: TreesInputsChecker[Inputs]): TypedRandomForestClassifier[Inputs] = {
    new TypedRandomForestClassifier(new RandomForestClassifier(), inputsChecker.labelCol, inputsChecker.featuresCol)
  }
}


================================================
FILE: ml/src/main/scala/frameless/ml/clustering/TypedBisectingKMeans.scala
================================================
package frameless
package ml
package classification

import frameless.ml.internals.VectorInputsChecker
import org.apache.spark.ml.clustering.{BisectingKMeans, BisectingKMeansModel}

/**
  * A bisecting k-means algorithm based on the paper "A comparison of document clustering techniques"
  * by Steinbach, Karypis, and Kumar, with modification to fit Spark.
  * The algorithm starts from a single cluster that contains all points.
  * Iteratively it finds divisible clusters on the bottom level and bisects each of them using
  * k-means, until there are `k` leaf clusters in total or no leaf clusters are divisible.
  * The bisecting steps of clusters on the same level are grouped together to increase parallelism.
  * If bisecting all divisible clusters on the bottom level would result more than `k` leaf clusters,
  * larger clusters get higher priority.
  *
  * @see <a href="http://glaros.dtc.umn.edu/gkhome/fetch/papers/docclusterKDDTMW00.pdf">
  * Steinbach, Karypis, and Kumar, A comparison of document clustering techniques,
  * KDD Workshop on Text Mining, 2000.</a>
  */
class TypedBisectingKMeans[Inputs] private[ml] (
  bkm: BisectingKMeans,
  featuresCol: String
) extends TypedEstimator[Inputs,TypedBisectingKMeans.Output, BisectingKMeansModel]{
  val estimator: BisectingKMeans =
    bkm
    .setFeaturesCol(featuresCol)
    .setPredictionCol(AppendTransformer.tempColumnName)
  
  def setK(value: Int): TypedBisectingKMeans[Inputs] = copy(bkm.setK(value))
  
  def setMaxIter(value: Int): TypedBisectingKMeans[Inputs] = copy(bkm.setMaxIter(value))

  def setMinDivisibleClusterSize(value: Double): TypedBisectingKMeans[Inputs] =
    copy(bkm.setMinDivisibleClusterSize(value))
  
  def setSeed(value: Long): TypedBisectingKMeans[Inputs] = copy(bkm.setSeed(value))

  private def copy(newBkm: BisectingKMeans): TypedBisectingKMeans[Inputs] =
    new TypedBisectingKMeans[Inputs](newBkm, featuresCol)
}

object TypedBisectingKMeans {
  case class Output(prediction: Int)

  def apply[Inputs]()(implicit inputsChecker: VectorInputsChecker[Inputs]): TypedBisectingKMeans[Inputs] =
    new TypedBisectingKMeans(new BisectingKMeans(), inputsChecker.featuresCol)
}

================================================
FILE: ml/src/main/scala/frameless/ml/clustering/TypedKMeans.scala
================================================
package frameless
package ml
package classification

import frameless.ml.internals.VectorInputsChecker
import frameless.ml.params.kmeans.KMeansInitMode
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}

/**
  * K-means clustering with support for k-means|| initialization proposed by Bahmani et al.
  *
  * @see <a href="http://dx.doi.org/10.14778/2180912.2180915">Bahmani et al., Scalable k-means++.</a>
  */
class TypedKMeans[Inputs] private[ml] (
  km: KMeans,
  featuresCol: String
) extends TypedEstimator[Inputs,TypedKMeans.Output,KMeansModel] {
  val estimator: KMeans =
    km
      .setFeaturesCol(featuresCol)
      .setPredictionCol(AppendTransformer.tempColumnName)

  def setK(value: Int): TypedKMeans[Inputs] = copy(km.setK(value))

  def setInitMode(value: KMeansInitMode): TypedKMeans[Inputs] = copy(km.setInitMode(value.sparkValue))

  def setInitSteps(value: Int): TypedKMeans[Inputs] = copy(km.setInitSteps(value))

  def setMaxIter(value: Int): TypedKMeans[Inputs] = copy(km.setMaxIter(value))

  def setTol(value: Double): TypedKMeans[Inputs] = copy(km.setTol(value))

  def setSeed(value: Long): TypedKMeans[Inputs] = copy(km.setSeed(value))

  private def copy(newKmeans: KMeans): TypedKMeans[Inputs] = new TypedKMeans[Inputs](newKmeans, featuresCol)

}

object TypedKMeans{
  case class Output(prediction: Int)

  def apply[Inputs](implicit inputsChecker: VectorInputsChecker[Inputs]): TypedKMeans[Inputs] = {
    new TypedKMeans(new KMeans(), inputsChecker.featuresCol)
  }
}


================================================
FILE: ml/src/main/scala/frameless/ml/feature/TypedIndexToString.scala
================================================
package frameless
package ml
package feature

import frameless.ml.internals.UnaryInputsChecker
import org.apache.spark.ml.feature.IndexToString

/**
  * A `TypedTransformer` that maps a column of indices back to a new column of corresponding
  * string values.
  * The index-string mapping must be supplied when creating the `TypedIndexToString`.
  *
  * @see `TypedStringIndexer` for converting strings into indices
  */
final class TypedIndexToString[Inputs] private[ml](indexToString: IndexToString, inputCol: String)
  extends AppendTransformer[Inputs, TypedIndexToString.Outputs, IndexToString] {

  val transformer: IndexToString =
    indexToString
      .setInputCol(inputCol)
      .setOutputCol(AppendTransformer.tempColumnName)

}

object TypedIndexToString {
  case class Outputs(originalOutput: String)

  def apply[Inputs](labels: Array[String])
                    (implicit inputsChecker: UnaryInputsChecker[Inputs, Double]): TypedIndexToString[Inputs] = {
    new TypedIndexToString[Inputs](new IndexToString().setLabels(labels), inputsChecker.inputCol)
  }
}

================================================
FILE: ml/src/main/scala/frameless/ml/feature/TypedStringIndexer.scala
================================================
package frameless
package ml
package feature

import frameless.ml.feature.TypedStringIndexer.HandleInvalid
import frameless.ml.internals.UnaryInputsChecker
import org.apache.spark.ml.feature.{StringIndexer, StringIndexerModel}

/**
  * A label indexer that maps a string column of labels to an ML column of label indices.
  * The indices are in [0, numLabels), ordered by label frequencies.
  * So the most frequent label gets index 0.
  *
  * @see `TypedIndexToString` for the inverse transformation
  */
final class TypedStringIndexer[Inputs] private[ml](stringIndexer: StringIndexer, inputCol: String)
  extends TypedEstimator[Inputs, TypedStringIndexer.Outputs, StringIndexerModel] {

  val estimator: StringIndexer = stringIndexer
    .setInputCol(inputCol)
    .setOutputCol(AppendTransformer.tempColumnName)

  def setHandleInvalid(value: HandleInvalid): TypedStringIndexer[Inputs] = copy(stringIndexer.setHandleInvalid(value.sparkValue))

  private def copy(newStringIndexer: StringIndexer): TypedStringIndexer[Inputs] =
    new TypedStringIndexer[Inputs](newStringIndexer, inputCol)
}

object TypedStringIndexer {
  case class Outputs(indexedOutput: Double)

  sealed abstract class HandleInvalid(val sparkValue: String)
  object HandleInvalid {
    case object Error extends HandleInvalid("error")
    case object Skip extends HandleInvalid("skip")
    case object Keep extends HandleInvalid("keep")
  }

  def apply[Inputs](implicit inputsChecker: UnaryInputsChecker[Inputs, String]): TypedStringIndexer[Inputs] = {
    new TypedStringIndexer[Inputs](new StringIndexer(), inputsChecker.inputCol)
  }
}

================================================
FILE: ml/src/main/scala/frameless/ml/feature/TypedVectorAssembler.scala
================================================
package frameless
package ml
package feature

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vector
import shapeless.{HList, HNil, LabelledGeneric}
import shapeless.ops.hlist.ToTraversable
import shapeless.ops.record.{Keys, Values}
import shapeless._
import scala.annotation.implicitNotFound

/**
  * A feature transformer that merges multiple columns into a vector column.
  */
final class TypedVectorAssembler[Inputs] private[ml](vectorAssembler: VectorAssembler, inputCols: Array[String])
  extends AppendTransformer[Inputs, TypedVectorAssembler.Output, VectorAssembler] {

  val transformer: VectorAssembler = vectorAssembler
    .setInputCols(inputCols)
    .setOutputCol(AppendTransformer.tempColumnName)

}

object TypedVectorAssembler {
  case class Output(vector: Vector)

  def apply[Inputs](implicit inputsChecker: TypedVectorAssemblerInputsChecker[Inputs]): TypedVectorAssembler[Inputs] = {
    new TypedVectorAssembler(new VectorAssembler(), inputsChecker.inputCols.toArray)
  }
}

@implicitNotFound(
  msg = "Cannot prove that ${Inputs} is a valid input type. Input type must only contain fields of numeric or boolean types."
)
private[ml] trait TypedVectorAssemblerInputsChecker[Inputs] {
  val inputCols: Seq[String]
}

private[ml] object TypedVectorAssemblerInputsChecker {
  implicit def checkInputs[Inputs, InputsRec <: HList, InputsKeys <: HList, InputsVals <: HList](
    implicit
    inputsGen: LabelledGeneric.Aux[Inputs, InputsRec],
    inputsKeys: Keys.Aux[InputsRec, InputsKeys],
    inputsKeysTraverse: ToTraversable.Aux[InputsKeys, Seq, Symbol],
    inputsValues: Values.Aux[InputsRec, InputsVals],
    inputsTypeCheck: TypedVectorAssemblerInputsValueChecker[InputsVals]
  ): TypedVectorAssemblerInputsChecker[Inputs] = new TypedVectorAssemblerInputsChecker[Inputs] {
    val inputCols: Seq[String] = inputsKeys.apply().to[Seq].map(_.name)
  }
}

private[ml] trait TypedVectorAssemblerInputsValueChecker[InputsVals]

private[ml] object TypedVectorAssemblerInputsValueChecker {
  implicit def hnilCheckInputsValue: TypedVectorAssemblerInputsValueChecker[HNil] =
    new TypedVectorAssemblerInputsValueChecker[HNil] {}

  implicit def hlistCheckInputsValueNumeric[H, T <: HList](
    implicit ch: CatalystNumeric[H],
    tt: TypedVectorAssemblerInputsValueChecker[T]
  ): TypedVectorAssemblerInputsValueChecker[H :: T] = new TypedVectorAssemblerInputsValueChecker[H :: T] {}

  implicit def hlistCheckInputsValueBoolean[T <: HList](
    implicit tt: TypedVectorAssemblerInputsValueChecker[T]
  ): TypedVectorAssemblerInputsValueChecker[Boolean :: T] = new TypedVectorAssemblerInputsValueChecker[Boolean :: T] {}
}


================================================
FILE: ml/src/main/scala/frameless/ml/internals/LinearInputsChecker.scala
================================================
package frameless
package ml
package internals

import org.apache.spark.ml.linalg._
import shapeless.ops.hlist.Length
import shapeless.{HList, LabelledGeneric, Nat, Witness}

import scala.annotation.implicitNotFound

/**
  * Can be used for linear reg algorithm
  */
@implicitNotFound(
  msg = "Cannot prove that ${Inputs} is a valid input type. " +
    "Input type must only contain a field of type Double (the label) and a field of type " +
    "org.apache.spark.ml.linalg.Vector (the features) and optional field of float type (weight)."
)
trait LinearInputsChecker[Inputs] {
  val featuresCol: String
  val labelCol: String
  val weightCol: Option[String]
}

object LinearInputsChecker {

  implicit def checkLinearInputs[
  Inputs,
  InputsRec <: HList,
  LabelK <: Symbol,
  FeaturesK <: Symbol](
    implicit
    i0: LabelledGeneric.Aux[Inputs, InputsRec],
    i1: Length.Aux[InputsRec, Nat._2],
    i2: SelectorByValue.Aux[InputsRec, Double, LabelK],
    i3: Witness.Aux[LabelK],
    i4: SelectorByValue.Aux[InputsRec, Vector, FeaturesK],
    i5: Witness.Aux[FeaturesK]
  ): LinearInputsChecker[Inputs] = {
    new LinearInputsChecker[Inputs] {
      val labelCol: String = implicitly[Witness.Aux[LabelK]].value.name
      val featuresCol: String = implicitly[Witness.Aux[FeaturesK]].value.name
      val weightCol: Option[String] = None
    }
  }

  implicit def checkLinearInputs2[
  Inputs,
  InputsRec <: HList,
  LabelK <: Symbol,
  FeaturesK <: Symbol,
  WeightK <: Symbol](
    implicit
    i0: LabelledGeneric.Aux[Inputs, InputsRec],
    i1: Length.Aux[InputsRec, Nat._3],
    i2: SelectorByValue.Aux[InputsRec, Vector, FeaturesK],
    i3: Witness.Aux[FeaturesK],
    i4: SelectorByValue.Aux[InputsRec, Double, LabelK],
    i5: Witness.Aux[LabelK],
    i6: SelectorByValue.Aux[InputsRec, Float, WeightK],
    i7: Witness.Aux[WeightK]
  ): LinearInputsChecker[Inputs] = {
    new LinearInputsChecker[Inputs] {
      val labelCol: String = implicitly[Witness.Aux[LabelK]].value.name
      val featuresCol: String = implicitly[Witness.Aux[FeaturesK]].value.name
      val weightCol: Option[String] = Some(implicitly[Witness.Aux[WeightK]].value.name)
    }
  }

}


================================================
FILE: ml/src/main/scala/frameless/ml/internals/SelectorByValue.scala
================================================
package frameless
package ml
package internals

import shapeless.labelled.FieldType
import shapeless.{::, DepFn1, HList, Witness}

/**
  * Typeclass supporting record selection by value type (returning the first key whose value is of type `Value`)
  */
trait SelectorByValue[L <: HList, Value] extends DepFn1[L] with Serializable { type Out <: Symbol }

object SelectorByValue {
  type Aux[L <: HList, Value, Out0 <: Symbol] = SelectorByValue[L, Value] { type Out = Out0 }

  implicit def select[K <: Symbol, T <: HList, Value](implicit wk: Witness.Aux[K]): Aux[FieldType[K, Value] :: T, Value, K] = {
    new SelectorByValue[FieldType[K, Value] :: T, Value] {
      type Out = K
      def apply(l: FieldType[K, Value] :: T): Out = wk.value
    }
  }

  implicit def recurse[H, T <: HList, Value](implicit st: SelectorByValue[T, Value]): Aux[H :: T, Value, st.Out] = {
    new SelectorByValue[H :: T, Value] {
      type Out = st.Out
      def apply(l: H :: T): Out = st(l.tail)
    }
  }
}


================================================
FILE: ml/src/main/scala/frameless/ml/internals/TreesInputsChecker.scala
================================================
package frameless
package ml
package internals

import shapeless.ops.hlist.Length
import shapeless.{HList, LabelledGeneric, Nat, Witness}
import org.apache.spark.ml.linalg._

import scala.annotation.implicitNotFound

/**
  * Can be used for all tree-based ML algorithm (decision tree, random forest, gradient-boosted trees)
  */
@implicitNotFound(
  msg = "Cannot prove that ${Inputs} is a valid input type. " +
    "Input type must only contain a field of type Double (the label) and a field of type " +
    "org.apache.spark.ml.linalg.Vector (the features)."
)
trait TreesInputsChecker[Inputs] {
  val featuresCol: String
  val labelCol: String
}

object TreesInputsChecker {

  implicit def checkTreesInputs[
  Inputs,
  InputsRec <: HList,
  LabelK <: Symbol,
  FeaturesK <: Symbol](
    implicit
    i0: LabelledGeneric.Aux[Inputs, InputsRec],
    i1: Length.Aux[InputsRec, Nat._2],
    i2: SelectorByValue.Aux[InputsRec, Double, LabelK],
    i3: Witness.Aux[LabelK],
    i4: SelectorByValue.Aux[InputsRec, Vector, FeaturesK],
    i5: Witness.Aux[FeaturesK]
  ): TreesInputsChecker[Inputs] = {
    new TreesInputsChecker[Inputs] {
      val labelCol: String = implicitly[Witness.Aux[LabelK]].value.name
      val featuresCol: String = implicitly[Witness.Aux[FeaturesK]].value.name
    }
  }

}


================================================
FILE: ml/src/main/scala/frameless/ml/internals/UnaryInputsChecker.scala
================================================
package frameless
package ml
package internals

import shapeless.ops.hlist.Length
import shapeless.{HList, LabelledGeneric, Nat, Witness}

import scala.annotation.implicitNotFound

/**
  * Can be used for all unary transformers (i.e almost all of them)
  */
@implicitNotFound(
  msg = "Cannot prove that ${Inputs} is a valid input type. Input type must have only one field of type ${Expected}"
)
trait UnaryInputsChecker[Inputs, Expected] {
  val inputCol: String
}

object UnaryInputsChecker {

  implicit def checkUnaryInputs[Inputs, Expected, InputsRec <: HList, InputK <: Symbol](
    implicit
    i0: LabelledGeneric.Aux[Inputs, InputsRec],
    i1: Length.Aux[InputsRec, Nat._1],
    i2: SelectorByValue.Aux[InputsRec, Expected, InputK],
    i3: Witness.Aux[InputK]
  ): UnaryInputsChecker[Inputs, Expected] = new UnaryInputsChecker[Inputs, Expected] {
    val inputCol: String = implicitly[Witness.Aux[InputK]].value.name
  }

}


================================================
FILE: ml/src/main/scala/frameless/ml/internals/VectorInputsChecker.scala
================================================
package frameless
package ml
package internals

import shapeless.ops.hlist.Length
import shapeless.{HList, LabelledGeneric, Nat, Witness}

import scala.annotation.implicitNotFound
import org.apache.spark.ml.linalg.Vector

/** Can be used whenever algorithm requires only vector */
@implicitNotFound(
  msg = "Cannot prove that ${Inputs} is a valid input type. " +
    "Input type must only contain a field of type org.apache.spark.ml.linalg.Vector (the features)."
)
trait VectorInputsChecker[Inputs] {
  val featuresCol: String
}

object VectorInputsChecker {
  implicit def checkVectorInput[Inputs, InputsRec <: HList, FeaturesK <: Symbol](
    implicit
      i0: LabelledGeneric.Aux[Inputs, InputsRec],
      i1: Length.Aux[InputsRec, Nat._1],
      i2: SelectorByValue.Aux[InputsRec, Vector, FeaturesK],
      i3: Witness.Aux[FeaturesK]
    ): VectorInputsChecker[Inputs] = {
      new VectorInputsChecker[Inputs] {
        val featuresCol: String = i3.value.name
      }
    }
}


================================================
FILE: ml/src/main/scala/frameless/ml/package.scala
================================================
package frameless

import org.apache.spark.sql.FramelessInternals.UserDefinedType
import org.apache.spark.ml.FramelessInternals
import org.apache.spark.ml.linalg.{Matrix, Vector}

package object ml {

  implicit val mlVectorUdt: UserDefinedType[Vector] = FramelessInternals.vectorUdt

  implicit val mlMatrixUdt: UserDefinedType[Matrix] = FramelessInternals.matrixUdt

}


================================================
FILE: ml/src/main/scala/frameless/ml/params/kmeans/KMeansInitMode.scala
================================================
package frameless
package ml
package params
package kmeans

/**
  * Param for the initialization algorithm.
  * This can be either "random" to choose random points as
  * initial cluster centers, or "k-means||" to use a parallel variant of k-means++
  * (Bahmani et al., Scalable K-Means++, VLDB 2012).
  * Default: k-means||.
  */

sealed abstract class KMeansInitMode private[ml](val sparkValue: String)

object KMeansInitMode {
  case object Random extends KMeansInitMode("random")
  case object KMeansPlusPlus extends KMeansInitMode("k-means||")
}


================================================
FILE: ml/src/main/scala/frameless/ml/params/linears/LossStrategy.scala
================================================
package frameless
package ml
package params
package linears
/**
  * <a href="https://en.wikipedia.org/wiki/Mean_squared_error">SquaredError</a>  measures the average of the squares of the errors—that is,
  * the average squared difference between the estimated values and what is estimated.
  *
  * <a href="https://en.wikipedia.org/wiki/Huber_loss">Huber Loss</a>  loss function less sensitive to outliers in data than the
  * squared error loss
  */
sealed abstract class LossStrategy private[ml](val sparkValue: String)
object LossStrategy {
  case object SquaredError extends LossStrategy("squaredError")
  case object Huber        extends LossStrategy("huber")
}


================================================
FILE: ml/src/main/scala/frameless/ml/params/linears/Solver.scala
================================================
package frameless
package ml
package params
package linears

/**
  * solver algorithm used for optimization.
  *  - "l-bfgs" denotes Limited-memory BFGS which is a limited-memory quasi-Newton
  *    optimization method.
  *  - "normal" denotes using Normal Equation as an analytical solution to the linear regression
  *    problem.  This solver is limited to `LinearRegression.MAX_FEATURES_FOR_NORMAL_SOLVER`.
  *  - "auto" (default) means that the solver algorithm is selected automatically.
  *    The Normal Equations solver will be used when possible, but this will automatically fall
  *    back to iterative optimization methods when needed.
  *
  *    spark
  */

sealed abstract class Solver private[ml](val sparkValue: String)
object Solver {
  case object LBFGS   extends Solver("l-bfgs")
  case object Auto    extends Solver("auto")
  case object Normal  extends Solver("normal")
}


================================================
FILE: ml/src/main/scala/frameless/ml/params/trees/FeatureSubsetStrategy.scala
================================================
package frameless
package ml
package params
package trees
/**
  * The number of features to consider for splits at each tree node.
  * Supported options:
  *  - Auto: Choose automatically for task:
  *            If numTrees == 1, set to All
  *            If numTrees > 1 (forest), set to Sqrt for classification and
  *              to OneThird for regression.
  *  - All: use all features
  *  - OneThird: use 1/3 of the features
  *  - Sqrt: use sqrt(number of features)
  *  - Log2: use log2(number of features)
  *  - Ratio: use (ratio * number of features) features
  *  - NumberOfFeatures: use numberOfFeatures features.
  * (default = Auto)
  *
  * These various settings are based on the following references:
  *  - log2: tested in Breiman (2001)
  *  - sqrt: recommended by Breiman manual for random forests
  *  - The defaults of sqrt (classification) and onethird (regression) match the R randomForest
  *    package.
  *
  * @see <a href="http://www.stat.berkeley.edu/~breiman/randomforest2001.pdf">Breiman (2001)</a>
  * @see <a href="http://www.stat.berkeley.edu/~breiman/Using_random_forests_V3.1.pdf">
  * Breiman manual for random forests</a>
  */
sealed abstract class FeatureSubsetStrategy private[ml](val sparkValue: String)
object FeatureSubsetStrategy {
  case object Auto extends FeatureSubsetStrategy("auto")
  case object All extends FeatureSubsetStrategy("all")
  case object OneThird extends FeatureSubsetStrategy("onethird")
  case object Sqrt extends FeatureSubsetStrategy("sqrt")
  case object Log2 extends FeatureSubsetStrategy("log2")
  case class Ratio(value: Double) extends FeatureSubsetStrategy(value.toString)
  case class NumberOfFeatures(value: Int) extends FeatureSubsetStrategy(value.toString)
}

================================================
FILE: ml/src/main/scala/frameless/ml/regression/TypedLinearRegression.scala
================================================
package frameless
package ml
package regression

import frameless.ml.internals.LinearInputsChecker
import frameless.ml.params.linears.{LossStrategy, Solver}
import frameless.ml.{AppendTransformer, TypedEstimator}
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}

/**
  * <a href="https://en.wikipedia.org/wiki/Linear_regression">Linear Regression</a>  linear approach to modelling the relationship
  * between a scalar response (or dependent variable) and one or more explanatory variables
  */
final class TypedLinearRegression [Inputs] private[ml](
  lr: LinearRegression,
  labelCol: String,
  featuresCol: String,
  weightCol: Option[String]
) extends TypedEstimator[Inputs, TypedLinearRegression.Outputs, LinearRegressionModel] {

  val estimatorWithoutWeight : LinearRegression = lr
    .setLabelCol(labelCol)
    .setFeaturesCol(featuresCol)
    .setPredictionCol(AppendTransformer.tempColumnName)

  val estimator = if (weightCol.isDefined) estimatorWithoutWeight.setWeightCol(weightCol.get) else estimatorWithoutWeight

  def setRegParam(value: Double):           TypedLinearRegression[Inputs] = copy(lr.setRegParam(value))
  def setFitIntercept(value: Boolean):      TypedLinearRegression[Inputs] = copy(lr.setFitIntercept(value))
  def setStandardization(value: Boolean):   TypedLinearRegression[Inputs] = copy(lr.setStandardization(value))
  def setElasticNetParam(value: Double):    TypedLinearRegression[Inputs] = copy(lr.setElasticNetParam(value))
  def setMaxIter(value: Int):               TypedLinearRegression[Inputs] = copy(lr.setMaxIter(value))
  def setTol(value: Double):                TypedLinearRegression[Inputs] = copy(lr.setTol(value))
  def setSolver(value: Solver):             TypedLinearRegression[Inputs] = copy(lr.setSolver(value.sparkValue))
  def setAggregationDepth(value: Int):      TypedLinearRegression[Inputs] = copy(lr.setAggregationDepth(value))
  def setLoss(value: LossStrategy):         TypedLinearRegression[Inputs] = copy(lr.setLoss(value.sparkValue))
  def setEpsilon(value: Double):            TypedLinearRegression[Inputs] = copy(lr.setEpsilon(value))

  private def copy(newLr: LinearRegression): TypedLinearRegression[Inputs] =
    new TypedLinearRegression[Inputs](newLr, labelCol, featuresCol, weightCol)

}

object TypedLinearRegression {
  case class Outputs(prediction: Double)
  case class Weight(weight: Double)


  def apply[Inputs](implicit inputsChecker: LinearInputsChecker[Inputs]): TypedLinearRegression[Inputs] = {
    new TypedLinearRegression(new LinearRegression(), inputsChecker.labelCol, inputsChecker.featuresCol, inputsChecker.weightCol)
  }
}

================================================
FILE: ml/src/main/scala/frameless/ml/regression/TypedRandomForestRegressor.scala
================================================
package frameless
package ml
package regression

import frameless.ml.internals.TreesInputsChecker
import frameless.ml.params.trees.FeatureSubsetStrategy
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}

/**
  * <a href="http://en.wikipedia.org/wiki/Random_forest">Random Forest</a>
  * learning algorithm for regression.
  * It supports both continuous and categorical features.
  */
final class TypedRandomForestRegressor[Inputs] private[ml](
  rf: RandomForestRegressor,
  labelCol: String,
  featuresCol: String
) extends TypedEstimator[Inputs, TypedRandomForestRegressor.Outputs, RandomForestRegressionModel] {

  val estimator: RandomForestRegressor =
    rf
      .setLabelCol(labelCol)
      .setFeaturesCol(featuresCol)
      .setPredictionCol(AppendTransformer.tempColumnName)

  def setNumTrees(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setNumTrees(value))
  def setMaxDepth(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setMaxDepth(value))
  def setMinInfoGain(value: Double): TypedRandomForestRegressor[Inputs] = copy(rf.setMinInfoGain(value))
  def setMinInstancesPerNode(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setMinInstancesPerNode(value))
  def setMaxMemoryInMB(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setMaxMemoryInMB(value))
  def setSubsamplingRate(value: Double): TypedRandomForestRegressor[Inputs] = copy(rf.setSubsamplingRate(value))
  def setFeatureSubsetStrategy(value: FeatureSubsetStrategy): TypedRandomForestRegressor[Inputs] =
    copy(rf.setFeatureSubsetStrategy(value.sparkValue))
  def setMaxBins(value: Int): TypedRandomForestRegressor[Inputs] = copy(rf.setMaxBins(value))

  private def copy(newRf: RandomForestRegressor): TypedRandomForestRegressor[Inputs] =
    new TypedRandomForestRegressor[Inputs](newRf, labelCol, featuresCol)
}

object TypedRandomForestRegressor {
  case class Outputs(prediction: Double)

  def apply[Inputs](implicit inputsChecker: TreesInputsChecker[Inputs])
  : TypedRandomForestRegressor[Inputs] = {
    new TypedRandomForestRegressor(new RandomForestRegressor(), inputsChecker.labelCol, inputsChecker.featuresCol)
  }
}

================================================
FILE: ml/src/main/scala/org/apache/spark/ml/FramelessInternals.scala
================================================
package org.apache.spark.ml

import org.apache.spark.ml.linalg.{MatrixUDT, VectorUDT}

object FramelessInternals {

  // because org.apache.spark.ml.linalg.VectorUDT is private[spark]
  val vectorUdt = new VectorUDT

  // because org.apache.spark.ml.linalg.MatrixUDT is private[spark]
  val matrixUdt = new MatrixUDT

}


================================================
FILE: ml/src/test/scala/frameless/ml/FramelessMlSuite.scala
================================================
package frameless
package ml

import org.scalactic.anyvals.PosZInt
import org.scalatest.BeforeAndAfterAll
import org.scalatestplus.scalacheck.Checkers
import org.scalatest.funsuite.AnyFunSuite

class FramelessMlSuite extends AnyFunSuite with Checkers with BeforeAndAfterAll with SparkTesting {
  // Limit size of generated collections and number of checks because Travis
  implicit override val generatorDrivenConfig =
    PropertyCheckConfiguration(sizeRange = PosZInt(10), minSize = PosZInt(10))
  implicit val sparkDelay: SparkDelay[Job] = Job.framelessSparkDelayForJob
}


================================================
FILE: ml/src/test/scala/frameless/ml/Generators.scala
================================================
package frameless
package ml

import frameless.ml.params.linears.{LossStrategy, Solver}
import frameless.ml.params.trees.FeatureSubsetStrategy
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}
import org.scalacheck.{Arbitrary, Gen}

object Generators {

  implicit val arbVector: Arbitrary[Vector] = Arbitrary {
    val genDenseVector = Gen.listOf(arbDouble.arbitrary).suchThat(_.nonEmpty).map(doubles => Vectors.dense(doubles.toArray))
    val genSparseVector = genDenseVector.map(_.toSparse)

    Gen.oneOf(genDenseVector, genSparseVector)
  }

  implicit val arbMatrix: Arbitrary[Matrix] = Arbitrary {
    Gen.sized { size =>
      for {
        nbRows <- Gen.choose(0, size)
        nbCols <- Gen.choose(1, size)
        matrix <- {
          Gen.listOfN(nbRows * nbCols, arbDouble.arbitrary)
            .map(values => Matrices.dense(nbRows, nbCols, values.toArray))
        }
      } yield matrix
    }
  }

  implicit val arbTreesFeaturesSubsetStrategy: Arbitrary[FeatureSubsetStrategy] = Arbitrary {
    val genRatio = Gen.choose(0D, 1D).suchThat(_ > 0D).map(FeatureSubsetStrategy.Ratio)
    val genNumberOfFeatures = Gen.choose(1, Int.MaxValue).map(FeatureSubsetStrategy.NumberOfFeatures)

    Gen.oneOf(Gen.const(FeatureSubsetStrategy.All),
      Gen.const(FeatureSubsetStrategy.All),
      Gen.const(FeatureSubsetStrategy.Log2),
      Gen.const(FeatureSubsetStrategy.OneThird),
      Gen.const(FeatureSubsetStrategy.Sqrt),
      genRatio,
      genNumberOfFeatures
    )
  }

  implicit val arbLossStrategy: Arbitrary[LossStrategy] = Arbitrary {
      Gen.const(LossStrategy.SquaredError)
  }

  implicit val arbSolver: Arbitrary[Solver] = Arbitrary {
    Gen.oneOf(
      Gen.const(Solver.LBFGS),
      Gen.const(Solver.Auto),
      Gen.const(Solver.Normal)
    )
  }

}


================================================
FILE: ml/src/test/scala/frameless/ml/TypedEncoderInstancesTests.scala
================================================
package frameless
package ml

import org.scalacheck.Prop._
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.regression.DecisionTreeRegressor
import Generators._
import scala.util.Random

class TypedEncoderInstancesTests extends FramelessMlSuite {

  test("Vector encoding is injective using collect()") {
    val prop = forAll { vector: Vector =>
      TypedDataset.create(Seq(vector)).collect().run() == Seq(vector)
    }
    check(prop)
  }

  test("Matrix encoding is injective using collect()") {
    val prop = forAll { matrix: Matrix =>
      TypedDataset.create(Seq(matrix)).collect().run() == Seq(matrix)
    }
    check(prop)
  }

  test("Vector is encoded as VectorUDT and thus can be run in a Spark ML model") {
    case class Input(features: Vector, label: Double)

    val prop = forAll { trainingData: Matrix =>
      (trainingData.numRows >= 1) ==> {
        val inputs = trainingData.rowIter.toVector.map(vector => Input(vector, 0D))
        val inputsDS = TypedDataset.create(inputs)

        val model = new DecisionTreeRegressor()

        // this line would throw a runtime exception if Vector was not encoded as VectorUDT
        val trainedModel = model.fit(inputsDS.dataset)

        val randomInput = inputs(Random.nextInt(inputs.length))
        val randomInputDS = TypedDataset.create(Seq(randomInput))

        val prediction = trainedModel.transform(randomInputDS.dataset)
          .select("prediction")
          .head()
          .getAs[Double](0)

        prediction == 0D
      }

    }

    check(prop, MinSize(1))
  }

}


================================================
FILE: ml/src/test/scala/frameless/ml/classification/ClassificationIntegrationTests.scala
================================================
package frameless
package ml
package classification

import frameless.ml.feature.{TypedIndexToString, TypedStringIndexer, TypedVectorAssembler}
import org.apache.spark.ml.linalg.Vector
import org.scalatest.matchers.must.Matchers

class ClassificationIntegrationTests extends FramelessMlSuite with Matchers {

  test("predict field3 from field1 and field2 using a RandomForestClassifier") {
    case class Data(field1: Double, field2: Int, field3: String)

    // Training

    val trainingDataDs = TypedDataset.create(Seq.fill(10)(Data(0D, 10, "foo")))

    case class Features(field1: Double, field2: Int)
    val vectorAssembler = TypedVectorAssembler[Features]

    case class DataWithFeatures(field1: Double, field2: Int, field3: String, features: Vector)
    val dataWithFeatures = vectorAssembler.transform(trainingDataDs).as[DataWithFeatures]()

    case class StringIndexerInput(field3: String)
    val indexer = TypedStringIndexer[StringIndexerInput]
    val indexerModel = indexer.fit(dataWithFeatures).run()

    case class IndexedDataWithFeatures(field1: Double, field2: Int, field3: String, features: Vector, indexedField3: Double)
    val indexedData = indexerModel.transform(dataWithFeatures).as[IndexedDataWithFeatures]()

    case class RFInputs(indexedField3: Double, features: Vector)
    val rf = TypedRandomForestClassifier[RFInputs]

    val model = rf.fit(indexedData).run()

    // Prediction

    val testData = TypedDataset.create(Seq(
      Data(0D, 10, "foo")
    ))
    val testDataWithFeatures = vectorAssembler.transform(testData).as[DataWithFeatures]()
    val indexedTestData = indexerModel.transform(testDataWithFeatures).as[IndexedDataWithFeatures]()

    case class PredictionInputs(features: Vector, indexedField3: Double)
    val testInput = indexedTestData.project[PredictionInputs]

    case class PredictionResultIndexed(
      features: Vector,
      indexedField3: Double,
      rawPrediction: Vector,
      probability: Vector,
      predictedField3Indexed: Double
    )
    val predictionDs = model.transform(testInput).as[PredictionResultIndexed]()

    case class IndexToStringInput(predictedField3Indexed: Double)
    val indexToString = TypedIndexToString[IndexToStringInput](indexerModel.transformer.labelsArray.flatten)

    case class PredictionResult(
      features: Vector,
      indexedField3: Double,
      rawPrediction: Vector,
      probability: Vector,
      predictedField3Indexed: Double,
      predictedField3: String
    )
    val stringPredictionDs = indexToString.transform(predictionDs).as[PredictionResult]()

    val prediction = stringPredictionDs.select(stringPredictionDs.col('predictedField3)).collect().run().toList

    prediction mustEqual List("foo")
  }

}


================================================
FILE: ml/src/test/scala/frameless/ml/classification/TypedRandomForestClassifierTests.scala
================================================
package frameless
package ml
package classification

import shapeless.test.illTyped
import org.apache.spark.ml.linalg._
import frameless.ml.params.trees.FeatureSubsetStrategy
import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Prop._
import org.scalatest.matchers.must.Matchers

class TypedRandomForestClassifierTests extends FramelessMlSuite with Matchers {
  implicit val arbDouble: Arbitrary[Double] =
    Arbitrary(Gen.choose(1, 99).map(_.toDouble)) // num classes must be between 0 and 100 for the test
  implicit val arbVectorNonEmpty: Arbitrary[Vector] =
    Arbitrary(Generators.arbVector.arbitrary suchThat (_.size > 0)) // vector must not be empty for RandomForestClassifier
  import Generators.arbTreesFeaturesSubsetStrategy

  test("fit() returns a correct TypedTransformer") {
    val prop = forAll { x2: X2[Double, Vector] =>
      val rf = TypedRandomForestClassifier[X2[Double, Vector]]
      val ds = TypedDataset.create(Seq(x2))
      val model = rf.fit(ds).run()
      val pDs = model.transform(ds).as[X5[Double, Vector, Vector, Vector, Double]]()

      pDs.select(pDs.col('a), pDs.col('b)).collect().run() == Seq(x2.a -> x2.b)
    }

    val prop2 = forAll { x2: X2[Vector, Double] =>
      val rf = TypedRandomForestClassifier[X2[Vector, Double]]
      val ds = TypedDataset.create(Seq(x2))
      val model = rf.fit(ds).run()
      val pDs = model.transform(ds).as[X5[Vector, Double, Vector, Vector, Double]]()

      pDs.select(pDs.col('a), pDs.col('b)).collect().run() == Seq(x2.a -> x2.b)
    }

    def prop3[A: TypedEncoder: Arbitrary] = forAll { x3: X3[Vector, Double, A] =>
      val rf = TypedRandomForestClassifier[X2[Vector, Double]]
      val ds = TypedDataset.create(Seq(x3))
      val model = rf.fit(ds).run()
      val pDs = model.transform(ds).as[X6[Vector, Double, A, Vector, Vector, Double]]()

      pDs.select(pDs.col('a), pDs.col('b), pDs.col('c)).collect().run() == Seq((x3.a, x3.b, x3.c))
    }

    check(prop)
    check(prop2)
    check(prop3[String])
    check(prop3[Double])
  }

  test("param setting is retained") {
    val prop = forAll { featureSubsetStrategy: FeatureSubsetStrategy =>
      val rf = TypedRandomForestClassifier[X2[Double, Vector]]
        .setNumTrees(10)
        .setMaxBins(100)
        .setFeatureSubsetStrategy(featureSubsetStrategy)
        .setMaxDepth(10)
        .setMaxMemoryInMB(100)
        .setMinInfoGain(0.1D)
        .setMinInstancesPerNode(2)
        .setSubsamplingRate(0.9D)

      val ds = TypedDataset.create(Seq(X2(0D, Vectors.dense(0D))))
      val model = rf.fit(ds).run()

      model.transformer.getNumTrees == 10 &&
        model.transformer.getMaxBins == 100 &&
        model.transformer.getFeatureSubsetStrategy == featureSubsetStrategy.sparkValue &&
        model.transformer.getMaxDepth == 10 &&
        model.transformer.getMaxMemoryInMB == 100 &&
        model.transformer.getMinInfoGain == 0.1D &&
        model.transformer.getMinInstancesPerNode == 2 &&
        model.transformer.getSubsamplingRate == 0.9D
    }

    check(prop)
  }

  test("create() compiles only with correct inputs") {
    illTyped("TypedRandomForestClassifier.create[Double]()")
    illTyped("TypedRandomForestClassifier.create[X1[Double]]()")
    illTyped("TypedRandomForestClassifier.create[X2[Double, Double]]()")
    illTyped("TypedRandomForestClassifier.create[X3[Vector, Double, Int]]()")
    illTyped("TypedRandomForestClassifier.create[X2[Vector, String]]()")
  }

}

================================================
FILE: ml/src/test/scala/frameless/ml/clustering/BisectingKMeansTests.scala
================================================
package frameless
package ml
package clustering

import frameless.{TypedDataset, TypedEncoder, X1, X2, X3}
import frameless.ml.classification.TypedBisectingKMeans
import org.scalacheck.Arbitrary
import org.apache.spark.ml.linalg._
import org.scalacheck.Prop._
import frameless.ml._
import org.scalatest.matchers.must.Matchers

class BisectingKMeansTests extends FramelessMlSuite with Matchers {
  implicit val arbVector: Arbitrary[Vector] =
    Arbitrary(Generators.arbVector.arbitrary)

  test("fit() returns a correct TypedTransformer") {
    val prop = forAll { x1: X1[Vector] =>
      val km = TypedBisectingKMeans[X1[Vector]]()
      val ds = TypedDataset.create(Seq(x1))
      val model = km.fit(ds).run()
      val pDs = model.transform(ds).as[X2[Vector, Int]]()

      pDs.select(pDs.col('a)).collect().run().toList == Seq(x1.a)
    }

    def prop3[A: TypedEncoder : Arbitrary] = forAll { x2: X2[Vector, A] =>
      val km = TypedBisectingKMeans[X1[Vector]]()
      val ds = TypedDataset.create(Seq(x2))
      val model = km.fit(ds).run()
      val pDs = model.transform(ds).as[X3[Vector, A, Int]]()

      pDs.select(pDs.col('a), pDs.col('b)).collect().run() == Seq((x2.a, x2.b))
    }

    check(prop)
    check(prop3[Double])
  }

  test("param setting is retained") {
    val rf = TypedBisectingKMeans[X1[Vector]]()
      .setK(10)
      .setMaxIter(10)
      .setMinDivisibleClusterSize(1)
      .setSeed(123332)

    val ds = TypedDataset.create(Seq(X2(Vectors.dense(Array(0D)),0)))
    val model = rf.fit(ds).run()

      model.transformer.getK  == 10 &&
      model.transformer.getMaxIter  == 10 &&
      model.transformer.getMinDivisibleClusterSize  == 1 &&
      model.transformer.getSeed == 123332
  }
}


================================================
FILE: ml/src/test/scala/frameless/ml/clustering/ClusteringIntegrationTests.scala
================================================
package frameless
package ml
package clustering

import frameless.ml.FramelessMlSuite
import frameless.ml.classification.{TypedBisectingKMeans, TypedKMeans}
import org.apache.spark.ml.linalg.Vector
import frameless._
import frameless.ml._
import frameless.ml.feature._
import org.scalatest.matchers.must.Matchers

class ClusteringIntegrationTests extends FramelessMlSuite with Matchers {

  test("predict field2 from field1 using a K-means clustering") {
    // Training
    val trainingDataDs = TypedDataset.create(Seq.fill(5)(X2(10D, 0)) :+ X2(100D,0))

    val vectorAssembler = TypedVectorAssembler[X1[Double]]

    val dataWithFeatures = vectorAssembler.transform(trainingDataDs).as[X3[Double,Int,Vector]]()

    case class Input(c: Vector)
    val km = TypedKMeans[Input].setK(2)

    val model = km.fit(dataWithFeatures).run()

    // Prediction
    val testSeq = Seq(
      X2(10D, 0),
      X2(100D, 1)
    )

    val testData = TypedDataset.create(testSeq)
    val testDataWithFeatures = vectorAssembler.transform(testData).as[X3[Double,Int,Vector]]()

    val predictionDs = model.transform(testDataWithFeatures).as[X4[Double,Int,Vector,Int]]()

    val prediction = predictionDs.select(predictionDs.col[Int]('d)).collect().run().toList

    prediction mustEqual testSeq.map(_.b)
  }

  test("predict field2 from field1 using a bisecting K-means clustering") {
    // Training
    val trainingDataDs = TypedDataset.create(Seq.fill(5)(X2(10D, 0)) :+ X2(100D,0))

    val vectorAssembler = TypedVectorAssembler[X1[Double]]

    val dataWithFeatures = vectorAssembler.transform(trainingDataDs).as[X3[Double, Int, Vector]]()

    case class Inputs(c: Vector)
    val bkm = TypedBisectingKMeans[Inputs]().setK(2)

    val model = bkm.fit(dataWithFeatures).run()

    // Prediction
    val testSeq = Seq(
      X2(10D, 0),
      X2(100D, 1)
    )

    val testData = TypedDataset.create(testSeq)
    val testDataWithFeatures = vectorAssembler.transform(testData).as[X3[Double, Int, Vector]]()

    val predictionDs = model.transform(testDataWithFeatures).as[X4[Double,Int,Vector,Int]]()

    val prediction = predictionDs.select(predictionDs.col[Int]('d)).collect().run().toList

    prediction mustEqual testSeq.map(_.b)
  }

}


================================================
FILE: ml/src/test/scala/frameless/ml/clustering/KMeansTests.scala
================================================
package frameless
package ml
package clustering

import frameless.ml.classification.TypedKMeans
import frameless.{TypedDataset, TypedEncoder, X1, X2, X3}
import org.apache.spark.ml.linalg._
import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Prop._
import frameless.ml._
import frameless.ml.params.kmeans.KMeansInitMode
import org.scalatest.matchers.must.Matchers

class KMeansTests extends FramelessMlSuite with Matchers {
  implicit val arbVector: Arbitrary[Vector] =
    Arbitrary(Generators.arbVector.arbitrary)
  implicit val arbKMeansInitMode: Arbitrary[KMeansInitMode] =
    Arbitrary {
      Gen.oneOf(
        Gen.const(KMeansInitMode.KMeansPlusPlus),
        Gen.const(KMeansInitMode.Random)
      )
    }

  /**
   * copies a vector as we need two rows of the right dimension for 3.4's alg
   */
  def newRowWithSameDimension(vect: Vector): Vector = {
    val dubs = vect.toArray.map(_ % 2) // k is two
    val dense = Vectors.dense(dubs)
    vect match {
      case _: SparseVector => dense.toSparse
      case _ => dense
    }
  }

  test("fit() returns a correct TypedTransformer") {
    val prop = forAll { x1: X1[Vector] =>
      val x1a = X1(newRowWithSameDimension(x1.a))
      val km = TypedKMeans[X1[Vector]]
      val ds = TypedDataset.create(Seq(x1, x1a))

      val model = km.fit(ds).run()
      val pDs = model.transform(ds).as[X2[Vector, Int]]()

      pDs.select(pDs.col('a)).collect().run().toList == Seq(x1.a, x1a.a)
    }

    def prop3[A: TypedEncoder : Arbitrary] = forAll { x2: X2[Vector, A] =>
      val x2a = x2.copy(a = newRowWithSameDimension(x2.a))
      val km = TypedKMeans[X1[Vector]]
      val ds = TypedDataset.create(Seq(x2, x2a))
      val model = km.fit(ds).run()
      val pDs = model.transform(ds).as[X3[Vector, A, Int]]()

      pDs.select(pDs.col('a), pDs.col('b)).collect().run().toList == Seq((x2.a, x2.b), (x2a.a, x2a.b))
    }

    tolerantRun( _.isInstanceOf[ArrayIndexOutOfBoundsException] ) {
      check(prop)
      check(prop3[Double])
    }
  }

  test("param setting is retained") {
    val prop = forAll { initMode: KMeansInitMode =>
      val rf = TypedKMeans[X1[Vector]]
        .setInitMode(KMeansInitMode.Random)
        .setInitSteps(2)
        .setK(10)
        .setMaxIter(15)
        .setSeed(123223L)
        .setTol(12D)

      val ds = TypedDataset.create(Seq(X2(Vectors.dense(Array(0D)), 0)))
      val model = rf.fit(ds).run()

      model.transformer.getInitMode == KMeansInitMode.Random.sparkValue &&
        model.transformer.getInitSteps == 2 &&
        model.transformer.getK == 10 &&
        model.transformer.getMaxIter == 15 &&
        model.transformer.getSeed == 123223L &&
        model.transformer.getTol == 12D
    }

    check(prop)
  }
}


================================================
FILE: ml/src/test/scala/frameless/ml/feature/TypedIndexToStringTests.scala
================================================
package frameless
package ml
package feature

import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Prop._
import shapeless.test.illTyped
import org.scalatest.matchers.must.Matchers

class TypedIndexToStringTests extends FramelessMlSuite with Matchers {

  test(".transform() correctly transform an input dataset") {
    implicit val arbDouble = Arbitrary(Gen.choose(0, 99).map(_.toDouble))

    def prop[A: TypedEncoder: Arbitrary] = forAll { x2: X2[Double, A] =>
      val transformer = TypedIndexToString[X1[Double]](Array.fill(100)("foo"))
      val ds = TypedDataset.create(Seq(x2))
      val ds2 = transformer.transform(ds)

      ds2.collect().run() == Seq((x2.a, x2.b, "foo"))
    }

    check(prop[Double])
    check(prop[String])
  }

  test("create() compiles only with correct inputs") {
    illTyped("TypedIndexToString.create[String](Array(\"foo\"))")
    illTyped("TypedIndexToString.create[X1[String]](Array(\"foo\"))")
    illTyped("TypedIndexToString.create[X1[Long]](Array(\"foo\"))")
    illTyped("TypedIndexToString.create[X2[String, Int]](Array(\"foo\"))")
  }

}


================================================
FILE: ml/src/test/scala/frameless/ml/feature/TypedStringIndexerTests.scala
================================================
package frameless
package ml
package feature

import frameless.ml.feature.TypedStringIndexer.HandleInvalid
import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Prop._
import shapeless.test.illTyped
import org.scalatest.matchers.must.Matchers

class TypedStringIndexerTests extends FramelessMlSuite with Matchers {

  test(".fit() returns a correct TypedTransformer") {
    def prop[A: TypedEncoder : Arbitrary] = forAll { x2: X2[String, A] =>
      val indexer = TypedStringIndexer[X1[String]]
      val ds = TypedDataset.create(Seq(x2))
      val model = indexer.fit(ds).run()
      val resultDs = model.transform(ds).as[X3[String, A, Double]]()

      resultDs.collect().run() == Seq(X3(x2.a, x2.b, 0D))
    }

    check(prop[Double])
    check(prop[String])
  }

  test("param setting is retained") {
    implicit val arbHandleInvalid: Arbitrary[HandleInvalid] = Arbitrary {
      Gen.oneOf(HandleInvalid.Keep, HandleInvalid.Error, HandleInvalid.Skip)
    }

    val prop = forAll { handleInvalid: HandleInvalid =>
      val indexer = TypedStringIndexer[X1[String]]
        .setHandleInvalid(handleInvalid)
      val ds = TypedDataset.create(Seq(X1("foo")))
      val model = indexer.fit(ds).run()

      model.transformer.getHandleInvalid == handleInvalid.sparkValue
    }

    check(prop)
  }

  test("create() compiles only with correct inputs") {
    illTyped("TypedStringIndexer.create[Double]()")
    illTyped("TypedStringIndexer.create[X1[Double]]()")
    illTyped("TypedStringIndexer.create[X2[String, Long]]()")
  }

}


================================================
FILE: ml/src/test/scala/frameless/ml/feature/TypedVectorAssemblerTests.scala
================================================
package frameless
package ml
package feature

import org.scalacheck.Arbitrary
import org.scalacheck.Prop._
import org.apache.spark.ml.linalg._
import shapeless.test.illTyped

class TypedVectorAssemblerTests extends FramelessMlSuite {

  test(".transform() returns a correct TypedTransformer") {
    def prop[A: TypedEncoder: Arbitrary] = forAll { x5: X5[Int, Long, Double, Boolean, A] =>
      val assembler = TypedVectorAssembler[X4[Int, Long, Double, Boolean]]
      val ds = TypedDataset.create(Seq(x5))
      val ds2 = assembler.transform(ds).as[X6[Int, Long, Double, Boolean, A, Vector]]()

      ds2.collect().run() ==
        Seq(X6(x5.a, x5.b, x5.c, x5.d, x5.e, Vectors.dense(x5.a.toDouble, x5.b.toDouble, x5.c, if (x5.d) 1D else 0D)))
    }

    def prop2[A: TypedEncoder: Arbitrary] = forAll { x5: X5[Boolean, BigDecimal, Byte, Short, A] =>
      val assembler = TypedVectorAssembler[X4[Boolean, BigDecimal, Byte, Short]]
      val ds = TypedDataset.create(Seq(x5))
      val ds2 = assembler.transform(ds).as[X6[Boolean, BigDecimal, Byte, Short, A, Vector]]()

      ds2.collect().run() ==
        Seq(X6(x5.a, x5.b, x5.c, x5.d, x5.e, Vectors.dense(if (x5.a) 1D else 0D, x5.b.toDouble, x5.c.toDouble, x5.d.toDouble)))
    }

    check(prop[String])
    check(prop[Double])
    check(prop2[Long])
    check(prop2[Boolean])
  }

  test("create() compiles only with correct inputs") {
    illTyped("TypedVectorAssembler.create[Double]()")
    illTyped("TypedVectorAssembler.create[X1[String]]()")
    illTyped("TypedVectorAssembler.create[X2[String, Double]]()")
    illTyped("TypedVectorAssembler.create[X3[Int, String, Double]]()")
  }

}


================================================
FILE: ml/src/test/scala/frameless/ml/regression/RegressionIntegrationTests.scala
================================================
package frameless
package ml
package regression

import frameless.ml.feature.TypedVectorAssembler
import org.apache.spark.ml.linalg.Vector
import org.scalatest.matchers.must.Matchers

class RegressionIntegrationTests extends FramelessMlSuite with Matchers {

  test("predict field3 from field1 and field2 using a RandomForestRegressor") {
    case class Data(field1: Double, field2: Int, field3: Double)

    // Training

    val trainingDataDs = TypedDataset.create(Seq.fill(10)(Data(0D, 10, 0D)))

    case class Features(field1: Double, field2: Int)
    val vectorAssembler = TypedVectorAssembler[Features]

    case class DataWithFeatures(field1: Double, field2: Int, field3: Double, features: Vector)
    val dataWithFeatures = vectorAssembler.transform(trainingDataDs).as[DataWithFeatures]()

    case class RFInputs(field3: Double, features: Vector)
    val rf = TypedRandomForestRegressor[RFInputs]

    val model = rf.fit(dataWithFeatures).run()

    // Prediction

    val testData = TypedDataset.create(Seq(
      Data(0D, 10, 0D)
    ))
    val testDataWithFeatures = vectorAssembler.transform(testData).as[DataWithFeatures]()

    case class PredictionResult(field1: Double, field2: Int, field3: Double, features: Vector, predictedField3: Double)
    val predictionDs = model.transform(testDataWithFeatures).as[PredictionResult]()

    val prediction = predictionDs.select(predictionDs.col('predictedField3)).collect().run().toList

    prediction mustEqual List(0D)
  }

}


================================================
FILE: ml/src/test/scala/frameless/ml/regression/TypedLinearRegressionTests.scala
================================================
package frameless
package ml
package regression

import frameless.ml.params.linears.{LossStrategy, Solver}
import org.apache.spark.ml.linalg._
import org.scalacheck.Arbitrary
import org.scalacheck.Prop._
import org.scalatest.matchers.should.Matchers
import shapeless.test.illTyped

class TypedLinearRegressionTests extends FramelessMlSuite with Matchers {

  implicit val arbVectorNonEmpty: Arbitrary[Vector] = Arbitrary(Generators.arbVector.arbitrary)

  test("fit() returns a correct TypedTransformer") {
    val prop = forAll { x2: X2[Double, Vector] =>
      val lr = TypedLinearRegression[X2[Double, Vector]]
      val ds = TypedDataset.create(Seq(x2))

      val model = lr.fit(ds).run()
      val pDs = model.transform(ds).as[X3[Double, Vector, Double]]()

      pDs.select(pDs.col('a), pDs.col('b)).collect().run() == Seq(x2.a -> x2.b)
    }
    val prop2 = forAll { x2: X2[Vector, Double] =>
      val lr = TypedLinearRegression[X2[Vector, Double]]
      val ds = TypedDataset.create(Seq(x2))
      val model = lr.fit(ds).run()
      val pDs = model.transform(ds).as[X3[Vector, Double, Double]]()

      pDs.select(pDs.col('a), pDs.col('b)).collect().run() == Seq(x2.a -> x2.b)
    }

    def prop3[A: TypedEncoder: Arbitrary] = forAll { x3: X3[Vector, Double, A] =>
      val lr = TypedLinearRegression[X2[Vector, Double]]
      val ds = TypedDataset.create(Seq(x3))
      val model = lr.fit(ds).run()
      val pDs = model.transform(ds).as[X4[Vector, Double, A, Double]]()

      pDs.select(pDs.col('a), pDs.col('b), pDs.col('c)).collect().run() == Seq((x3.a, x3.b, x3.c))
    }

    check(prop)
    check(prop2)
    check(prop3[String])
    check(prop3[Double])
  }

  test("param setting is retained") {
    import Generators.{arbLossStrategy, arbSolver}

    val prop = forAll { (lossStrategy: LossStrategy, solver: Solver) =>
      val lr = TypedLinearRegression[X2[Double, Vector]]
        .setAggregationDepth(10)
        .setEpsilon(4)
        .setFitIntercept(true)
        .setLoss(lossStrategy)
        .setMaxIter(23)
        .setRegParam(1.2)
        .setStandardization(true)
        .setTol(2.3)
        .setSolver(solver)

      val ds = TypedDataset.create(Seq(X2(0D, Vectors.dense(0D))))
      val model = lr.fit(ds).run()

      model.transformer.getAggregationDepth == 10 &&
        model.transformer.getEpsilon == 4.0 &&
        model.transformer.getLoss == lossStrategy.sparkValue &&
        model.transformer.getMaxIter == 23 &&
        model.transformer.getRegParam == 1.2 &&
        model.transformer.getTol == 2.3 &&
        model.transformer.getSolver == solver.sparkValue
    }

    check(prop)
  }

  test("create() compiles only with correct inputs") {
    illTyped("TypedLinearRegressor.create[Double]()")
    illTyped("TypedLinearRegressor.create[X1[Double]]()")
    illTyped("TypedLinearRegressor.create[X2[Double, Double]]()")
    illTyped("TypedLinearRegressor.create[X3[Vector, Double, Int]]()")
    illTyped("TypedLinearRegressor.create[X2[Vector, String]]()")
  }

  test("TypedLinearRegressor should fit straight line ") {
    case class Point(features: Vector, labels: Double)

    val ds = Seq(
      X2(new DenseVector(Array(1.0)): Vector, 1.0),
      X2(new DenseVector(Array(2.0)): Vector, 2.0),
      X2(new DenseVector(Array(3.0)): Vector, 3.0),
      X2(new DenseVector(Array(4.0)): Vector, 4.0),
      X2(new DenseVector(Array(5.0)): Vector, 5.0),
      X2(new DenseVector(Array(6.0)): Vector, 6.0)
    )

    val ds2 = Seq(
      X3(new DenseVector(Array(1.0)): Vector,2F, 1.0),
      X3(new DenseVector(Array(2.0)): Vector,2F, 2.0),
      X3(new DenseVector(Array(3.0)): Vector,2F, 3.0),
      X3(new DenseVector(Array(4.0)): Vector,2F, 4.0),
      X3(new DenseVector(Array(5.0)): Vector,2F, 5.0),
      X3(new DenseVector(Array(6.0)): Vector,2F, 6.0)
    )

    val tds = TypedDataset.create(ds)

    val lr = TypedLinearRegression[X2[Vector, Double]]
      .setMaxIter(10)

    val model = lr.fit(tds).run()

    val tds2 = TypedDataset.create(ds2)

    val lr2 = TypedLinearRegression[X3[Vector, Float, Double]]
      .setMaxIter(10)

    val model2 = lr2.fit(tds2).run()

    model.transformer.coefficients shouldEqual new DenseVector(Array(1.0))
    model2.transformer.coefficients shouldEqual new DenseVector(Array(1.0))
  }
}


================================================
FILE: ml/src/test/scala/frameless/ml/regression/TypedRandomForestRegressorTests.scala
================================================
package frameless
package ml
package regression

import frameless.ml.params.trees.FeatureSubsetStrategy
import shapeless.test.illTyped
import org.apache.spark.ml.linalg._
import org.scalacheck.Arbitrary
import org.scalacheck.Prop._
import org.scalatest.matchers.must.Matchers

class TypedRandomForestRegressorTests extends FramelessMlSuite with Matchers {
  implicit val arbVectorNonEmpty: Arbitrary[Vector] =
    Arbitrary(Generators.arbVector.arbitrary suchThat (_.size > 0)) // vector must not be empty for RandomForestRegressor
  import Generators.arbTreesFeaturesSubsetStrategy

  test("fit() returns a correct TypedTransformer") {
    val prop = forAll { x2: X2[Double, Vector] =>
      val rf = TypedRandomForestRegressor[X2[Double, Vector]]
      val ds = TypedDataset.create(Seq(x2))
      val model = rf.fit(ds).run()
      val pDs = model.transform(ds).as[X3[Double, Vector, Double]]()

      pDs.select(pDs.col('a), pDs.col('b)).collect().run() == Seq(x2.a -> x2.b)
    }

    val prop2 = forAll { x2: X2[Vector, Double] =>
      val rf = TypedRandomForestRegressor[X2[Vector, Double]]
      val ds = TypedDataset.create(Seq(x2))
      val model = rf.fit(ds).run()
      val pDs = model.transform(ds).as[X3[Vector, Double, Double]]()

      pDs.select(pDs.col('a), pDs.col('b)).collect().run() == Seq(x2.a -> x2.b)
    }

    def prop3[A: TypedEncoder: Arbitrary] = forAll { x3: X3[Vector, Double, A] =>
      val rf = TypedRandomForestRegressor[X2[Vector, Double]]
      val ds = TypedDataset.create(Seq(x3))
      val model = rf.fit(ds).run()
      val pDs = model.transform(ds).as[X4[Vector, Double, A, Double]]()

      pDs.select(pDs.col('a), pDs.col('b), pDs.col('c)).collect().run() == Seq((x3.a, x3.b, x3.c))
    }

    check(prop)
    check(prop2)
    check(prop3[String])
    check(prop3[Double])
  }

  test("param setting is retained") {
    val prop = forAll { featureSubsetStrategy: FeatureSubsetStrategy =>
      val rf = TypedRandomForestRegressor[X2[Double, Vector]]
        .setNumTrees(10)
        .setMaxBins(100)
        .setFeatureSubsetStrategy(featureSubsetStrategy)
        .setMaxDepth(10)
        .setMaxMemoryInMB(100)
        .setMinInfoGain(0.1D)
        .setMinInstancesPerNode(2)
        .setSubsamplingRate(0.9D)

      val ds = TypedDataset.create(Seq(X2(0D, Vectors.dense(0D))))
      val model = rf.fit(ds).run()

      model.transformer.getNumTrees == 10 &&
        model.transformer.getMaxBins == 100 &&
        model.transformer.getFeatureSubsetStrategy == featureSubsetStrategy.sparkValue &&
        model.transformer.getMaxDepth == 10 &&
        model.transformer.getMaxMemoryInMB == 100 &&
        model.transformer.getMinInfoGain == 0.1D &&
        model.transformer.getMinInstancesPerNode == 2 &&
        model.transformer.getSubsamplingRate == 0.9D
    }

    check(prop)
  }

  test("create() compiles only with correct inputs") {
    illTyped("TypedRandomForestRegressor.create[Double]()")
    illTyped("TypedRandomForestRegressor.create[X1[Double]]()")
    illTyped("TypedRandomForestRegressor.create[X2[Double, Double]]()")
    illTyped("TypedRandomForestRegressor.create[X3[Vector, Double, Int]]()")
    illTyped("TypedRandomForestRegressor.create[X2[Vector, String]]()")
  }

}


================================================
FILE: project/Common.scala
================================================
import sbt.Keys._
import sbt._
import sbt.plugins.JvmPlugin

import org.scalafmt.sbt.ScalafmtPlugin.autoImport._

object Common extends AutoPlugin {
  override def trigger = allRequirements
  override def requires = JvmPlugin

  override def projectSettings = Seq(
    scalafmtFilter := "diff-ref=78f708d"
  )
}


================================================
FILE: project/build.properties
================================================
sbt.version=1.12.11


================================================
FILE: project/plugins.sbt
================================================
val sbtTypelevelVersion = "0.8.5"

addSbtPlugin("org.typelevel" % "sbt-typelevel-ci-release" % sbtTypelevelVersion)

addSbtPlugin("org.typelevel" % "sbt-typelevel-site" % sbtTypelevelVersion)

addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.4.4")

addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.6")

addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.6.4")


================================================
FILE: refined/src/main/scala/frameless/refined/RefinedFieldEncoders.scala
================================================
package frameless.refined

import scala.reflect.ClassTag

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.objects.{
  Invoke, NewInstance, UnwrapOption, WrapOption
}
import org.apache.spark.sql.types._

import eu.timepit.refined.api.RefType

import frameless.{ TypedEncoder, RecordFieldEncoder }

private[refined] trait RefinedFieldEncoders {
  /**
   * @tparam T the refined type (e.g. `String`)
   */
  implicit def optionRefined[F[_, _], T, R](
    implicit
      i0: RefType[F],
      i1: TypedEncoder[T],
      i2: ClassTag[F[T, R]],
  ): RecordFieldEncoder[Option[F[T, R]]] =
    RecordFieldEncoder[Option[F[T, R]]](new TypedEncoder[Option[F[T, R]]] {
      def nullable = true

      // `Refined` is a Value class: https://github.com/fthomas/refined/blob/master/modules/core/shared/src/main/scala-3.0-/eu/timepit/refined/api/Refined.scala#L8
      def jvmRepr = ObjectType(classOf[Option[F[T, R]]])

      def catalystRepr: DataType = i1.catalystRepr

      val innerJvmRepr = ObjectType(i2.runtimeClass)

      def fromCatalyst(path: Expression): Expression = {
        val javaValue = i1.fromCatalyst(path)
        val value = NewInstance(i2.runtimeClass, Seq(javaValue), innerJvmRepr)

        WrapOption(value, innerJvmRepr)
      }

      @inline def toCatalyst(path: Expression): Expression = {
        val value = UnwrapOption(innerJvmRepr, path)

        val javaValue = Invoke(value, "value", i1.jvmRepr, Nil)

        i1.toCatalyst(javaValue)
      }

      override def toString = s"optionRefined[${i2.runtimeClass.getName}]"
    })

  /**
   * @tparam T the refined type (e.g. `String`)
   */
  implicit def refined[F[_, _], T, R](
    implicit
      i0: RefType[F],
      i1: TypedEncoder[T],
      i2: ClassTag[F[T, R]],
  ): RecordFieldEncoder[F[T, R]] =
    RecordFieldEncoder[F[T, R]](new TypedEncoder[F[T, R]] {
      def nullable = i1.nullable

      // `Refined` is a Value class: https://github.com/fthomas/refined/blob/master/modules/core/shared/src/main/scala-3.0-/eu/timepit/refined/api/Refined.scala#L8
      def jvmRepr = i1.jvmRepr

      def catalystRepr: DataType = i1.catalystRepr

      def fromCatalyst(path: Expression): Expression =
        i1.fromCatalyst(path)

      @inline def toCatalyst(path: Expression): Expression =
        i1.toCatalyst(path)

      override def toString = s"refined[${i2.runtimeClass.getName}]"
    })
}


================================================
FILE: refined/src/main/scala/frameless/refined/package.scala
================================================
package frameless

import scala.reflect.ClassTag

import eu.timepit.refined.api.{ RefType, Validate }

package object refined extends RefinedFieldEncoders {
  implicit def refinedInjection[F[_, _], T, R](
    implicit
      refType: RefType[F],
      validate: Validate[T, R]
    ): Injection[F[T, R], T] = Injection(
    refType.unwrap,
    { value =>
      refType.refine[R](value) match {
        case Left(errMsg) =>
          throw new IllegalArgumentException(
            s"Value $value does not satisfy refinement predicate: $errMsg")

        case Right(res) => res
      }
    })

  implicit def refinedEncoder[F[_, _], T, R](
    implicit
      i0: RefType[F],
      i1: Validate[T, R],
      i2: TypedEncoder[T],
      i3: ClassTag[F[T, R]]
    ): TypedEncoder[F[T, R]] = TypedEncoder.usingInjection(
    i3, refinedInjection, i2)
}


================================================
FILE: refined/src/test/scala/frameless/RefinedFieldEncoderTests.scala
================================================
package frameless

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{
  IntegerType, ObjectType, StringType, StructField, StructType
}

import org.scalatest.matchers.should.Matchers

class RefinedFieldEncoderTests extends TypedDatasetSuite with Matchers {
  test("Encode a bare refined type") {
    import eu.timepit.refined.auto._
    import eu.timepit.refined.types.string.NonEmptyString

    val encoder: TypedEncoder[NonEmptyString] = {
      import frameless.refined.refinedEncoder
      TypedEncoder[NonEmptyString]
    }

    val ss = session
    import ss.implicits._

    encoder.catalystRepr shouldBe StringType

    val nes: NonEmptyString = "Non Empty String"

    val unsafeDs = TypedDataset.createUnsafe(sc.parallelize(Seq(nes.value)).toDF())(encoder)

    val expected = Seq(nes)

    unsafeDs.collect().run() shouldBe expected
  }

  test("Encode case class with a refined field") {
    import RefinedTypesTests._

    // Check jvmRepr
    import org.apache.spark.sql.types.ObjectType

    encoderA.jvmRepr shouldBe ObjectType(classOf[A])

    // Check catalystRepr
    val expectedAStructType = StructType(Seq(
      StructField("a", IntegerType, false),
      StructField("s", StringType, false)))

    encoderA.catalystRepr shouldBe expectedAStructType

    // Check unsafe
    val unsafeDs: TypedDataset[A] = {
      val rdd = sc.parallelize(Seq(Row(as.a, as.s.toString)))
      val df = session.createDataFrame(rdd, expectedAStructType)

      TypedDataset.createUnsafe(df)(encoderA)
    }

    val expected = Seq(as)

    unsafeDs.collect().run() shouldBe expected

    // Check safe
    val safeDs = TypedDataset.create(expected)

    safeDs.collect().run() shouldBe expected
  }

  test("Encode case class with a refined optional field") {
    import RefinedTypesTests._

    // Check jvmRepr
    encoderB.jvmRepr shouldBe ObjectType(classOf[B])

    // Check catalystRepr
    val expectedBStructType = StructType(Seq(
      StructField("a", IntegerType, false),
      StructField("s", StringType, true)))

    encoderB.catalystRepr shouldBe expectedBStructType

    // Check unsafe
    val unsafeDs: TypedDataset[B] = {
      val rdd = sc.parallelize(Seq(
        Row(bs.a, bs.s.mkString),
        Row(2, null.asInstanceOf[String]),
      ))

      val df = session.createDataFrame(rdd, expectedBStructType)

      TypedDataset.createUnsafe(df)(encoderB)
    }

    val expected = Seq(bs, B(2, None))

    unsafeDs.collect().run() shouldBe expected

    // Check safe
    val safeDs = TypedDataset.create(expected)

    safeDs.collect().run() shouldBe expected
  }
}

object RefinedTypesTests {
  import eu.timepit.refined.auto._
  import eu.timepit.refined.types.string.NonEmptyString

  case class A(a: Int, s: NonEmptyString)
  case class B(a: Int, s: Option[NonEmptyString])

  val nes: NonEmptyString = "Non Empty String"

  val as = A(-42, nes)
  val bs = B(-42, Option(nes))

  import frameless.refined._ // implicit instances for refined

  implicit val encoderA: TypedEncoder[A] = TypedEncoder.usingDerivation

  implicit val encoderB: TypedEncoder[B] = TypedEncoder.usingDerivation
}


================================================
FILE: scripts/docs-build.sh
================================================
#!/bin/bash

set -eux

sbt copyReadme mdoc

gitbook="node_modules/gitbook-cli/bin/gitbook.js"

if ! test -e $gitbook; then
  npm install gitbook
  npm install gitbook-cli
fi

$gitbook build mdocs/target/mdoc docs/book

mv docs/book/* .

exit 0


================================================
FILE: scripts/docs-publish.sh
================================================
#!/bin/bash

set -eux

# Check that the working directory is a git repository and the repository has no outstanding changes.
git diff-index --quiet HEAD

commit=$(git show -s --format=%h)

git checkout gh-pages

git merge "$commit"

bash scripts/docs-build.sh

git add .

git commit -am "Rebuild documentation ($commit)"

echo "Verify that you didn't break anything:"
echo "  $ python -m SimpleHTTPServer 8000"
echo "  $ xdg-open http://localhost:8000/"
echo ""
echo "Then push to the gh-pages branch:"
echo "  $ git push gh-pages"


================================================
FILE: scripts/travis-publish.sh
================================================
#!/bin/bash

# Taken + modified from typelevel/cats
# https://github.com/typelevel/cats/blob/a8a7587f558541cbabc5c40053181928b4baf78c/scripts/travis-publish.sh

export publish_cmd="publishLocal"

# if [[ $TRAVIS_PULL_REQUEST == "false" && $TRAVIS_BRANCH == "master" && $(cat version.sbt) =~ "-SNAPSHOT" ]]; then
#   export publish_cmd="common/publish cats/publish dataset/publish dataframe/publish"
# fi

sbt_cmd="sbt ++$TRAVIS_SCALA_VERSION -Dfile.encoding=UTF8 -J-XX:ReservedCodeCacheSize=256M"

case "$PHASE" in
  A) 
     docs_cmd="$sbt_cmd doc tut"
     run_cmd="$docs_cmd"
  ;;
  B)
     coverage="$sbt_cmd coverage test && sbt coverageReport && bash <(curl -s https://codecov.io/bash)"
     run_cmd="$coverage"
  ;;
  C) 
     run_cmd="$sbt_cmd clean $publish_cmd"
  ;;   
esac 
eval $run_cmd